chore(health): monitor job flushes

Gregor Kleen 2021-06-21 15:53:42 +02:00
parent 6c12737ad9
commit aa1c0c8a3e
5 changed files with 23 additions and 0 deletions

@@ -50,6 +50,7 @@ health-check-interval:
  smtp-connect: "_env:HEALTHCHECK_INTERVAL_SMTP_CONNECT:600"
  widget-memcached: "_env:HEALTHCHECK_INTERVAL_WIDGET_MEMCACHED:600"
  active-job-executors: "_env:HEALTHCHECK_INTERVAL_ACTIVE_JOB_EXECUTORS:60"
  does-flush: "_env:HEALTHCHECK_INTERVAL_DOES_FLUSH:15"
health-check-delay-notify: "_env:HEALTHCHECK_DELAY_NOTIFY:true"
health-check-http: "_env:HEALTHCHECK_HTTP:true" # Can we assume that we can reach ourselves under APPROOT via HTTP (reverse proxies or firewalls might prevent this)?

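For context, the `_env:NAME:default` strings above follow the Yesod settings convention (presumably handled by Data.Yaml.Config's `useEnv`, as in the Yesod scaffolding): each value is read from the named environment variable when it is set and falls back to the inline default (15 for the new check) otherwise. A minimal, hypothetical sketch of that mechanism; `DemoSettings`, its field, and the file path are made up and not part of this codebase:

{-# LANGUAGE OverloadedStrings #-}

import Data.Aeson (FromJSON (..), withObject, (.:))
import Data.Yaml.Config (loadYamlSettings, useEnv)

-- Toy settings type, standing in for the real AppSettings.
newtype DemoSettings = DemoSettings { doesFlushInterval :: Int }
  deriving Show

instance FromJSON DemoSettings where
  parseJSON = withObject "DemoSettings" $ \o -> do
    intervals <- o .: "health-check-interval"
    DemoSettings <$> intervals .: "does-flush"

main :: IO ()
main = do
  -- With useEnv, HEALTHCHECK_INTERVAL_DOES_FLUSH (when set) overrides the default of 15.
  settings <- loadYamlSettings ["config/settings.yml"] [] useEnv :: IO DemoSettings
  print settings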

@@ -117,6 +117,7 @@ handleJobs foundation@UniWorX{..}
  jobCurrentCrontab <- liftIO $ newTVarIO Nothing
  jobHeldLocks <- liftIO $ newTVarIO Set.empty
  jobOffload <- liftIO newEmptyTMVarIO
  jobLastFlush <- liftIO $ newTVarIO Nothing
  registerJobHeldLocksCount jobHeldLocks
  registerJobWorkerQueueDepth appJobState
  atomically $ putTMVar appJobState JobState
@@ -501,6 +502,8 @@ handleJobs' wNum = C.mapM_ $ \jctl -> hoist delimitInternalState . withJobWorker
      void . lift . runDB . runConduit
        $ selectKeys [ QueuedJobId /<-. Set.toList heldLocks ] [ Asc QueuedJobCreationTime ]
        .| C.mapM_ (\j -> lift $ runReaderT (writeJobCtl $ JobCtlPerform j) =<< getYesod)
      lFlushTVar <- asks jobLastFlush
      atomically . modifyTVar' lFlushTVar . max . Just =<< liftIO getCurrentTime
      $logInfoS logIdent "JobCtlFlush"
    handleCmd (JobCtlQueue job) = do
      $logDebugS logIdent "JobCtlQueue..."

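For context, the `max . Just` update above is what keeps `jobLastFlush` monotone: `Maybe UTCTime` orders `Nothing` below every `Just`, and `Just` preserves the ordering on `UTCTime`, so the stored timestamp can only ever move forward. A small self-contained sketch of the same pattern (`recordFlush` is a hypothetical name, not project code):

import Control.Concurrent.STM (TVar, atomically, modifyTVar', newTVarIO, readTVarIO)
import Data.Time (UTCTime, getCurrentTime)

-- Record the time of the most recent flush; max (Just now) never moves the value backwards.
recordFlush :: TVar (Maybe UTCTime) -> IO ()
recordFlush var = do
  now <- getCurrentTime
  atomically $ modifyTVar' var (max (Just now))

main :: IO ()
main = do
  lastFlush <- newTVarIO Nothing   -- same initial state as jobLastFlush in handleJobs
  recordFlush lastFlush
  readTVarIO lastFlush >>= print   -- Just <time of the flush>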

@@ -173,3 +173,16 @@ dispatchHealthCheckActiveJobExecutors = HealthActiveJobExecutors <$> do
  if
    | Map.null workers -> return Nothing
    | otherwise -> return . Just $ responders % fromIntegral (Map.size workers)

dispatchHealthCheckDoesFlush :: Handler HealthReport
dispatchHealthCheckDoesFlush = fmap HealthDoesFlush . runMaybeT $ do
  UniWorX{ appSettings' = AppSettings{..}, appJobState } <- getYesod
  interval <- hoistMaybe $ guardOnM (isn't _JobsOffload appJobMode) appJobFlushInterval
  lFlush <- MaybeT . atomically $ do
    jState <- tryReadTMVar appJobState
    fmap join . for jState $ \JobState{jobContext} -> readTVar $ jobLastFlush jobContext
  now <- liftIO getCurrentTime
  return $ toRational (now `diffUTCTime` lFlush) / toRational interval

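The reported value is the time since the last recorded flush, expressed in multiples of the configured `appJobFlushInterval`; if jobs are offloaded or no flush has been recorded yet, the `MaybeT` pipeline short-circuits and the report carries `Nothing`. A worked example with hypothetical numbers (`flushOverdue` is not project code):

import Data.Time (NominalDiffTime)

-- How many flush intervals have elapsed since the last recorded flush?
flushOverdue :: NominalDiffTime -> NominalDiffTime -> Rational
flushOverdue sinceLastFlush interval = toRational sinceLastFlush / toRational interval

-- flushOverdue 150 60 == 5 % 2, i.e. with a 60 s interval and a flush last seen 150 s ago,
-- the report would carry healthFlushOverdue = Just 2.5.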

@@ -284,6 +284,7 @@ data JobContext = JobContext
  , jobConfirm :: TVar (HashMap JobCtl (NonEmpty (TMVar (Maybe SomeException))))
  , jobHeldLocks :: TVar (Set QueuedJobId)
  , jobOffload :: TMVar JobOffloadHandler
  , jobLastFlush :: TVar (Maybe UTCTime)
  }

@@ -16,6 +16,7 @@ data HealthCheck
  | HealthCheckSMTPConnect
  | HealthCheckWidgetMemcached
  | HealthCheckActiveJobExecutors
  | HealthCheckDoesFlush
  deriving (Eq, Ord, Read, Show, Enum, Bounded, Generic, Typeable)
instance Universe HealthCheck
instance Finite HealthCheck
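
Because `HealthCheck` derives `Enum` and `Bounded` (with `Universe`/`Finite` instances on top), any code that enumerates all checks, whether via `[minBound .. maxBound]` or `Data.Universe`, picks up the new constructor without further changes. A toy illustration with made-up names, not project code:

data Check = CheckA | CheckB | CheckDoesFlush
  deriving (Eq, Show, Enum, Bounded)

allChecks :: [Check]
allChecks = [minBound .. maxBound]  -- [CheckA,CheckB,CheckDoesFlush]
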
@@ -43,6 +44,7 @@ data HealthReport
    -- ^ Can we store values in memcached and retrieve them via HTTP?
  | HealthActiveJobExecutors { healthActiveJobExecutors :: Maybe Rational }
    -- ^ Proportion of job executors (excluding the one running the healthcheck) responding within a timeout
  | HealthDoesFlush { healthFlushOverdue :: Maybe Rational }
  deriving (Eq, Ord, Read, Show, Data, Generic, Typeable)
instance NFData HealthReport
@@ -62,6 +64,7 @@ classifyHealthReport HealthHTTPReachable{} = HealthCheckHTTPReachable
classifyHealthReport HealthSMTPConnect{} = HealthCheckSMTPConnect
classifyHealthReport HealthWidgetMemcached{} = HealthCheckWidgetMemcached
classifyHealthReport HealthActiveJobExecutors{} = HealthCheckActiveJobExecutors
classifyHealthReport HealthDoesFlush{} = HealthCheckDoesFlush
-- | `HealthReport` classified (`classifyHealthReport`) by badness
--
@@ -91,4 +94,6 @@ healthReportStatus = \case
  HealthWidgetMemcached (Just False) -> HealthFailure -- TODO: investigate this failure mode; do we just handle it gracefully?
  HealthActiveJobExecutors (Just prop)
    | prop <= 0 -> HealthFailure
  HealthDoesFlush (Just prop)
    | prop >= 2 -> HealthFailure
  _other -> maxBound -- Minimum badness
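
So the new check only escalates to `HealthFailure` once the overdue ratio reaches 2, i.e. at least two full flush intervals have elapsed without a recorded flush; a report of `Nothing` falls through to `_other` and keeps minimum badness. Restating just this guard with a hypothetical helper (`isFlushFailure` is not project code):

isFlushFailure :: Maybe Rational -> Bool
isFlushFailure (Just prop) = prop >= 2
isFlushFailure Nothing     = False

-- isFlushFailure (Just 2.5) == True, isFlushFailure (Just 0.4) == False, isFlushFailure Nothing == False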