chore(health): monitor job flushes

Gregor Kleen 2021-06-21 15:53:42 +02:00
parent 6c12737ad9
commit aa1c0c8a3e
5 changed files with 23 additions and 0 deletions

@@ -50,6 +50,7 @@ health-check-interval:
  smtp-connect: "_env:HEALTHCHECK_INTERVAL_SMTP_CONNECT:600"
  widget-memcached: "_env:HEALTHCHECK_INTERVAL_WIDGET_MEMCACHED:600"
  active-job-executors: "_env:HEALTHCHECK_INTERVAL_ACTIVE_JOB_EXECUTORS:60"
  does-flush: "_env:HEALTHCHECK_INTERVAL_DOES_FLUSH:15"
health-check-delay-notify: "_env:HEALTHCHECK_DELAY_NOTIFY:true"
health-check-http: "_env:HEALTHCHECK_HTTP:true" # Can we assume that we can reach ourselves under APPROOT via HTTP (reverse proxies or firewalls might prevent this)?

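For context, the `_env:NAME:default` strings above follow the Yesod settings convention (presumably handled by Data.Yaml.Config's `useEnv`, as in the Yesod scaffolding): each value is read from the named environment variable when it is set and falls back to the inline default (15 for the new check) otherwise. A minimal, hypothetical sketch of that mechanism; `DemoSettings`, its field, and the file path are made up and not part of this codebase:

{-# LANGUAGE OverloadedStrings #-}

import Data.Aeson (FromJSON (..), withObject, (.:))
import Data.Yaml.Config (loadYamlSettings, useEnv)

-- Toy settings type, standing in for the real AppSettings.
newtype DemoSettings = DemoSettings { doesFlushInterval :: Int }
  deriving Show

instance FromJSON DemoSettings where
  parseJSON = withObject "DemoSettings" $ \o -> do
    intervals <- o .: "health-check-interval"
    DemoSettings <$> intervals .: "does-flush"

main :: IO ()
main = do
  -- With useEnv, HEALTHCHECK_INTERVAL_DOES_FLUSH (when set) overrides the default of 15.
  settings <- loadYamlSettings ["config/settings.yml"] [] useEnv :: IO DemoSettings
  print settings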

@@ -117,6 +117,7 @@ handleJobs foundation@UniWorX{..}
  jobCurrentCrontab <- liftIO $ newTVarIO Nothing
  jobHeldLocks <- liftIO $ newTVarIO Set.empty
  jobOffload <- liftIO newEmptyTMVarIO
  jobLastFlush <- liftIO $ newTVarIO Nothing
  registerJobHeldLocksCount jobHeldLocks
  registerJobWorkerQueueDepth appJobState
  atomically $ putTMVar appJobState JobState
@@ -501,6 +502,8 @@ handleJobs' wNum = C.mapM_ $ \jctl -> hoist delimitInternalState . withJobWorker
      void . lift . runDB . runConduit
        $ selectKeys [ QueuedJobId /<-. Set.toList heldLocks ] [ Asc QueuedJobCreationTime ]
        .| C.mapM_ (\j -> lift $ runReaderT (writeJobCtl $ JobCtlPerform j) =<< getYesod)
      lFlushTVar <- asks jobLastFlush
      atomically . modifyTVar' lFlushTVar . max . Just =<< liftIO getCurrentTime
      $logInfoS logIdent "JobCtlFlush"
    handleCmd (JobCtlQueue job) = do
      $logDebugS logIdent "JobCtlQueue..."

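For context, the `max . Just` update above is what keeps `jobLastFlush` monotone: `Maybe UTCTime` orders `Nothing` below every `Just`, and `Just` preserves the ordering on `UTCTime`, so the stored timestamp can only ever move forward. A small self-contained sketch of the same pattern (`recordFlush` is a hypothetical name, not project code):

import Control.Concurrent.STM (TVar, atomically, modifyTVar', newTVarIO, readTVarIO)
import Data.Time (UTCTime, getCurrentTime)

-- Record the time of the most recent flush; max (Just now) never moves the value backwards.
recordFlush :: TVar (Maybe UTCTime) -> IO ()
recordFlush var = do
  now <- getCurrentTime
  atomically $ modifyTVar' var (max (Just now))

main :: IO ()
main = do
  lastFlush <- newTVarIO Nothing   -- same initial state as jobLastFlush in handleJobs
  recordFlush lastFlush
  readTVarIO lastFlush >>= print   -- Just <time of the flush>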

@@ -173,3 +173,16 @@ dispatchHealthCheckActiveJobExecutors = HealthActiveJobExecutors <$> do
  if
    | Map.null workers -> return Nothing
    | otherwise -> return . Just $ responders % fromIntegral (Map.size workers)

dispatchHealthCheckDoesFlush :: Handler HealthReport
dispatchHealthCheckDoesFlush = fmap HealthDoesFlush . runMaybeT $ do
  UniWorX{ appSettings' = AppSettings{..}, appJobState } <- getYesod
  interval <- hoistMaybe $ guardOnM (isn't _JobsOffload appJobMode) appJobFlushInterval
  lFlush <- MaybeT . atomically $ do
    jState <- tryReadTMVar appJobState
    fmap join . for jState $ \JobState{jobContext} -> readTVar $ jobLastFlush jobContext
  now <- liftIO getCurrentTime
  return $ toRational (now `diffUTCTime` lFlush) / toRational interval

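The reported value is the time since the last recorded flush, expressed in multiples of the configured `appJobFlushInterval`; if jobs are offloaded or no flush has been recorded yet, the `MaybeT` pipeline short-circuits and the report carries `Nothing`. A worked example with hypothetical numbers (`flushOverdue` is not project code):

import Data.Time (NominalDiffTime)

-- How many flush intervals have elapsed since the last recorded flush?
flushOverdue :: NominalDiffTime -> NominalDiffTime -> Rational
flushOverdue sinceLastFlush interval = toRational sinceLastFlush / toRational interval

-- flushOverdue 150 60 == 5 % 2, i.e. with a 60 s interval and a flush last seen 150 s ago,
-- the report would carry healthFlushOverdue = Just 2.5.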

@@ -284,6 +284,7 @@ data JobContext = JobContext
  , jobConfirm :: TVar (HashMap JobCtl (NonEmpty (TMVar (Maybe SomeException))))
  , jobHeldLocks :: TVar (Set QueuedJobId)
  , jobOffload :: TMVar JobOffloadHandler
  , jobLastFlush :: TVar (Maybe UTCTime)
  }

@@ -16,6 +16,7 @@ data HealthCheck
  | HealthCheckSMTPConnect
  | HealthCheckWidgetMemcached
  | HealthCheckActiveJobExecutors
  | HealthCheckDoesFlush
  deriving (Eq, Ord, Read, Show, Enum, Bounded, Generic, Typeable)
instance Universe HealthCheck
instance Finite HealthCheck
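
Because `HealthCheck` derives `Enum` and `Bounded` (with `Universe`/`Finite` instances on top), any code that enumerates all checks, whether via `[minBound .. maxBound]` or `Data.Universe`, picks up the new constructor without further changes. A toy illustration with made-up names, not project code:

data Check = CheckA | CheckB | CheckDoesFlush
  deriving (Eq, Show, Enum, Bounded)

allChecks :: [Check]
allChecks = [minBound .. maxBound]  -- [CheckA,CheckB,CheckDoesFlush]
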
@@ -43,6 +44,7 @@ data HealthReport
    -- ^ Can we store values in memcached and retrieve them via HTTP?
  | HealthActiveJobExecutors { healthActiveJobExecutors :: Maybe Rational }
    -- ^ Proportion of job executors (excluding the one running the healthcheck) responding within a timeout
  | HealthDoesFlush { healthFlushOverdue :: Maybe Rational }
  deriving (Eq, Ord, Read, Show, Data, Generic, Typeable)
instance NFData HealthReport
@@ -62,6 +64,7 @@ classifyHealthReport HealthHTTPReachable{} = HealthCheckHTTPReachable
classifyHealthReport HealthSMTPConnect{} = HealthCheckSMTPConnect
classifyHealthReport HealthWidgetMemcached{} = HealthCheckWidgetMemcached
classifyHealthReport HealthActiveJobExecutors{} = HealthCheckActiveJobExecutors
classifyHealthReport HealthDoesFlush{} = HealthCheckDoesFlush
-- | `HealthReport` classified (`classifyHealthReport`) by badness
--
@@ -91,4 +94,6 @@ healthReportStatus = \case
  HealthWidgetMemcached (Just False) -> HealthFailure -- TODO: investigate this failure mode; do we just handle it gracefully?
  HealthActiveJobExecutors (Just prop)
    | prop <= 0 -> HealthFailure
  HealthDoesFlush (Just prop)
    | prop >= 2 -> HealthFailure
  _other -> maxBound -- Minimum badness
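
So the new check only escalates to `HealthFailure` once the overdue ratio reaches 2, i.e. at least two full flush intervals have elapsed without a recorded flush; a report of `Nothing` falls through to `_other` and keeps minimum badness. Restating just this guard with a hypothetical helper (`isFlushFailure` is not project code):

isFlushFailure :: Maybe Rational -> Bool
isFlushFailure (Just prop) = prop >= 2
isFlushFailure Nothing     = False

-- isFlushFailure (Just 2.5) == True, isFlushFailure (Just 0.4) == False, isFlushFailure Nothing == False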