diff --git a/config/settings.yml b/config/settings.yml index a0bf49f47..441096909 100644 --- a/config/settings.yml +++ b/config/settings.yml @@ -50,6 +50,7 @@ health-check-interval: smtp-connect: "_env:HEALTHCHECK_INTERVAL_SMTP_CONNECT:600" widget-memcached: "_env:HEALTHCHECK_INTERVAL_WIDGET_MEMCACHED:600" active-job-executors: "_env:HEALTHCHECK_INTERVAL_ACTIVE_JOB_EXECUTORS:60" + does-flush: "_env:HEALTHCHECK_INTERVAL_DOES_FLUSH:15" health-check-delay-notify: "_env:HEALTHCHECK_DELAY_NOTIFY:true" health-check-http: "_env:HEALTHCHECK_HTTP:true" # Can we assume, that we can reach ourselves under APPROOT via HTTP (reverse proxies or firewalls might prevent this)? diff --git a/src/Jobs.hs b/src/Jobs.hs index c58b9f444..ee817c880 100644 --- a/src/Jobs.hs +++ b/src/Jobs.hs @@ -117,6 +117,7 @@ handleJobs foundation@UniWorX{..} jobCurrentCrontab <- liftIO $ newTVarIO Nothing jobHeldLocks <- liftIO $ newTVarIO Set.empty jobOffload <- liftIO newEmptyTMVarIO + jobLastFlush <- liftIO $ newTVarIO Nothing registerJobHeldLocksCount jobHeldLocks registerJobWorkerQueueDepth appJobState atomically $ putTMVar appJobState JobState @@ -501,6 +502,8 @@ handleJobs' wNum = C.mapM_ $ \jctl -> hoist delimitInternalState . withJobWorker void . lift . runDB . runConduit $ selectKeys [ QueuedJobId /<-. Set.toList heldLocks ] [ Asc QueuedJobCreationTime ] .| C.mapM_ (\j -> lift $ runReaderT (writeJobCtl $ JobCtlPerform j) =<< getYesod) + lFlushTVar <- asks jobLastFlush + atomically . modifyTVar' lFlushTVar . max . Just =<< liftIO getCurrentTime $logInfoS logIdent "JobCtlFlush" handleCmd (JobCtlQueue job) = do $logDebugS logIdent "JobCtlQueue..." diff --git a/src/Jobs/HealthReport.hs b/src/Jobs/HealthReport.hs index 83c685700..ec6debd87 100644 --- a/src/Jobs/HealthReport.hs +++ b/src/Jobs/HealthReport.hs @@ -173,3 +173,16 @@ dispatchHealthCheckActiveJobExecutors = HealthActiveJobExecutors <$> do if | Map.null workers -> return Nothing | otherwise -> return . Just $ responders % fromIntegral (Map.size workers) + + +dispatchHealthCheckDoesFlush :: Handler HealthReport +dispatchHealthCheckDoesFlush = fmap HealthDoesFlush . runMaybeT $ do + UniWorX{ appSettings' = AppSettings{..}, appJobState } <- getYesod + + interval <- hoistMaybe $ guardOnM (isn't _JobsOffload appJobMode) appJobFlushInterval + lFlush <- MaybeT . atomically $ do + jState <- tryReadTMVar appJobState + fmap join . for jState $ \JobState{jobContext} -> readTVar $ jobLastFlush jobContext + + now <- liftIO getCurrentTime + return $ toRational (now `diffUTCTime` lFlush) / toRational interval diff --git a/src/Jobs/Types.hs b/src/Jobs/Types.hs index ded70d4af..81c741d12 100644 --- a/src/Jobs/Types.hs +++ b/src/Jobs/Types.hs @@ -284,6 +284,7 @@ data JobContext = JobContext , jobConfirm :: TVar (HashMap JobCtl (NonEmpty (TMVar (Maybe SomeException)))) , jobHeldLocks :: TVar (Set QueuedJobId) , jobOffload :: TMVar JobOffloadHandler + , jobLastFlush :: TVar (Maybe UTCTime) } diff --git a/src/Model/Types/Health.hs b/src/Model/Types/Health.hs index 0dc4af620..6cacd2148 100644 --- a/src/Model/Types/Health.hs +++ b/src/Model/Types/Health.hs @@ -16,6 +16,7 @@ data HealthCheck | HealthCheckSMTPConnect | HealthCheckWidgetMemcached | HealthCheckActiveJobExecutors + | HealthCheckDoesFlush deriving (Eq, Ord, Read, Show, Enum, Bounded, Generic, Typeable) instance Universe HealthCheck instance Finite HealthCheck @@ -43,6 +44,7 @@ data HealthReport -- ^ Can we store values in memcached and retrieve them via HTTP? | HealthActiveJobExecutors { healthActiveJobExecutors :: Maybe Rational } -- ^ Proportion of job executors (excluding the one running the healthcheck) responding within a timeout + | HealthDoesFlush { healthFlushOverdue :: Maybe Rational } deriving (Eq, Ord, Read, Show, Data, Generic, Typeable) instance NFData HealthReport @@ -62,6 +64,7 @@ classifyHealthReport HealthHTTPReachable{} = HealthCheckHTTPReachable classifyHealthReport HealthSMTPConnect{} = HealthCheckSMTPConnect classifyHealthReport HealthWidgetMemcached{} = HealthCheckWidgetMemcached classifyHealthReport HealthActiveJobExecutors{} = HealthCheckActiveJobExecutors +classifyHealthReport HealthDoesFlush{} = HealthCheckDoesFlush -- | `HealthReport` classified (`classifyHealthReport`) by badness -- @@ -91,4 +94,6 @@ healthReportStatus = \case HealthWidgetMemcached (Just False) -> HealthFailure -- TODO: investigate this failure mode; do we just handle it gracefully? HealthActiveJobExecutors (Just prop ) | prop <= 0 -> HealthFailure + HealthDoesFlush (Just prop ) + | prop >= 2 -> HealthFailure _other -> maxBound -- Minimum badness