feat(metrics): monitor job durations

This commit is contained in:
Gregor Kleen 2020-02-21 13:28:52 +01:00
parent 697c3e11fc
commit 0da6c49392
3 changed files with 52 additions and 15 deletions

View File

@ -402,7 +402,7 @@ handleJobs' wNum = C.mapM_ $ \jctl -> do
instanceID' <- getsYesod $ view instanceID
now <- liftIO getCurrentTime
performJob content
withJobDuration (classifyJob content) $ performJob content
-- `performJob` is expected to throw an exception if it detects that the job was not done
runDB . setSerializable $ do

View File

@ -2,6 +2,7 @@
module Jobs.Types
( Job(..), Notification(..)
, classifyJob
, JobCtl(..)
, JobContext(..)
, JobState(..)
@ -12,11 +13,14 @@ module Jobs.Types
, JobPriority(..), prioritiseJob
) where
import Import.NoFoundation hiding (Unique)
import Import.NoFoundation hiding (Unique, state)
import qualified Data.Aeson as Aeson
import Data.Aeson (defaultOptions, Options(..), SumEncoding(..))
import Data.Aeson.TH (deriveJSON)
import qualified Data.HashMap.Strict as HashMap
import Data.List.NonEmpty (NonEmpty)
import Data.Unique
@ -27,6 +31,8 @@ import qualified Data.Set as Set
import Data.PQueue.Prio.Max (MaxPQueue)
import qualified Data.PQueue.Prio.Max as PQ
import Utils.Metrics (observeJobQueueDepth)
data Job = JobSendNotification { jRecipient :: UserId, jNotification :: Notification }
| JobSendTestEmail { jEmail :: Email, jMailContext :: MailContext }
@ -116,6 +122,12 @@ deriveJSON defaultOptions
, sumEncoding = TaggedObject "notification" "data"
} ''Notification
classifyJob :: Job -> String
classifyJob job = unpack tag
where
Aeson.Object obj = Aeson.toJSON job
Aeson.String tag = obj HashMap.! "job"
data JobCtl = JobCtlFlush
| JobCtlPerform QueuedJobId
@ -151,6 +163,8 @@ instance Universe JobPriority
instance Finite JobPriority
instance NFData JobPriority
nullaryPathPiece ''JobPriority $ camelToPathPiece' 2
prioritiseJob :: JobCtl -> JobPriority
prioritiseJob JobCtlTest = JobPrioRealtime
prioritiseJob (JobCtlGenerateHealthReport _) = JobPrioRealtime

View File

@ -3,6 +3,7 @@ module Utils.Metrics
, registerGHCMetrics
, observeHTTPRequestLatency
, registerReadyMetric
, withJobDuration
) where
import Import.NoFoundation hiding (Vector, Info)
@ -37,28 +38,24 @@ histogramBuckets bMin bMax = map fromRational . takeWhile (<= bMax) . go bMin $
{-# NOINLINE healthReportTime #-}
healthReportTime :: Vector Label2 Gauge
healthReportTime = unsafeRegister . vector ("check", "status") $ gauge Info
{ metricName = "uni2work_health_check_time"
, metricHelp = "POSIXTime of last health check performed by this Uni2work-instance"
}
healthReportTime = unsafeRegister . vector ("check", "status") $ gauge info
where
info = Info "uni2work_health_check_time"
"POSIXTime of last health check performed by this Uni2work-instance"
{-# NOINLINE healthReportDuration #-}
healthReportDuration :: Vector Label2 Histogram
healthReportDuration = unsafeRegister . vector ("check", "status") $ histogram info buckets
where
info = Info
{ metricName = "uni2work_health_check_duration_seconds"
, metricHelp = "Duration of last health check performed by this Uni2work-instance"
}
info = Info "uni2work_health_check_duration_seconds"
"Duration of last health check performed by this Uni2work-instance"
buckets = histogramBuckets 5e-6 100e-3
{-# NOINLINE httpRequestLatency #-}
httpRequestLatency :: Vector Label3 Histogram
httpRequestLatency = unsafeRegister
$ vector ("handler", "method", "status")
$ histogram info buckets
httpRequestLatency = unsafeRegister . vector ("handler", "method", "status") $ histogram info buckets
where info = Info "http_request_duration_seconds"
"HTTP request latency"
"HTTP request latency"
buckets = histogramBuckets 50e-6 500
data ReadySince = MkReadySince
@ -67,9 +64,23 @@ readyMetric :: POSIXTime -> Metric ReadySince
readyMetric ts = Metric $ return (MkReadySince, collectReadySince)
where
collectReadySince = return [SampleGroup info GaugeType [Sample "ready_time" [] sample]]
info = Info "ready_time" "POSIXTime this Uni2work-instance became ready"
info = Info "ready_time"
"POSIXTime this Uni2work-instance became ready"
sample = encodeUtf8 $ tshow (realToFrac ts :: Nano)
{-# NOINLINE jobDuration #-}
jobDuration :: Vector Label2 Histogram
jobDuration = unsafeRegister . vector ("kind", "status") $ histogram info buckets
where info = Info "uni2work_job_duration_seconds"
"Duration of time taken to execute a background job"
buckets = histogramBuckets 5e-6 500
{-# NOINLINE jobQueueDepth #-}
jobQueueDepth :: Vector Label2 Gauge
jobQueueDepth = unsafeRegister . vector ("worker", "priority") $ gauge info
where info = Info "uni2work_job_queue_size_count"
"Current depth of worker queue"
withHealthReportMetrics :: MonadIO m => m HealthReport -> m HealthReport
withHealthReportMetrics act = do
@ -111,3 +122,15 @@ observeHTTPRequestLatency classifyHandler app req respond' = do
registerReadyMetric :: MonadIO m => m ()
registerReadyMetric = liftIO $ void . register . readyMetric =<< getPOSIXTime
withJobDuration :: (MonadIO m, MonadCatch m) => String -> m a -> m a
withJobDuration job doJob = do
start <- liftIO getPOSIXTime
res <- handleAll (return . Left) $ Right <$> doJob
end <- liftIO getPOSIXTime
liftIO . withLabel jobDuration (pack job, bool "failure" "success" $ is _Right res) . flip observe . realToFrac $ end - start
case res of
Left exc -> throwM exc
Right res' -> return res'