fradrive/src/Utils/Metrics.hs
2020-10-14 12:40:08 +02:00

359 lines
14 KiB
Haskell

{-# OPTIONS_GHC -fno-warn-incomplete-uni-patterns #-}
module Utils.Metrics
( withHealthReportMetrics
, registerGHCMetrics
, observeHTTPRequestLatency
, registerReadyMetric
, withJobWorkerState
, observeYesodCacheSize
, observeFavouritesQuickActionsDuration
, LoginOutcome(..), observeLoginOutcome
, registerJobHeldLocksCount
, FileChunkStorage(..), observeSourcedChunk, observeSunkChunk
, observeDeletedUnreferencedFiles, observeDeletedUnreferencedChunks, observeInjectedFiles, observeRechunkedFiles
, registerJobWorkerQueueDepth
, observeMissingFiles
) where
import Import.NoModel hiding (Vector, Info)
import Model
import Prometheus
import Prometheus.Metric.GHC
import qualified Data.List as List
import System.Clock
import Data.Time.Clock.POSIX
import Network.Wai (Middleware)
import qualified Network.Wai as Wai
import qualified Network.HTTP.Types as HTTP
import Yesod.Core.Types (HandlerData(..), GHState(..))
import qualified Data.Set as Set
import Jobs.Types
import qualified Data.Aeson as Aeson
import qualified Data.HashMap.Strict as HashMap
{-# ANN module ("HLint: ignore Use even" :: String) #-}
histogramBuckets :: Rational -- ^ min
-> Rational -- ^ max
-> [Double]
histogramBuckets bMin bMax = map fromRational . takeWhile (<= bMax) . go bMin $ List.cycle factors
where
go n [] = [n]
go n (f:fs) = n : go (f * n) fs
factors
| bMin' `mod` 2 == 0 = [2.5, 2, 2]
| bMin' `mod` 5 == 0 = [2, 2, 2.5]
| otherwise = [2, 2.5, 2]
where
bMin' :: Integer
bMin' = floor . List.head . dropWhile (< 1) $ List.iterate (* 10) bMin
{-# NOINLINE healthReportTime #-}
healthReportTime :: Vector Label2 Gauge
healthReportTime = unsafeRegister . vector ("check", "status") $ gauge info
where
info = Info "uni2work_health_check_time"
"POSIXTime of last health check performed by this Uni2work-instance"
{-# NOINLINE healthReportDuration #-}
healthReportDuration :: Vector Label2 Histogram
healthReportDuration = unsafeRegister . vector ("check", "status") $ histogram info buckets
where
info = Info "uni2work_health_check_duration_seconds"
"Duration of last health check performed by this Uni2work-instance"
buckets = histogramBuckets 5e-6 100e-3
{-# NOINLINE httpRequestLatency #-}
httpRequestLatency :: Vector Label3 Histogram
httpRequestLatency = unsafeRegister . vector ("handler", "method", "status") $ histogram info buckets
where info = Info "http_request_duration_seconds"
"HTTP request latency"
buckets = histogramBuckets 50e-6 500
data ReadySince = MkReadySince
readyMetric :: POSIXTime -> Metric ReadySince
readyMetric ts = Metric $ return (MkReadySince, collectReadySince)
where
collectReadySince = return [SampleGroup info GaugeType [Sample "ready_time" [] sample]]
info = Info "ready_time"
"POSIXTime this Uni2work-instance became ready"
sample = encodeUtf8 $ tshow (realToFrac ts :: Nano)
{-# NOINLINE jobWorkerStateDuration #-}
jobWorkerStateDuration :: Vector Label4 Histogram
jobWorkerStateDuration = unsafeRegister . vector ("worker", "state", "jobctl", "task") $ histogram info buckets
where info = Info "uni2work_job_worker_state_duration_seconds"
"Duration of time a Uni2work job executor spent in a certain state"
buckets = histogramBuckets 1e-6 5000
{-# NOINLINE jobWorkerStateTransitions #-}
jobWorkerStateTransitions :: Vector Label4 Counter
jobWorkerStateTransitions = unsafeRegister . vector ("worker", "state", "jobctl", "task") $ counter info
where info = Info "uni2work_job_worker_state_transitions_total"
"Number of times a Uni2work job executor entered a certain state"
{-# NOINLINE yesodCacheSize #-}
yesodCacheSize :: Histogram
yesodCacheSize = unsafeRegister $ histogram info buckets
where info = Info "yesod_ghs_cache_items"
"Number of items in Yesod's ghsCache and ghsCacheBy"
buckets = 0 : histogramBuckets 1 1e6
{-# NOINLINE favouritesQuickActionsDuration #-}
favouritesQuickActionsDuration :: Histogram
favouritesQuickActionsDuration = unsafeRegister $ histogram info buckets
where info = Info "uni2work_favourites_quick_actions_seconds"
"Duration of time needed to calculate a set of favourite quick actions"
buckets = histogramBuckets 500e-6 50
{-# NOINLINE loginOutcomes #-}
loginOutcomes :: Vector Label2 Counter
loginOutcomes = unsafeRegister . vector ("plugin", "outcome") $ counter info
where info = Info "uni2work_login_attempts_total"
"Number of login attempts"
data JobHeldLocksCount = MkJobHeldLocksCount
jobHeldLocksCount :: TVar (Set QueuedJobId) -> Metric JobHeldLocksCount
jobHeldLocksCount heldLocks = Metric $ return (MkJobHeldLocksCount, collectJobHeldLocksCount)
where
collectJobHeldLocksCount = do
nLocks <- Set.size <$> readTVarIO heldLocks
let sample = encodeUtf8 $ tshow nLocks
return [SampleGroup info GaugeType [Sample "uni2work_jobs_held_locks_count" [] sample]]
info = Info "uni2work_jobs_held_locks_count"
"Number of job locks currently held by this Uni2work-instance"
{-# NOINLINE sourcedFileChunkSizes #-}
sourcedFileChunkSizes :: Vector Label1 Histogram
sourcedFileChunkSizes = unsafeRegister . vector "storage" $ histogram info buckets
where info = Info "uni2work_sourced_file_chunks_bytes"
"Sizes of file chunks sourced"
buckets = 0 : histogramBuckets 1 1000000000
{-# NOINLINE sunkFileChunkSizes #-}
sunkFileChunkSizes :: Vector Label1 Histogram
sunkFileChunkSizes = unsafeRegister . vector "storage" $ histogram info buckets
where info = Info "uni2work_sunk_file_chunks_bytes"
"Sizes of file chunks sunk"
buckets = 0 : histogramBuckets 1 1000000000
{-# NOINLINE deletedUnreferencedFiles #-}
deletedUnreferencedFiles :: Counter
deletedUnreferencedFiles = unsafeRegister $ counter info
where info = Info "uni2work_deleted_unreferenced_files_count"
"Number of unreferenced files deleted"
{-# NOINLINE deletedUnreferencedChunks #-}
deletedUnreferencedChunks :: Counter
deletedUnreferencedChunks = unsafeRegister $ counter info
where info = Info "uni2work_deleted_unreferenced_chunks_count"
"Number of unreferenced chunks deleted"
{-# NOINLINE deletedUnreferencedChunksBytes #-}
deletedUnreferencedChunksBytes :: Counter
deletedUnreferencedChunksBytes = unsafeRegister $ counter info
where info = Info "uni2work_deleted_unreferenced_chunks_bytes"
"Size of unreferenced chunks deleted"
{-# NOINLINE injectedFiles #-}
injectedFiles :: Counter
injectedFiles = unsafeRegister $ counter info
where info = Info "uni2work_injected_files_count"
"Number of files injected from upload cache into database"
{-# NOINLINE injectedFilesBytes #-}
injectedFilesBytes :: Counter
injectedFilesBytes = unsafeRegister $ counter info
where info = Info "uni2work_injected_files_bytes"
"Size of files injected from upload cache into database"
{-# NOINLINE rechunkedFiles #-}
rechunkedFiles :: Counter
rechunkedFiles = unsafeRegister $ counter info
where info = Info "uni2work_rechunked_files_count"
"Number of files rechunked within database"
{-# NOINLINE rechunkedFilesBytes #-}
rechunkedFilesBytes :: Counter
rechunkedFilesBytes = unsafeRegister $ counter info
where info = Info "uni2work_rechunked_files_bytes"
"Size of files rechunked within database"
data JobWorkerQueueDepth = MkJobWorkerQueueDepth
jobWorkerQueueDepth :: TMVar JobState -> Metric JobWorkerQueueDepth
jobWorkerQueueDepth jSt = Metric $ return (MkJobWorkerQueueDepth, collectJobWorkerQueueDepth)
where
collectJobWorkerQueueDepth = maybeT (return []) $ do
wQueues <- hoist atomically $ do
JobState{..} <- MaybeT $ tryReadTMVar jSt
flip ifoldMapM jobWorkers $ \wAsync wQueue
-> lift $ pure . (jobWorkerName wAsync, ) . jqDepth <$> readTVar wQueue
return [ SampleGroup info GaugeType
[ Sample "uni2work_queued_jobs_count" [("worker", showWorkerId wName)] . encodeUtf8 $ tshow wDepth
| (wName, wDepth) <- wQueues
]
]
info = Info "uni2work_queued_jobs_count"
"Number of JobQueue entries in this Uni2work-instance"
{-# NOINLINE missingFiles #-}
missingFiles :: Vector Label1 Gauge
missingFiles = unsafeRegister . vector "ref" $ gauge info
where info = Info "uni2work_missing_files_count"
"Number of files referenced from within database that are missing"
withHealthReportMetrics :: MonadIO m => m HealthReport -> m HealthReport
withHealthReportMetrics act = do
before <- liftIO $ getTime Monotonic
report <- act
after <- liftIO $ getTime Monotonic
let checkVal = toPathPiece $ classifyHealthReport report
statusVal = toPathPiece $ healthReportStatus report
timeSample, durationSample :: Double
timeSample = realToFrac (realToFrac after :: Nano)
durationSample = realToFrac (realToFrac $ after - before :: Nano)
liftIO $ withLabel healthReportTime (checkVal, statusVal) $ flip setGauge timeSample
liftIO $ withLabel healthReportDuration (checkVal, statusVal) $ flip observe durationSample
return report
registerGHCMetrics :: MonadIO m => m ()
registerGHCMetrics = void $ register ghcMetrics
observeHTTPRequestLatency :: forall site. ParseRoute site => (Route site -> String) -> Middleware
observeHTTPRequestLatency classifyHandler app req respond' = do
start <- getTime Monotonic
app req $ \res -> do
end <- getTime Monotonic
let method = decodeUtf8 $ Wai.requestMethod req
status = tshow . HTTP.statusCode $ Wai.responseStatus res
route :: Maybe (Route site)
route = parseRoute ( Wai.pathInfo req
, over (mapped . _2) (fromMaybe "") . HTTP.queryToQueryText $ Wai.queryString req
)
handler' = pack . classifyHandler <$> route
labels :: Label3
labels = (fromMaybe "n/a" handler', method, status)
withLabel httpRequestLatency labels . flip observe . realToFrac $ end - start
respond' res
registerReadyMetric :: MonadIO m => m ()
registerReadyMetric = liftIO $ void . register . readyMetric =<< getPOSIXTime
classifyJobWorkerState :: JobWorkerId -> JobWorkerState -> Prometheus.Label4
classifyJobWorkerState wId jws = (showWorkerId wId, tag, maybe "n/a" pack mJobCtl, maybe "n/a" pack mJob)
where
Aeson.Object obj = Aeson.toJSON jws
Aeson.String tag = obj HashMap.! "state"
mJobCtl = asum
[ classifyJobCtl <$> jws ^? _jobWorkerJobCtl
, "perform" <$ jws ^? _jobWorkerJob
]
mJob = classifyJob <$> jws ^? _jobWorkerJob
withJobWorkerState :: (MonadIO m, MonadMask m) => JobWorkerId -> JobWorkerState -> m a -> m a
withJobWorkerState wId newSt = withJobWorkerStateLbls $ classifyJobWorkerState wId newSt
withJobWorkerStateLbls :: (MonadIO m, MonadMask m) => Label4 -> m a -> m a
withJobWorkerStateLbls newLbls act = do
liftIO $ withLabel jobWorkerStateTransitions newLbls incCounter
start <- liftIO $ getTime Monotonic
res <- handleAll (return . Left) $ Right <$> act
end <- liftIO $ getTime Monotonic
liftIO . withLabel jobWorkerStateDuration newLbls . flip observe . realToFrac $ end - start
either throwM return res
observeYesodCacheSize :: MonadHandler m => m ()
observeYesodCacheSize = do
HandlerData{handlerState} <- liftHandler ask
liftIO $ do
GHState{..} <- readIORef handlerState
let size = fromIntegral $ length ghsCache + length ghsCacheBy
observe yesodCacheSize size
observeFavouritesQuickActionsDuration :: (MonadIO m, MonadMask m) => m a -> m a
observeFavouritesQuickActionsDuration act = do
start <- liftIO $ getTime Monotonic
res <- handleAll (return . Left) $ Right <$> act
end <- liftIO $ getTime Monotonic
liftIO . observe favouritesQuickActionsDuration . realToFrac $ end - start
either throwM return res
data LoginOutcome
= LoginSuccessful
| LoginInvalidCredentials
| LoginADInvalidCredentials
| LoginError
deriving (Eq, Ord, Read, Show, Enum, Bounded, Generic, Typeable)
deriving anyclass (Universe, Finite)
nullaryPathPiece ''LoginOutcome $ camelToPathPiece' 1
observeLoginOutcome :: MonadHandler m
=> Text -- ^ Plugin
-> LoginOutcome
-> m ()
observeLoginOutcome plugin outcome
= liftIO $ withLabel loginOutcomes (plugin, toPathPiece outcome) incCounter
registerJobHeldLocksCount :: MonadIO m => TVar (Set QueuedJobId) -> m ()
registerJobHeldLocksCount = liftIO . void . register . jobHeldLocksCount
data FileChunkStorage = StorageMinio | StorageDB
deriving (Eq, Ord, Read, Show, Enum, Bounded, Generic, Typeable)
deriving anyclass (Universe, Finite)
nullaryPathPiece ''FileChunkStorage $ camelToPathPiece' 1
observeSunkChunk, observeSourcedChunk :: (Integral n, MonadIO m) => FileChunkStorage -> n -> m ()
observeSunkChunk store = liftIO . observeChunkSize sunkFileChunkSizes store . fromIntegral
observeSourcedChunk store = liftIO . observeChunkSize sourcedFileChunkSizes store . fromIntegral
observeChunkSize :: Vector Label1 Histogram -> FileChunkStorage -> Integer -> IO ()
observeChunkSize metric (toPathPiece -> storageLabel) = withLabel metric storageLabel . flip observe . fromInteger
observeDeletedUnreferencedFiles :: MonadIO m => Natural -> m ()
observeDeletedUnreferencedFiles = liftIO . void . addCounter deletedUnreferencedFiles . fromIntegral
observeDeletedUnreferencedChunks :: MonadIO m => Natural -> Word64 -> m ()
observeDeletedUnreferencedChunks num size = liftIO $ do
void . addCounter deletedUnreferencedChunks $ fromIntegral num
void . addCounter deletedUnreferencedChunksBytes $ fromIntegral size
observeInjectedFiles :: MonadIO m => Natural -> Word64 -> m ()
observeInjectedFiles num size = liftIO $ do
void . addCounter injectedFiles $ fromIntegral num
void . addCounter injectedFilesBytes $ fromIntegral size
observeRechunkedFiles :: MonadIO m => Natural -> Word64 -> m ()
observeRechunkedFiles num size = liftIO $ do
void . addCounter rechunkedFiles $ fromIntegral num
void . addCounter rechunkedFilesBytes $ fromIntegral size
registerJobWorkerQueueDepth :: MonadIO m => TMVar JobState -> m ()
registerJobWorkerQueueDepth = liftIO . void . register . jobWorkerQueueDepth
observeMissingFiles :: MonadIO m => Text -> Int -> m ()
observeMissingFiles refIdent = liftIO . withLabel missingFiles refIdent . flip setGauge . fromIntegral