feat(metrics): report on health checks

This commit is contained in:
Gregor Kleen 2020-02-19 13:51:21 +01:00
parent 5ffee38979
commit bec40236db
7 changed files with 132 additions and 53 deletions

View File

@ -463,6 +463,7 @@ ul.list--inline
.deflist
display: grid
grid-template-columns: minmax(0,100%)
grid-row-gap: 7px
.deflist__dt,
.deflist__dd

View File

@ -22,8 +22,6 @@ import Database.Persist.Postgresql (createPostgresqlPool, pgConnStr,
import Import hiding (cancel)
import Language.Haskell.TH.Syntax (qLocation)
import Network.Wai (Middleware)
import qualified Network.Wai as Wai
import qualified Network.HTTP.Types as HTTP
import Network.Wai.Handler.Warp (Settings, defaultSettings,
defaultShouldDisplayException,
runSettings, runSettingsSocket, setHost,
@ -90,15 +88,8 @@ import qualified Data.Set as Set
import Data.Semigroup (Min(..))
import qualified Prometheus.Metric.GHC as Prometheus
import qualified Prometheus
import Data.Time.Clock.POSIX
import Handler.Utils.Routes (classifyHandler)
import Data.List (cycle)
-- Import all relevant handler modules here.
-- (HPack takes care to add new modules to our cabal file nowadays.)
import Handler.News
@ -138,7 +129,7 @@ mkYesodDispatch "UniWorX" resourcesUniWorX
-- migrations handled by Yesod.
makeFoundation :: (MonadResource m, MonadUnliftIO m) => AppSettings -> m UniWorX
makeFoundation appSettings'@AppSettings{..} = do
void $ Prometheus.register Prometheus.ghcMetrics
registerGHCMetrics
-- Some basic initializations: HTTP connection manager, logger, and static
-- subsite.
@ -306,38 +297,8 @@ makeApplication foundation = liftIO $ do
logWare <- makeLogWare foundation
-- Create the WAI application and apply middlewares
appPlain <- toWaiAppPlain foundation
return . prometheusMiddleware . logWare $ defaultMiddlewaresNoLogging appPlain
return . observeHTTPRequestLatency classifyHandler . logWare $ defaultMiddlewaresNoLogging appPlain
where
prometheusMiddleware :: Middleware
prometheusMiddleware app req respond' = do
start <- getPOSIXTime
app req $ \res -> do
end <- getPOSIXTime
let method = decodeUtf8 $ Wai.requestMethod req
status = tshow . HTTP.statusCode $ Wai.responseStatus res
route :: Maybe (Route UniWorX)
route = parseRoute ( Wai.pathInfo req
, over (mapped . _2) (fromMaybe "") . HTTP.queryToQueryText $ Wai.queryString req
)
handler' = pack . classifyHandler <$> route
labels :: Prometheus.Label3
labels = (fromMaybe "n/a" handler', method, status)
Prometheus.withLabel requestLatency labels . flip Prometheus.observe . realToFrac $ end - start
respond' res
{-# NOINLINE requestLatency #-}
requestLatency :: Prometheus.Vector Prometheus.Label3 Prometheus.Histogram
requestLatency = Prometheus.unsafeRegister
$ Prometheus.vector ("handler", "method", "status")
$ Prometheus.histogram info buckets
where info = Prometheus.Info "http_request_duration_seconds"
"HTTP request latency"
buckets = map fromRational . takeWhile (<= 500) . go 50e-6 $ cycle [2, 2, 2.5]
where
go n [] = [n]
go n (f:fs) = n : go (f * n) fs
makeLogWare :: MonadIO m => UniWorX -> m Middleware
@ -368,8 +329,6 @@ makeLogWare app = do
logWare <- either mkLogWare return lookupRes
logWare wai req fin
data ReadySince = MkReadySince
-- | Warp settings for the given foundation value.
warpSettings :: UniWorX -> Settings
warpSettings foundation = defaultSettings
@ -377,13 +336,8 @@ warpSettings foundation = defaultSettings
let notifyReady = do
$logInfoS "setup" "Ready"
void . liftIO $ do
void . Prometheus.register . readyMetric =<< getCurrentTime
registerReadyMetric
Systemd.notifyReady
readyMetric ts = Prometheus.Metric $ return (MkReadySince, collectReadySince)
where
collectReadySince = return [Prometheus.SampleGroup info Prometheus.GaugeType [Prometheus.Sample "ready_time" [] sample]]
info = Prometheus.Info "ready_time" "POSIXTime this Uni2work-instance became ready"
sample = encodeUtf8 . tshow . (realToFrac :: POSIXTime -> Nano) $ utcTimeToPOSIXSeconds ts
if
| foundation ^. _appHealthCheckDelayNotify
-> void . forkIO $ do

View File

@ -10,6 +10,8 @@ import qualified Network.Wai.Middleware.Prometheus as Prometheus
import qualified Data.Text as Text
import qualified Data.HashSet as HashSet
import qualified Data.Set as Set
getMetricsR :: Handler TypedContent
getMetricsR = selectRep $ do
@ -43,3 +45,9 @@ getMetricsR = selectRep $ do
singleSample base [Sample sName lPairs sValue]
| sName == base = Just (lPairs, sValue)
singleSample _ _ = Nothing
showMetricName metricName mSamples = Set.size (Set.fromList sampleBasenames) > 1
where
sampleBasenames = do
Sample sName _ _ <- mSamples
return $ metricBasename metricName sName

View File

@ -6,3 +6,4 @@ import Foundation as Import
import Import.NoFoundation as Import
import Utils.SystemMessage as Import
import Utils.Metrics as Import

View File

@ -32,7 +32,7 @@ import UnliftIO.Concurrent (myThreadId)
generateHealthReport :: HealthCheck -> Handler HealthReport
generateHealthReport = $(dispatchTH ''HealthCheck)
generateHealthReport = withHealthReportMetrics . $(dispatchTH ''HealthCheck)
dispatchHealthCheckMatchingClusterConfig :: Handler HealthReport

113
src/Utils/Metrics.hs Normal file
View File

@ -0,0 +1,113 @@
module Utils.Metrics
( withHealthReportMetrics
, registerGHCMetrics
, observeHTTPRequestLatency
, registerReadyMetric
) where
import Import.NoFoundation hiding (Vector, Info)
import Prometheus
import Prometheus.Metric.GHC
import qualified Data.List as List
import Data.Time.Clock.POSIX
import Network.Wai (Middleware)
import qualified Network.Wai as Wai
import qualified Network.HTTP.Types as HTTP
histogramBuckets :: Rational -- ^ min
-> Rational -- ^ max
-> [Double]
histogramBuckets bMin bMax = map fromRational . takeWhile (<= bMax) . go bMin $ List.cycle factors
where
go n [] = [n]
go n (f:fs) = n : go (f * n) fs
factors
| bMin' `mod` 2 == 0 = [2.5, 2, 2]
| bMin' `mod` 5 == 0 = [2, 2, 2.5]
| otherwise = [2, 2.5, 2]
where
bMin' :: Integer
bMin' = floor . List.head . dropWhile (< 1) $ List.iterate (* 10) bMin
{-# NOINLINE healthReportTime #-}
healthReportTime :: Vector Label2 Gauge
healthReportTime = unsafeRegister . vector ("check", "status") $ gauge Info
{ metricName = "uni2work_health_check_time"
, metricHelp = "POSIXTime of last health check performed by this Uni2work-instance"
}
{-# NOINLINE healthReportDuration #-}
healthReportDuration :: Vector Label2 Histogram
healthReportDuration = unsafeRegister . vector ("check", "status") $ histogram info buckets
where
info = Info
{ metricName = "uni2work_health_check_duration_seconds"
, metricHelp = "Duration of last health check performed by this Uni2work-instance"
}
buckets = histogramBuckets 5e-6 100e-3
{-# NOINLINE httpRequestLatency #-}
httpRequestLatency :: Vector Label3 Histogram
httpRequestLatency = unsafeRegister
$ vector ("handler", "method", "status")
$ histogram info buckets
where info = Info "http_request_duration_seconds"
"HTTP request latency"
buckets = histogramBuckets 50e-6 500
data ReadySince = MkReadySince
readyMetric :: POSIXTime -> Metric ReadySince
readyMetric ts = Metric $ return (MkReadySince, collectReadySince)
where
collectReadySince = return [SampleGroup info GaugeType [Sample "ready_time" [] sample]]
info = Info "ready_time" "POSIXTime this Uni2work-instance became ready"
sample = encodeUtf8 $ tshow (realToFrac ts :: Nano)
withHealthReportMetrics :: MonadIO m => m HealthReport -> m HealthReport
withHealthReportMetrics act = do
before <- liftIO getPOSIXTime
report <- act
after <- liftIO getPOSIXTime
let checkVal = toPathPiece $ classifyHealthReport report
statusVal = toPathPiece $ healthReportStatus report
timeSample, durationSample :: Double
timeSample = realToFrac (realToFrac after :: Nano)
durationSample = realToFrac (realToFrac $ after - before :: Nano)
liftIO $ withLabel healthReportTime (checkVal, statusVal) $ flip setGauge timeSample
liftIO $ withLabel healthReportDuration (checkVal, statusVal) $ flip observe durationSample
return report
registerGHCMetrics :: MonadIO m => m ()
registerGHCMetrics = void $ register ghcMetrics
observeHTTPRequestLatency :: forall site. ParseRoute site => (Route site -> String) -> Middleware
observeHTTPRequestLatency classifyHandler app req respond' = do
start <- getPOSIXTime
app req $ \res -> do
end <- getPOSIXTime
let method = decodeUtf8 $ Wai.requestMethod req
status = tshow . HTTP.statusCode $ Wai.responseStatus res
route :: Maybe (Route site)
route = parseRoute ( Wai.pathInfo req
, over (mapped . _2) (fromMaybe "") . HTTP.queryToQueryText $ Wai.queryString req
)
handler' = pack . classifyHandler <$> route
labels :: Label3
labels = (fromMaybe "n/a" handler', method, status)
withLabel httpRequestLatency labels . flip observe . realToFrac $ end - start
respond' res
registerReadyMetric :: MonadIO m => m ()
registerReadyMetric = liftIO $ void . register . readyMetric =<< getPOSIXTime

View File

@ -29,15 +29,17 @@ $maybe t <- metricsToken
<table .table .table--striped .table--hover>
<thead>
<tr .table__row .table__row--head>
<th .table__th>_{MsgMetricName}
$if showMetricName metricName mSamples
<th .table__th>_{MsgMetricName}
$forall l <- allLabels
<th .table__th style="font-family: monospace">#{l}
<th .table__th>_{MsgMetricValue}
<tbody>
$forall Sample sName lPairs sValue <- mSamples
<tr .table__row>
<td .table__td title=#{sName}>
#{metricBasename metricName sName}
$if showMetricName metricName mSamples
<td .table__td title=#{sName}>
#{metricBasename metricName sName}
$forall l <- allLabels
<td .table__td>
$maybe lValue <- lookup l lPairs