feat(health): timeout all health checks

This commit is contained in:
Gregor Kleen 2019-09-10 09:41:52 +02:00
parent 5ee6d33325
commit 33338cdfe9
4 changed files with 46 additions and 10 deletions

View File

@ -39,7 +39,13 @@ health-check-interval:
active-job-executors: "_env:HEALTHCHECK_INTERVAL_ACTIVE_JOB_EXECUTORS:60"
health-check-delay-notify: "_env:HEALTHCHECK_DELAY_NOTIFY:true"
health-check-http: "_env:HEALTHCHECK_HTTP:true" # Can we assume, that we can reach ourselves under APPROOT via HTTP (reverse proxies or firewalls might prevent this)?
health-check-active-job-executors-timeout: "_env:HEALTHCHECK_ACTIVE_JOB_EXECUTORS_TIMEOUT:5"
health-check-active-widget-memcached-timeout: "_env:HEALTHCHECK_ACTIVE_WIDGET_MEMCACHED_TIMEOUT:2"
health-check-smtp-connect-timeout: "_env:HEALTHCHECK_SMTP_CONNECT_TIMEOUT:5"
health-check-ldap-admins-timeout: "_env:HEALTHCHECK_LDAP_ADMINS_TIMEOUT:60"
health-check-http-reachable-timeout: "_env:HEALTHCHECK_HTTP_REACHABLE_TIMEOUT:2"
health-check-matching-cluster-config-timeout: "_env:HEALTHCHECK_MATCHING_CLUSTER_CONFIG_TIMEOUT:2"
synchronise-ldap-users-within: "_env:SYNCHRONISE_LDAP_WITHIN:1209600"
synchronise-ldap-users-interval: "_env:SYNCHRONISE_LDAP_INTERVAL:3600"

View File

@ -26,8 +26,6 @@ import qualified Data.CaseInsensitive as CI
import qualified Network.HaskellNet.SMTP as SMTP
import Data.Pool (withResource)
import System.Timeout
import Jobs.Queue
import Control.Concurrent.Async.Lifted.Safe (forConcurrently)
@ -35,11 +33,12 @@ import Control.Concurrent.Async.Lifted.Safe (forConcurrently)
generateHealthReport :: HealthCheck -> Handler HealthReport
generateHealthReport = $(dispatchTH ''HealthCheck)
dispatchHealthCheckMatchingClusterConfig :: Handler HealthReport
-- ^ Can the cluster configuration be read from the database and does it match our configuration?
dispatchHealthCheckMatchingClusterConfig
= fmap HealthMatchingClusterConfig . runDB $ and <$> forM universeF clusterSettingMatches
= fmap HealthMatchingClusterConfig . yesodTimeout (^. _appHealthCheckMatchingClusterConfigTimeout) False . runDB $ and <$> forM universeF clusterSettingMatches
where
clusterSettingMatches ClusterCryptoIDKey = do
ourSetting <- getsYesod appCryptoIDKey
@ -75,7 +74,7 @@ dispatchHealthCheckMatchingClusterConfig
dispatchHealthCheckHTTPReachable :: Handler HealthReport
dispatchHealthCheckHTTPReachable = HealthHTTPReachable <$> do
dispatchHealthCheckHTTPReachable = fmap HealthHTTPReachable . yesodTimeout (^. _appHealthCheckHTTPReachableTimeout) (Just False) $ do
staticAppRoot <- getsYesod $ view _appRoot
doHTTP <- getsYesod $ view _appHealthCheckHTTP
for (staticAppRoot <* guard doHTTP) $ \_ -> do
@ -89,7 +88,7 @@ dispatchHealthCheckHTTPReachable = HealthHTTPReachable <$> do
dispatchHealthCheckLDAPAdmins :: Handler HealthReport
dispatchHealthCheckLDAPAdmins = HealthLDAPAdmins <$> do
dispatchHealthCheckLDAPAdmins = fmap HealthLDAPAdmins . yesodTimeout (^. _appHealthCheckLDAPAdminsTimeout) (Just 0) $ do
ldapPool' <- getsYesod appLdapPool
ldapConf' <- getsYesod $ view _appLdapConf
ldapAdminUsers <- fmap (map E.unValue) . runDB . E.select . E.from $ \(user `E.InnerJoin` lecturer) -> E.distinctOnOrderBy [E.asc $ user E.^. UserId] $ do
@ -110,7 +109,7 @@ dispatchHealthCheckLDAPAdmins = HealthLDAPAdmins <$> do
dispatchHealthCheckSMTPConnect :: Handler HealthReport
dispatchHealthCheckSMTPConnect = HealthSMTPConnect <$> do
dispatchHealthCheckSMTPConnect = fmap HealthSMTPConnect . yesodTimeout (^. _appHealthCheckSMTPConnectTimeout) (Just False) $ do
smtpPool <- getsYesod appSmtpPool
for smtpPool . flip withResource $ \smtpConn -> do
response@(rCode, _) <- liftIO $ SMTP.sendCommand smtpConn SMTP.NOOP
@ -122,7 +121,7 @@ dispatchHealthCheckSMTPConnect = HealthSMTPConnect <$> do
dispatchHealthCheckWidgetMemcached :: Handler HealthReport
dispatchHealthCheckWidgetMemcached = HealthWidgetMemcached <$> do
dispatchHealthCheckWidgetMemcached = fmap HealthWidgetMemcached . yesodTimeout (^. _appHealthCheckActiveWidgetMemcachedTimeout) (Just False) $ do
memcachedConn <- getsYesod appWidgetMemcached
for memcachedConn $ \_memcachedConn' -> do
let ext = "bin"
@ -155,11 +154,9 @@ dispatchHealthCheckActiveJobExecutors = HealthActiveJobExecutors <$> do
tid <- liftIO myThreadId
let workers' = Map.fromSet jobWorkerName (Map.keysSet jobWorkers)
workers = Map.filterWithKey (\a _ -> asyncThreadId a /= tid) workers'
timeoutMicro = let (MkFixed micro :: Micro) = realToFrac timeoutLength
in fromInteger micro
$logDebugS "HealthCheckActiveJobExecutors" . tshow . map showWorkerId $ Map.elems workers'
responders <- fmap (getSum . fold) . liftIO . forConcurrently (Map.toList workers) $ \(_, wName)
-> fromMaybe (Sum 0) <$> timeout timeoutMicro (runReaderT ?? app $ Sum 1 <$ writeJobCtlBlock' (writeJobCtl' wName) JobCtlTest)
-> diffTimeout timeoutLength (Sum 0) (runReaderT ?? app $ Sum 1 <$ writeJobCtlBlock' (writeJobCtl' wName) JobCtlTest)
if
| Map.null workers -> return Nothing
| otherwise -> return . Just $ responders % fromIntegral (Map.size workers)

View File

@ -116,7 +116,13 @@ data AppSettings = AppSettings
, appHealthCheckInterval :: HealthCheck -> Maybe NominalDiffTime
, appHealthCheckDelayNotify :: Bool
, appHealthCheckHTTP :: Bool
, appHealthCheckActiveJobExecutorsTimeout :: NominalDiffTime
, appHealthCheckActiveWidgetMemcachedTimeout :: NominalDiffTime
, appHealthCheckSMTPConnectTimeout :: NominalDiffTime
, appHealthCheckLDAPAdminsTimeout :: NominalDiffTime
, appHealthCheckHTTPReachableTimeout :: NominalDiffTime
, appHealthCheckMatchingClusterConfigTimeout :: NominalDiffTime
, appSynchroniseLdapUsersWithin :: Maybe NominalDiffTime
, appSynchroniseLdapUsersInterval :: NominalDiffTime
@ -398,7 +404,13 @@ instance FromJSON AppSettings where
appHealthCheckInterval <- (assertM' (> 0) . ) <$> o .: "health-check-interval"
appHealthCheckDelayNotify <- o .: "health-check-delay-notify"
appHealthCheckHTTP <- o .: "health-check-http"
appHealthCheckActiveJobExecutorsTimeout <- o .: "health-check-active-job-executors-timeout"
appHealthCheckActiveWidgetMemcachedTimeout <- o .: "health-check-active-widget-memcached-timeout"
appHealthCheckSMTPConnectTimeout <- o .: "health-check-smtp-connect-timeout"
appHealthCheckLDAPAdminsTimeout <- o .: "health-check-ldap-admins-timeout"
appHealthCheckHTTPReachableTimeout <- o .: "health-check-http-reachable-timeout"
appHealthCheckMatchingClusterConfigTimeout <- o .: "health-check-matching-cluster-config-timeout"
appSessionTimeout <- o .: "session-timeout"

View File

@ -52,6 +52,7 @@ import Control.Monad.Trans.Except (ExceptT(..), throwE, runExceptT)
import Control.Monad.Except (MonadError(..))
import Control.Monad.Trans.Maybe as Utils (MaybeT(..))
import Control.Monad.Catch (catchIf)
import System.Timeout.Lifted (timeout)
import Language.Haskell.TH
import Language.Haskell.TH.Instances ()
@ -682,6 +683,26 @@ mconcatForM = flip mconcatMapM
findM :: (Monad m, Foldable f) => (a -> MaybeT m b) -> f a -> m (Maybe b)
findM f = runMaybeT . Fold.foldr (\x as -> f x <|> as) mzero
yesodTimeout :: ( MonadHandler m
, MonadBaseControl IO m
)
=> (HandlerSite m -> NominalDiffTime) -- ^ Calculate timeout
-> a -- ^ Default value
-> m a -- ^ Computation
-> m a -- ^ Result of computation or default value, if timeout is reached
yesodTimeout getTimeout timeoutRes act = do
timeoutLength <- getsYesod getTimeout
diffTimeout timeoutLength timeoutRes act
diffTimeout :: MonadBaseControl IO m
=> NominalDiffTime -> a -> m a -> m a
diffTimeout timeoutLength timeoutRes act = fromMaybe timeoutRes <$> timeout timeoutMicro act
where
timeoutMicro
= let (MkFixed micro :: Micro) = realToFrac timeoutLength
in fromInteger micro
-------------
-- Conduit --
-------------