{-# LANGUAGE BangPatterns #-} module Jobs.Handler.Files ( dispatchJobPruneSessionFiles , dispatchJobPruneUnreferencedFiles , dispatchJobInjectFiles, dispatchJobRechunkFiles , dispatchJobDetectMissingFiles ) where import Import hiding (matching, maximumBy, init) import Database.Persist.Sql (deleteWhereCount) import qualified Database.Esqueleto as E import qualified Database.Esqueleto.PostgreSQL as E import qualified Database.Esqueleto.Utils as E import qualified Data.Conduit.Combinators as C import qualified Data.Conduit.List as C (mapMaybe, unfoldM) import Handler.Utils.Minio import qualified Network.Minio as Minio import Crypto.Hash (hashDigestSize, digestFromByteString) import Data.List ((!!), unfoldr, maximumBy, init, genericLength) import qualified Data.ByteString as ByteString import Data.Bits (Bits(shiftR)) import qualified Data.Map.Strict as Map import Control.Monad.Random.Lazy (evalRand, mkStdGen) import System.Random.Shuffle (shuffleM) import System.IO.Unsafe import Handler.Utils.Files (sourceFileDB) import Control.Monad.Logger (askLoggerIO, runLoggingT) import System.Clock import qualified Data.Set as Set import qualified Data.Sequence as Seq import Jobs.Queue (YesodJobDB) dispatchJobPruneSessionFiles :: JobHandler UniWorX dispatchJobPruneSessionFiles = JobHandlerAtomicWithFinalizer act fin where act = hoist lift $ do now <- liftIO getCurrentTime expires <- getsYesod $ view _appSessionFilesExpire deleteWhereCount [ SessionFileTouched <. addUTCTime (- expires) now ] fin n = $logInfoS "PruneSessionFiles" [st|Deleted #{n} expired session files|] fileReferences :: E.SqlExpr (E.Value FileContentReference) -> [E.SqlQuery ()] fileReferences (E.just -> fHash) = [ E.from $ \appFile -> E.where_ $ appFile E.^. CourseApplicationFileContent E.==. fHash , E.from $ \matFile -> E.where_ $ matFile E.^. MaterialFileContent E.==. fHash , E.from $ \newsFile -> E.where_ $ newsFile E.^. CourseNewsFileContent E.==. fHash , E.from $ \sheetFile -> E.where_ $ sheetFile E.^. SheetFileContent E.==. fHash , E.from $ \appInstr -> E.where_ $ appInstr E.^. CourseAppInstructionFileContent E.==. fHash , E.from $ \matching -> E.where_ $ E.just (matching E.^. AllocationMatchingLog) E.==. fHash , E.from $ \subFile -> E.where_ $ subFile E.^. SubmissionFileContent E.==. fHash , E.from $ \sessFile -> E.where_ $ sessFile E.^. SessionFileContent E.==. fHash , E.from $ \lock -> E.where_ $ E.just (lock E.^. FileLockContent) E.==. fHash , E.from $ \chunkLock -> E.where_ . E.exists . E.from $ \fileContentEntry -> E.where_ $ E.just (fileContentEntry E.^. FileContentEntryHash) E.==. fHash E.&&. chunkLock E.^. FileChunkLockHash E.==. E.subSelectForeign fileContentEntry FileContentEntryChunkHash (E.^. FileContentChunkHash) ] workflowFileReferences :: MonadResource m => ConduitT () FileContentReference (SqlPersistT m) () workflowFileReferences = mconcat [ E.selectSource (E.from $ pure . (E.^. WorkflowDefinitionGraph)) .| awaitForever (mapMOf_ (typesCustom @WorkflowChildren . _fileReferenceContent . _Just) yield . E.unValue) , E.selectSource (E.from $ pure . (E.^. WorkflowInstanceGraph )) .| awaitForever (mapMOf_ (typesCustom @WorkflowChildren . _fileReferenceContent . _Just) yield . E.unValue) , E.selectSource (E.from $ pure . (E.^. WorkflowWorkflowGraph )) .| awaitForever (mapMOf_ (typesCustom @WorkflowChildren . _fileReferenceContent . _Just) yield . E.unValue) , E.selectSource (E.from $ pure . (E.^. WorkflowWorkflowState )) .| awaitForever (mapMOf_ (typesCustom @WorkflowChildren . _fileReferenceContent . _Just) yield . E.unValue) ] dispatchJobDetectMissingFiles :: JobHandler UniWorX dispatchJobDetectMissingFiles = JobHandlerAtomicWithFinalizer act fin where act :: YesodJobDB UniWorX (Map Text (NonNull (Set FileContentReference))) act = hoist lift $ do uploadBucket <- getsYesod $ view _appUploadCacheBucket missingDb <- execWriterT $ do tellM . forM trackedReferences $ \refQuery -> fmap (Set.fromList . mapMaybe E.unValue) . E.select $ do ref <- refQuery E.where_ . E.not_ $ E.isNothing ref E.where_ . E.not_ . E.exists . E.from $ \fileContentEntry -> E.where_ $ E.just (fileContentEntry E.^. FileContentEntryHash) E.==. ref E.distinctOnOrderBy [E.asc ref] $ return ref tellM . fmap (Map.singleton "workflows") . runConduit $ workflowFileReferences .| C.foldMap Set.singleton let allMissingDb :: Set Minio.Object allMissingDb = setOf (folded . folded . re minioFileReference) missingDb filterMissingDb :: forall m. Monad m => Set Minio.Object -> ConduitT Minio.ListItem (Set Minio.Object) m () filterMissingDb remaining = maybeT (yield remaining) $ do nextMinio <- MaybeT await remaining' <- case nextMinio of Minio.ListItemObject oi -> do let (missingMinio, remaining') = Set.split (Minio.oiObject oi) remaining lift $ yield missingMinio return remaining' _other -> return remaining lift $ filterMissingDb remaining' allMissingMinio <- maybeT (return $ fold missingDb) . hoistMaybeM . runAppMinio . runMaybeT . runConduit $ transPipe lift (Minio.listObjects uploadBucket Nothing True) .| filterMissingDb allMissingDb .| C.foldMapE (setOf minioFileReference) return $ Map.mapMaybe (fromNullable . Set.intersection allMissingMinio) missingDb fin :: Map Text (NonNull (Set FileContentReference)) -> Handler () fin missingCounts = do forM_ (Map.keysSet trackedReferences) $ \refIdent -> observeMissingFiles refIdent . maybe 0 olength $ missingCounts Map.!? refIdent iforM_ missingCounts $ \refIdent missingFiles -> let missingRefs = unlines . map tshow . Set.toList $ toNullable missingFiles in $logErrorS "MissingFiles" [st|#{refIdent}: #{olength missingFiles}\n#{missingRefs}|] when (Map.null missingCounts) $ $logInfoS "MissingFiles" [st|No missing files|] trackedReferences = Map.fromList $ over (traverse . _1) nameToPathPiece [ (''CourseApplicationFile, E.from $ \appFile -> return $ appFile E.^. CourseApplicationFileContent ) , (''MaterialFile, E.from $ \matFile -> return $ matFile E.^. MaterialFileContent ) , (''CourseNewsFile, E.from $ \newsFile -> return $ newsFile E.^. CourseNewsFileContent ) , (''SheetFile, E.from $ \sheetFile -> return $ sheetFile E.^. SheetFileContent ) , (''CourseAppInstructionFile, E.from $ \appInstr -> return $ appInstr E.^. CourseAppInstructionFileContent) , (''SubmissionFile, E.from $ \subFile -> return $ subFile E.^. SubmissionFileContent ) , (''SessionFile, E.from $ \sessFile -> return $ sessFile E.^. SessionFileContent ) , (''AllocationMatching, E.from $ \matching -> return . E.just $ matching E.^. AllocationMatchingLog ) ] {-# NOINLINE pruneUnreferencedFilesIntervalsCache #-} pruneUnreferencedFilesIntervalsCache :: TVar (Map Natural [(Maybe FileContentChunkReference, Maybe FileContentChunkReference)]) pruneUnreferencedFilesIntervalsCache = unsafePerformIO $ newTVarIO Map.empty dispatchJobPruneUnreferencedFiles :: Natural -> Natural -> Natural -> JobHandler UniWorX dispatchJobPruneUnreferencedFiles numIterations epoch iteration = JobHandlerAtomicWithFinalizer act fin where act = hoist lift $ do now <- liftIO getCurrentTime interval <- getsYesod $ view _appPruneUnreferencedFilesInterval keep <- fmap (max 0) . getsYesod $ view _appKeepUnreferencedFiles let chunkHashBytes :: forall h. ( Unwrapped FileContentChunkReference ~ Digest h ) => Integer chunkHashBytes = fromIntegral (hashDigestSize (error "hashDigestSize inspected argument" :: h)) chunkHashBits = chunkHashBytes * 8 base :: Integer base = 2 ^ chunkHashBits intervals :: [Integer] -- | Exclusive upper bounds intervals | numIterations <= 0 = pure base | otherwise = go protoIntervals ^.. folded . _1 where go [] = [] go ints | maximumOf (folded . _1) ints == Just base = ints | otherwise = go $ lts ++ over _1 succ (over _2 (subtract $ toInteger numIterations) closest) : map (over _1 succ) gts where closest = maximumBy (comparing $ view _2) ints (lts, geqs) = partition (((>) `on` view _1) closest) ints gts = filter (((<) `on` view _1) closest) geqs -- | Exclusive upper bounds protoIntervals :: [(Integer, Integer)] protoIntervals = [ over _1 (i *) $ base `divMod` toInteger numIterations | i <- [1 .. toInteger numIterations] ] intervalsDgsts' = zipWith (curry . over both $ toDigest <=< assertM' (> 0)) (0 : init intervals) intervals toDigest :: Integer -> Maybe FileContentChunkReference toDigest = fmap (review _Wrapped) . digestFromByteString . pad . ByteString.pack . reverse . unfoldr step where step i | i <= 0 = Nothing | otherwise = Just (fromIntegral i, i `shiftR` 8) pad bs | toInteger (ByteString.length bs) >= chunkHashBytes = bs | otherwise = pad $ ByteString.cons 0 bs intervalsDgsts <- atomically $ do cachedDgsts <- readTVar pruneUnreferencedFilesIntervalsCache case Map.lookup numIterations cachedDgsts of Just c -> return c Nothing -> do modifyTVar' pruneUnreferencedFilesIntervalsCache $ force . Map.insert numIterations intervalsDgsts' return intervalsDgsts' let permIntervalsDgsts = shuffleM intervalsDgsts `evalRand` mkStdGen (hash epoch) (minBoundDgst, maxBoundDgst) = permIntervalsDgsts !! fromIntegral (toInteger iteration `mod` genericLength permIntervalsDgsts) chunkIdFilter :: E.SqlExpr (E.Value FileContentChunkReference) -> E.SqlExpr (E.Value Bool) chunkIdFilter cRef = E.and $ catMaybes [ minBoundDgst <&> \b -> cRef E.>=. E.val b , maxBoundDgst <&> \b -> cRef E.<. E.val b ] $logDebugS "PruneUnreferencedFiles" . tshow $ (minBoundDgst, maxBoundDgst) E.insertSelectWithConflict (UniqueFileContentChunkUnreferenced $ error "insertSelectWithConflict inspected constraint") (E.from $ \fileContentChunk -> do E.where_ . E.not_ . E.subSelectOr . E.from $ \fileContentEntry -> do E.where_ $ fileContentEntry E.^. FileContentEntryChunkHash E.==. fileContentChunk E.^. FileContentChunkId return $ E.any E.exists (fileReferences $ fileContentEntry E.^. FileContentEntryHash) E.where_ . chunkIdFilter $ fileContentChunk E.^. FileContentChunkHash return $ FileContentChunkUnreferenced E.<# (fileContentChunk E.^. FileContentChunkId) E.<&> E.val now ) (\current excluded -> [ FileContentChunkUnreferencedSince E.=. E.min (current E.^. FileContentChunkUnreferencedSince) (excluded E.^. FileContentChunkUnreferencedSince) ] ) E.delete . E.from $ \fileContentChunkUnreferenced -> do E.where_ . E.subSelectOr . E.from $ \fileContentEntry -> do E.where_ $ fileContentEntry E.^. FileContentEntryChunkHash E.==. fileContentChunkUnreferenced E.^. FileContentChunkUnreferencedHash return $ E.any E.exists (fileReferences $ fileContentEntry E.^. FileContentEntryHash) E.where_ . chunkIdFilter $ E.subSelectForeign fileContentChunkUnreferenced FileContentChunkUnreferencedHash (E.^. FileContentChunkHash) let unmarkWorkflowFiles (otoList -> fRefs) = E.delete . E.from $ \fileContentChunkUnreferenced -> do E.where_ . E.subSelectOr . E.from $ \fileContentEntry -> do E.where_ $ fileContentEntry E.^. FileContentEntryChunkHash E.==. fileContentChunkUnreferenced E.^. FileContentChunkUnreferencedHash return $ fileContentEntry E.^. FileContentEntryHash `E.in_` E.valList fRefs E.where_ . chunkIdFilter $ E.subSelectForeign fileContentChunkUnreferenced FileContentChunkUnreferencedHash (E.^. FileContentChunkHash) chunkSize = 100 in runConduit $ workflowFileReferences .| C.map Seq.singleton .| C.chunksOfE chunkSize .| C.mapM_ unmarkWorkflowFiles let getEntryCandidates = E.selectSource . E.from $ \fileContentEntry -> do let unreferencedSince = E.subSelectMaybe . E.from $ \(fileContentEntry' `E.InnerJoin` fileContentChunkUnreferenced) -> do E.on $ fileContentEntry' E.^. FileContentEntryChunkHash E.==. fileContentChunkUnreferenced E.^. FileContentChunkUnreferencedHash E.where_ $ fileContentEntry' E.^. FileContentEntryHash E.==. fileContentEntry E.^. FileContentEntryHash E.where_ . chunkIdFilter $ E.subSelectForeign fileContentChunkUnreferenced FileContentChunkUnreferencedHash (E.^. FileContentChunkHash) return . E.max_ $ fileContentChunkUnreferenced E.^. FileContentChunkUnreferencedSince E.where_ $ E.maybe E.false (E.<. E.val (addUTCTime (-keep) now)) unreferencedSince E.groupBy $ fileContentEntry E.^. FileContentEntryHash E.orderBy [ E.asc $ fileContentEntry E.^. FileContentEntryHash ] return $ fileContentEntry E.^. FileContentEntryHash deleteEntry :: _ -> DB (Sum Natural) deleteEntry (E.Value fRef) = bool 0 1 . (> 0) <$> deleteWhereCount [FileContentEntryHash ==. fRef] Sum deletedEntries <- runConduit $ getEntryCandidates .| takeWhileTime (interval / 3) .| C.mapM deleteEntry .| C.fold let getChunkCandidates = E.selectSource . E.from $ \fileContentChunkUnreferenced -> do E.where_ $ fileContentChunkUnreferenced E.^. FileContentChunkUnreferencedSince E.<. E.val (addUTCTime (-keep) now) E.where_ . E.not_ . E.exists . E.from $ \fileContentEntry -> E.where_ $ fileContentEntry E.^. FileContentEntryChunkHash E.==. fileContentChunkUnreferenced E.^. FileContentChunkUnreferencedHash E.where_ . chunkIdFilter $ E.subSelectForeign fileContentChunkUnreferenced FileContentChunkUnreferencedHash (E.^. FileContentChunkHash) return ( fileContentChunkUnreferenced E.^. FileContentChunkUnreferencedHash , E.subSelectForeign fileContentChunkUnreferenced FileContentChunkUnreferencedHash $ E.length_ . (E.^. FileContentChunkContent) ) deleteChunk :: _ -> DB (Sum Natural, Sum Word64) deleteChunk (E.Value cRef, E.Value size) = do deleteWhere [ FileContentChunkUnreferencedHash ==. cRef ] (, Sum size) . fromIntegral <$> deleteWhereCount [FileContentChunkHash ==. unFileContentChunkKey cRef] (Sum deletedChunks, Sum deletedChunkSize) <- runConduit $ getChunkCandidates .| takeWhileTime (interval / 3) .| persistentTokenBucketTakeC' TokenBucketPruneFiles (view $ _2 . _Value :: _ -> Word64) .| C.mapM deleteChunk .| C.fold return (deletedEntries, deletedChunks, deletedChunkSize) fin (deletedEntries, deletedChunks, deletedChunkSize) = do observeDeletedUnreferencedFiles deletedEntries $logInfoS "PruneUnreferencedFiles" [st|Deleted #{tshow deletedEntries} long-unreferenced files|] observeDeletedUnreferencedChunks deletedChunks deletedChunkSize $logInfoS "PruneUnreferencedFiles" [st|Deleted #{tshow deletedChunks} chunks (#{textBytes deletedChunkSize})|] dispatchJobInjectFiles :: JobHandler UniWorX dispatchJobInjectFiles = JobHandlerException . maybeT (return ()) $ do uploadBucket <- getsYesod $ view _appUploadCacheBucket interval <- getsYesod $ view _appInjectFiles let extractReference (Minio.ListItemObject oi) = (oi, ) <$> Minio.oiObject oi ^? minioFileReference extractReference _ = Nothing injectOrDelete :: (Minio.ObjectInfo, FileContentReference) -> Handler (Sum Natural, Sum Word64) injectOrDelete (objInfo, fRef) = do let obj = Minio.oiObject objInfo sz = fromIntegral $ Minio.oiSize objInfo fRef' <- runDB $ do chunkVar <- newEmptyTMVarIO dbAsync <- allocateLinkedAsync $ do atomically $ isEmptyTMVar chunkVar >>= guard . not sinkFileDB False $ C.unfoldM (\x -> fmap (, x) <$> atomically (takeTMVar chunkVar)) () logger <- askLoggerIO didSend <- maybeT (return False) . hoistMaybeM . runAppMinio . runMaybeT $ do objRes <- catchIfMaybeT minioIsDoesNotExist $ Minio.getObject uploadBucket obj Minio.defaultGetObjectOptions let sendChunks = go 0 0 Nothing =<< liftIO (getTime Monotonic) where go :: forall m. MonadIO m => Natural -> Int64 -> Maybe TimeSpec -> TimeSpec -> ConduitT ByteString Void m () go c accsz lastReport startT = do currT <- liftIO $ getTime Monotonic chunk' <- await whenIsJust chunk' $ \chunk -> do let csz = fromIntegral $ olength chunk !c' = succ c !sz' = accsz + csz !lastReport' | toRational currT - toRational (fromMaybe startT lastReport) > 5 = Just currT | otherwise = lastReport when (csz > 0) $ do let p :: Centi p = realToFrac $ (toInteger sz' % toInteger sz) * 100 eta :: Maybe Integer eta = do accsz' <- assertM' (/= 0) accsz return . ceiling $ (toRational currT - toRational startT) / fromIntegral accsz' * (fromIntegral sz - fromIntegral accsz) when (lastReport' /= lastReport || sz' >= fromIntegral sz) $ flip runLoggingT logger . $logInfoS "InjectFiles" . mconcat $ catMaybes [ pure [st|Sinking chunk ##{tshow c} (#{textBytes csz}): #{textBytes sz'}/#{textBytes sz} (#{tshow p}%)|] , eta <&> \eta' -> [st| ETA #{textDuration eta'}|] , pure "..." ] atomically . putTMVar chunkVar $ Just chunk go c' sz' lastReport' startT lift . runConduit $ Minio.gorObjectStream objRes .| sendChunks return True if | not didSend -> Nothing <$ cancel dbAsync | otherwise -> do atomically $ putTMVar chunkVar Nothing Just <$> waitAsync dbAsync let matchesFRef = is _Just $ assertM (== fRef) fRef' if | matchesFRef -> maybeT (return ()) . runAppMinio . handleIf minioIsDoesNotExist (const $ return ()) $ Minio.removeObject uploadBucket obj | otherwise -> $logErrorS "InjectFiles" [st|Minio object “#{obj}”'s content does not match it's name (content hash: #{tshow fRef'} /= name hash: #{tshow fRef})|] return . bool mempty (Sum 1, Sum sz) $ is _Just fRef' (Sum injectedFiles, Sum injectedSize) <- runConduit $ transPipe runAppMinio (Minio.listObjects uploadBucket Nothing True) .| C.mapMaybe extractReference .| maybe (C.map id) (takeWhileTime . (/ 2)) interval .| transPipe (lift . runDB . setSerializable) (persistentTokenBucketTakeC' TokenBucketInjectFiles $ views _1 Minio.oiSize) .| C.mapM (lift . injectOrDelete) .| C.mapM (\res@(Sum inj, Sum sz) -> res <$ observeInjectedFiles inj sz) .| C.fold $logInfoS "InjectFiles" [st|Injected #{tshow injectedFiles} files from upload cache into database (#{textBytes injectedSize})|] data RechunkFileException = RechunkFileExceptionHashMismatch { oldHash, newHash :: FileContentReference } deriving (Eq, Ord, Show, Generic, Typeable) deriving anyclass (Exception) dispatchJobRechunkFiles :: JobHandler UniWorX dispatchJobRechunkFiles = JobHandlerAtomicWithFinalizer act fin where act = hoist lift $ do interval <- getsYesod $ view _appRechunkFiles let getEntryCandidates = E.selectSource . E.from $ \fileContentEntry -> E.distinctOnOrderBy [E.asc $ fileContentEntry E.^. FileContentEntryHash] $ do E.where_ . E.exists . E.from $ \(fileContentEntry' `E.InnerJoin` fileContentChunk) -> do E.on $ fileContentChunk E.^. FileContentChunkId E.==. fileContentEntry' E.^. FileContentEntryChunkHash E.where_ $ fileContentEntry' E.^. FileContentEntryHash E.==. fileContentEntry E.^. FileContentEntryHash E.where_ . E.not_ $ fileContentChunk E.^. FileContentChunkContentBased let size = E.subSelectMaybe . E.from $ \(fileContentEntry' `E.InnerJoin` fileContentChunk) -> do E.on $ fileContentChunk E.^. FileContentChunkId E.==. fileContentEntry' E.^. FileContentEntryChunkHash E.where_ $ fileContentEntry' E.^. FileContentEntryHash E.==. fileContentEntry E.^. FileContentEntryHash return $ E.sum_ (E.length_ $ fileContentChunk E.^. FileContentChunkContent:: E.SqlExpr (E.Value Word64)) return ( fileContentEntry E.^. FileContentEntryHash , size ) rechunkFile :: FileContentReference -> Word64 -> DB (Sum Natural, Sum Word64) rechunkFile fRef sz = do fRef' <- sinkFileDB True $ sourceFileDB fRef unless (fRef == fRef') $ throwM $ RechunkFileExceptionHashMismatch fRef fRef' return (Sum 1, Sum sz) (Sum rechunkedFiles, Sum rechunkedSize) <- runConduit $ getEntryCandidates .| C.mapMaybe (\(E.Value fRef, E.Value sz) -> (fRef, ) <$> sz) .| maybe (C.map id) (takeWhileTime . (/ 2)) interval .| persistentTokenBucketTakeC' TokenBucketRechunkFiles (view _2 :: _ -> Word64) .| C.mapM (uncurry rechunkFile) .| C.fold return (rechunkedFiles, rechunkedSize) fin (rechunkedFiles, rechunkedSize) = do observeRechunkedFiles rechunkedFiles rechunkedSize $logInfoS "RechunkFiles" [st|Rechunked #{tshow rechunkedFiles} files in database (#{textBytes rechunkedSize} bytes)|]