Merge pull request #343 from commercialhaskell/b/disk-cleanup

Post Hoogle generation disk cleanup
Bryan Richter, 2025-03-18 10:45:03 +02:00, committed by GitHub
commit a384248d68
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
6 changed files with 302 additions and 175 deletions

View File

@@ -58,7 +58,7 @@ optsParser =
            T.unpack defHaddockBucketName)) <*>
     switch
       (long "do-not-upload" <>
-       help "Stop from hoogle db and snapshots.json from being generated and uploaded") <*>
+       help "Disable upload of Hoogle database and snapshots.json") <*>
     option
       readLogLevel
       (long "log-level" <> metavar "LOG_LEVEL" <> short 'l' <> value LevelInfo <>

View File

@@ -104,11 +104,15 @@ getStackageSnapshotsDir = do
 withResponseUnliftIO :: MonadUnliftIO m => Request -> Manager -> (Response BodyReader -> m b) -> m b
 withResponseUnliftIO req man f = withRunInIO $ \ runInIO -> withResponse req man (runInIO . f)
 
--- | Under the SingleRun wrapper that ensures only one thing at a time is
--- writing the file in question, ensure that a Hoogle database exists on the
--- filesystem for the given SnapName. But only going so far as downloading it
--- from the haddock bucket. See 'createHoogleDB' for the function that puts it
--- there in the first place.
+-- | Returns an action that, under the SingleRun wrapper that ensures only one
+-- thing at a time is writing the file in question, ensures that a Hoogle
+-- database exists on the filesystem for the given SnapName. But only going so
+-- far as downloading it from the haddock bucket. See 'buildAndUploadHoogleDBs' for the
+-- function that puts it there in the first place. If no db exists in the
+-- bucket, the action will return 'Nothing'.
+--
+-- The location searched is $PWD/hoogle/<snapshot>/<hoogle-version>.hoo
+-- E.g. in production: ~stackage-update/hoogle/lts-22.20/5.0.18.4.hoo.
 newHoogleLocker ::
      (HasLogFunc env, MonadIO m) => env -> Manager -> Text -> m (SingleRun SnapName (Maybe FilePath))
 newHoogleLocker env man bucketUrl = mkSingleRun hoogleLocker
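The SingleRun machinery referenced here is internal to this codebase. As a rough mental model, it behaves like a keyed "single-flight" guard: concurrent requests for the same key share one unit of work. A sketch of that idea (not the real SingleRun API), assuming IO and an Ord key:

    -- Illustrative re-implementation of the single-flight idea; names invented.
    import Control.Concurrent.MVar
    import qualified Data.Map.Strict as Map

    newtype SingleFlight k v = SingleFlight (MVar (Map.Map k (MVar v)))

    newSingleFlight :: IO (SingleFlight k v)
    newSingleFlight = SingleFlight <$> newMVar Map.empty

    runOnce :: Ord k => SingleFlight k v -> (k -> IO v) -> k -> IO v
    runOnce (SingleFlight ref) action key = do
      (slot, mine) <- modifyMVar ref $ \m ->
        case Map.lookup key m of
          Just slot -> pure (m, (slot, False))          -- someone else is working
          Nothing -> do
            slot <- newEmptyMVar
            pure (Map.insert key slot m, (slot, True))  -- we do the work
      if mine
        then do
          v <- action key
          putMVar slot v
          modifyMVar_ ref (pure . Map.delete key)
          pure v
        else readMVar slot                              -- wait for the worker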
@@ -232,38 +236,48 @@ runStackageUpdate doNotUpload = do
   corePackageGetters <- makeCorePackageGetters
   runResourceT $
     join $
+    -- @createOrUpdateSnapshot@ processes snapshot N while processing docs for
+    -- snapshot N-1. This @pure ()@ is the "documentation processing action"
+    -- for the -1'th snapshot.
     runConduit $ sourceSnapshots .| foldMC (createOrUpdateSnapshot corePackageGetters) (pure ())
   unless doNotUpload uploadSnapshotsJSON
-  buildAndUploadHoogleDB doNotUpload
+  buildAndUploadHoogleDBs doNotUpload
   logInfo "Finished building and uploading Hoogle DBs"
 
 -- | This will look at 'global-hints.yaml' and will create core package getters that are reused
--- later for adding those package to individual snapshot.
+-- later for adding those packages to individual snapshots.
 makeCorePackageGetters ::
      RIO StackageCron (Map CompilerP [CorePackageGetter])
 makeCorePackageGetters = do
   rootDir <- scStackageRoot <$> ask
   contentDir <- getStackageContentDir rootDir
-  coreCabalFiles <- getCoreCabalFiles rootDir
+  backupCoreCabalFiles <- getBackupCoreCabalFiles rootDir
   liftIO (decodeFileEither (contentDir </> "stack" </> "global-hints.yaml")) >>= \case
     Right (hints :: Map CompilerP (Map PackageNameP VersionP)) ->
       Map.traverseWithKey
         (\compiler ->
            fmap Map.elems .
-           Map.traverseMaybeWithKey (makeCorePackageGetter compiler coreCabalFiles))
+           Map.traverseMaybeWithKey (makeCorePackageGetter compiler backupCoreCabalFiles))
         hints
     Left exc -> do
      logError $
        "Error parsing 'global-hints.yaml' file: " <> fromString (displayException exc)
      return mempty
 
-getCoreCabalFiles ::
+-- | Packages distributed with GHC aren't taken from Hackage like normal
+-- packages. Release managers do upload them, however, so that their docs are
+-- available.
+--
+-- Or at least, they should. The release process was fragile, and some packages
+-- weren't uploaded. This mechanism gives us a chance to fill in missing
+-- packages.
+getBackupCoreCabalFiles ::
      FilePath
   -> RIO StackageCron (Map PackageIdentifierP (GenericPackageDescription, CabalFileIds))
-getCoreCabalFiles rootDir = do
-  coreCabalFilesDir <- getCoreCabalFilesDir rootDir
-  cabalFileNames <- getDirectoryContents coreCabalFilesDir
+getBackupCoreCabalFiles rootDir = do
+  backupCoreCabalFilesDir <- getBackupCoreCabalFilesDir rootDir
+  cabalFileNames <- getDirectoryContents backupCoreCabalFilesDir
   cabalFiles <-
     forM (filter (isExtensionOf ".cabal") cabalFileNames) $ \cabalFileName ->
       let pidTxt = T.pack (dropExtension (takeFileName cabalFileName))
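For reference, global-hints.yaml maps each compiler to the package versions it ships with, and decodeFileEither parses it straight into nested Maps. A simplified, runnable sketch using plain Text keys in place of the real CompilerP/PackageNameP/VersionP newtypes:

    {-# LANGUAGE ScopedTypeVariables #-}
    import Data.Map (Map)
    import Data.Text (Text)
    import Data.Yaml (decodeFileEither, prettyPrintParseException)

    main :: IO ()
    main = do
      result <- decodeFileEither "global-hints.yaml"
      case result of
        Right (hints :: Map Text (Map Text Text)) ->
          print (length hints)  -- number of compilers with hints
        Left exc -> putStrLn (prettyPrintParseException exc)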
@@ -272,15 +286,17 @@ getCoreCabalFiles rootDir = do
           logError $ "Invalid package identifier: " <> fromString cabalFileName
           pure Nothing
         Just pid -> do
-          cabalBlob <- readFileBinary (coreCabalFilesDir </> cabalFileName)
+          cabalBlob <- readFileBinary (backupCoreCabalFilesDir </> cabalFileName)
           mCabalInfo <- run $ addCabalFile pid cabalBlob
           pure ((,) pid <$> mCabalInfo)
   pure $ Map.fromList $ catMaybes cabalFiles
 
 -- | Core package info rarely changes between the snapshots, therefore it would be wasteful to
--- load, parse and update all packages from gloabl-hints for each snapshot, instead we produce
--- a memoized version that will do it once initiall and then return information aboat a
+-- load, parse and update all packages from global-hints for each snapshot. Instead we produce
+-- a memoized version that will do it once initially and then return information about a
 -- package on subsequent invocations.
+--
+-- FIXME: The compiler argument is unused (and never has been). Should it be used?
 makeCorePackageGetter ::
      CompilerP
   -> Map PackageIdentifierP (GenericPackageDescription, CabalFileIds)
@@ -309,6 +325,9 @@ makeCorePackageGetter _compiler fallbackCabalFileMap pname ver =
         Nothing -> do
           whenM (scReportProgress <$> ask) $
             logSticky $ "Loading core package: " <> display pid
+          -- I have no idea what's happening here. I guess I
+          -- don't know what it means to "load" a package.
+          -- What is actually going on?
           htr <- getHackageTarball pir Nothing
           case htrFreshPackageInfo htr of
             Just (gpd, treeId) -> do
@@ -336,64 +355,70 @@ makeCorePackageGetter _compiler fallbackCabalFileMap pname ver =
     PackageIdentifierRevision (unPackageNameP pname) (unVersionP ver) (CFIRevision (Revision 0))
 
--- TODO: for now it is only from hackage, PantryPackage needs an update to use other origins
--- | A pantry package is being added to a particular snapshot. Extra information like compiler and
--- flags are passed on in order to properly figure out dependencies and modules
+-- | Populates the database with information about a package?
+--
+-- Specifically, a pantry package is being added to a particular snapshot.
+--
+-- Extra information like compiler and flags are passed on in order to properly
+-- figure out dependencies and modules.
+--
+-- TODO: for now it is only from hackage. PantryPackage needs an update to use other origins
 addPantryPackage ::
      SnapshotId -> CompilerP -> Bool -> Map FlagNameP Bool -> PantryPackage -> RIO StackageCron Bool
-addPantryPackage sid compiler isHidden flags (PantryPackage pc treeKey) = do
+addPantryPackage snapId compiler isHidden flags (PantryPackage pcabal pTreeKey) = do
   env <- ask
-  let gpdCachedRef = scCachedGPD env
-      cache = scCacheCabalFiles env
+  let pkgDescCache = scCachedGPD env
+      cacheP = scCacheCabalFiles env
   let blobKeyToInt = fromIntegral . unSqlBackendKey . unBlobKey
-  let updateCacheGPD blobId gpd =
-        gpd `deepseq`
-        atomicModifyIORef' gpdCachedRef (\cacheMap -> (IntMap.insert blobId gpd cacheMap, gpd))
-  let getCachedGPD treeCabal =
+  let cachedPkgDesc cabalBlobId pkgDesc =
+        pkgDesc `deepseq`
+        atomicModifyIORef' pkgDescCache (\cacheMap -> (IntMap.insert cabalBlobId pkgDesc cacheMap, pkgDesc))
+  let getPkgDesc cabalBlobId =
         \case
-          Just gpd | cache -> updateCacheGPD (blobKeyToInt treeCabal) gpd
-          Just gpd -> pure gpd
-          Nothing | cache -> do
-            cacheMap <- readIORef gpdCachedRef
-            case IntMap.lookup (blobKeyToInt treeCabal) cacheMap of
-              Just gpd -> pure gpd
+          Just pkgDesc | cacheP -> cachedPkgDesc (blobKeyToInt cabalBlobId) pkgDesc
+          Just pkgDesc -> pure pkgDesc
+          Nothing | cacheP -> do
+            cacheMap <- readIORef pkgDescCache
+            case IntMap.lookup (blobKeyToInt cabalBlobId) cacheMap of
+              Just pkgDesc -> pure pkgDesc
               Nothing ->
-                loadBlobById treeCabal >>=
-                updateCacheGPD (blobKeyToInt treeCabal) . parseCabalBlob
-          Nothing -> parseCabalBlob <$> loadBlobById treeCabal
-  let storeHackageSnapshotPackage hcid mtid mgpd =
-        getTreeForKey treeKey >>= \case
-          Just (Entity treeId _)
-            | Just tid <- mtid
-            , tid /= treeId -> do
-              lift $ logError $ "Pantry Tree Key mismatch for: " <> display pc
+                loadBlobById cabalBlobId >>=
+                cachedPkgDesc (blobKeyToInt cabalBlobId) . parseCabalBlob
+          Nothing -> parseCabalBlob <$> loadBlobById cabalBlobId
+  let storeHackageSnapshotPackage hackageCabalId mTreeId mpkgDesc =
+        getTreeForKey pTreeKey >>= \case
+          -- error case #1
+          Just (Entity treeId' _)
+            | Just treeId <- mTreeId
+            , treeId /= treeId' -> do
+              lift $ logError $ "Pantry Tree Key mismatch for: " <> display pcabal
              pure False
-          Just tree@(Entity _ Tree {treeCabal})
-            | Just treeCabal' <- treeCabal -> do
-              gpd <- getCachedGPD treeCabal' mgpd
-              let mhcid = Just hcid
-                  eTree = Right tree
-              addSnapshotPackage sid compiler Hackage eTree mhcid isHidden flags pid gpd
+          -- happy case
+          Just pkgTree@(Entity _ Tree {treeCabal})
+            | Just cabalBlobId <- treeCabal -> do
+              pkgDesc <- getPkgDesc cabalBlobId mpkgDesc
+              addSnapshotPackage snapId compiler Hackage (Right pkgTree) (Just hackageCabalId) isHidden flags packageId pkgDesc
              pure True
+          -- error case #2
          _ -> do
-            lift $ logError $ "Pantry is missing the source tree for " <> display pc
+            lift $ logError $ "Pantry is missing the source tree for " <> display pcabal
            pure False
-  mHackageCabalInfo <- run $ getHackageCabalByKey pid (pcCabalKey pc)
+  mHackageCabalInfo <- run $ getHackageCabalByKey packageId (pcCabalKey pcabal)
  case mHackageCabalInfo of
    Nothing -> do
-      logError $ "Could not find the cabal file for: " <> display pc
+      logError $ "Could not find the cabal file for: " <> display pcabal
      pure False
-    Just (hcid, Nothing) -> do
+    Just (hackageCabalId, Nothing) -> do
      mHPI <-
        htrFreshPackageInfo <$>
-        getHackageTarball (toPackageIdentifierRevision pc) (Just treeKey)
+        getHackageTarball (toPackageIdentifierRevision pcabal) (Just pTreeKey)
      run $
        case mHPI of
-          Just (gpd, treeId) -> storeHackageSnapshotPackage hcid (Just treeId) (Just gpd)
-          Nothing -> storeHackageSnapshotPackage hcid Nothing Nothing
-    Just (hcid, mtid) -> run $ storeHackageSnapshotPackage hcid mtid Nothing
+          Just (pkgDesc, treeId) -> storeHackageSnapshotPackage hackageCabalId (Just treeId) (Just pkgDesc)
+          Nothing -> storeHackageSnapshotPackage hackageCabalId Nothing Nothing
+    Just (hackageCabalId, mTreeId) -> run $ storeHackageSnapshotPackage hackageCabalId mTreeId Nothing
  where
-    pid = PackageIdentifierP (pcPackageName pc) (pcVersion pc)
+    packageId = PackageIdentifierP (pcPackageName pcabal) (pcVersion pcabal)
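The getPkgDesc helper above is an instance of a common memoization idiom: an IORef holding an IntMap, updated with atomicModifyIORef'. Distilled into a standalone sketch (names invented):

    import Data.IORef
    import qualified Data.IntMap.Strict as IntMap

    memoize :: IORef (IntMap.IntMap v) -> Int -> IO v -> IO v
    memoize ref key compute = do
      cached <- IntMap.lookup key <$> readIORef ref
      case cached of
        Just v  -> pure v
        Nothing -> do
          v <- compute
          -- atomicModifyIORef' keeps the table consistent under concurrency;
          -- a racing thread may compute the value twice, but the result is the same.
          atomicModifyIORef' ref (\m -> (IntMap.insert key v m, v))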
@@ -404,33 +429,33 @@ checkForDocs :: SnapshotId -> SnapName -> ResourceT (RIO StackageCron) ()
 checkForDocs snapshotId snapName = do
   bucketName <- lift (scDownloadBucketName <$> ask)
   env <- asks scEnvAWS
-  mods <-
+  -- it is faster to download all modules in this snapshot separately, rather
+  -- than process them with a conduit all the way to the database.
+  packageModules <-
     runConduit $
-    paginate env (req bucketName) .| concatMapC (fromMaybe [] . (^. listObjectsV2Response_contents)) .|
-    mapC (\obj -> toText (obj ^. object_key)) .|
-    concatMapC (T.stripSuffix ".html" >=> T.stripPrefix prefix >=> pathToPackageModule) .|
-    sinkList
-  -- it is faster to download all modules in this snapshot, than process them with a conduit all
-  -- the way to the database.
+    paginate env (listSnapshotObjects bucketName)
+    .| concatMapC (fromMaybe [] . (^. listObjectsV2Response_contents))
+    .| mapC (\obj -> toText (obj ^. object_key))
+    .| concatMapC (T.stripSuffix ".html" >=> T.stripPrefix prefix >=> pathToPackageModule)
+    .| sinkList
+  -- Cache SnapshotPackageId rather than look it up many times for each module in the package.
   sidsCacheRef <- newIORef Map.empty
-  -- Cache is for SnapshotPackageId, there will be many modules per peckage, no need to look into
-  -- the database for each one of them.
+  -- The other half of the cores are used in 'updateSnapshot'.
   n <- max 1 . (`div` 2) <$> getNumCapabilities
-  unexpectedPackages <- lift $ pooledMapConcurrentlyN n (markModules sidsCacheRef) mods
+  unexpectedPackages <- lift $ pooledMapConcurrentlyN n (markModule sidsCacheRef) packageModules
   forM_ (Set.fromList $ catMaybes unexpectedPackages) $ \pid ->
-    lift $
-    logWarn $
-    "Documentation found for package '" <> display pid <>
-    "', which does not exist in this snapshot: " <>
+    lift $ logWarn $
+      "Documentation found for package '" <> display pid <>
+      "', which does not exist in this snapshot: " <>
      display snapName
   where
     prefix = textDisplay snapName <> "/"
-    req bucketName = newListObjectsV2 (BucketName bucketName) & listObjectsV2_prefix ?~ prefix
+    listSnapshotObjects bucketName = newListObjectsV2 (BucketName bucketName) & listObjectsV2_prefix ?~ prefix
     -- | This function records all package modules that have documentation available, the ones
     -- that are not found in the snapshot reported back as an error. Besides being run
     -- concurrently this function optimizes the SnapshotPackageId lookup as well, since that can
     -- be shared amongst many modules of one package.
-    markModules sidsCacheRef (pid, modName) = do
+    markModule sidsCacheRef (pid, modName) = do
       sidsCache <- readIORef sidsCacheRef
       let mSnapshotPackageId = Map.lookup pid sidsCache
       mFound <- run $ markModuleHasDocs snapshotId pid mSnapshotPackageId modName
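The conduit above reduces each S3 key to a (package, module) pair by Kleisli composition of Maybe-returning steps. A self-contained sketch of that parsing pipeline, with the domain types simplified to Text:

    {-# LANGUAGE OverloadedStrings #-}
    import Control.Monad ((>=>))
    import Data.Text (Text)
    import qualified Data.Text as T

    parseKey :: Text -> Text -> Maybe (Text, Text)
    parseKey prefix =
      T.stripSuffix ".html" >=> T.stripPrefix prefix >=> toPackageModule
      where
        -- Haddock paths spell module dots as dashes, e.g. ACE-Combinators.
        toPackageModule txt =
          case T.split (== '/') txt of
            [pkgId, moduleDashes] ->
              Just (pkgId, T.replace "-" "." moduleDashes)
            _ -> Nothing

    -- parseKey "lts-22.20/" "lts-22.20/ace-0.6/ACE-Combinators.html"
    --   == Just ("ace-0.6", "ACE.Combinators")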
@@ -456,8 +481,7 @@ sourceSnapshots :: ConduitT a SnapshotFileInfo (ResourceT (RIO StackageCron)) ()
 sourceSnapshots = do
   snapshotsDir <- lift $ lift getStackageSnapshotsDir
   sourceDirectoryDeep False (snapshotsDir </> "lts") .| concatMapMC (getLtsParser snapshotsDir)
-  sourceDirectoryDeep False (snapshotsDir </> "nightly") .|
-    concatMapMC (getNightlyParser snapshotsDir)
+  sourceDirectoryDeep False (snapshotsDir </> "nightly") .| concatMapMC (getNightlyParser snapshotsDir)
   where
     makeSnapshotFileInfo gitDir fp mFileNameDate snapName = do
       let parseSnapshot updatedOn = do
@@ -500,39 +524,52 @@ sourceSnapshots = do
           return Nothing
 
--- | Creates a new `Snapshot` if it is not yet present in the database and decides if update
+data DecisionResult a e = NothingToDo | NoSnapshotFile | NeedsUpdate a e | DoesntExist e
+
+-- | Creates a new `Snapshot` if it is not yet present in the database, and decides if update
 -- is necessary when it already exists.
+--
+-- TODO: Silently ignoring snapshots where the getter returns Nothing seems like
+-- a potential problem. Anyway I'd rather run it beforehand!
 decideOnSnapshotUpdate :: SnapshotFileInfo -> RIO StackageCron (Maybe (SnapshotId, SnapshotFile))
 decideOnSnapshotUpdate SnapshotFileInfo {sfiSnapName, sfiUpdatedOn, sfiSnapshotFileGetter} = do
   forceUpdate <- scForceFullUpdate <$> ask
   let mkLogMsg rest = "Snapshot with name: " <> display sfiSnapName <> " " <> rest
   mKeySnapFile <-
     run (getBy (UniqueSnapshot sfiSnapName)) >>= \case
+      -- exists, up to date, no force-update requested; nothing to do
       Just (Entity _key snap)
-        | snapshotUpdatedOn snap == Just sfiUpdatedOn && not forceUpdate -> do
-          logInfo $ mkLogMsg "already exists and is up to date."
-          return Nothing
+        | snapshotUpdatedOn snap == Just sfiUpdatedOn && not forceUpdate ->
+          return NothingToDo
+      -- exists, but updatedOn was not previously set.
       Just entity@(Entity _key snap)
         | Nothing <- snapshotUpdatedOn snap -> do
           logWarn $ mkLogMsg "did not finish updating last time."
-          fmap (Just entity, ) <$> sfiSnapshotFileGetter
+          maybe NoSnapshotFile (NeedsUpdate entity) <$> sfiSnapshotFileGetter
+      -- exists, but updatedOn does not match or force-update was requested.
       Just entity -> do
         unless forceUpdate $ logWarn $ mkLogMsg "was updated, applying new patch."
-        fmap (Just entity, ) <$> sfiSnapshotFileGetter
-      Nothing -> fmap (Nothing, ) <$> sfiSnapshotFileGetter
+        maybe NoSnapshotFile (NeedsUpdate entity) <$> sfiSnapshotFileGetter
+      -- does not exist
+      Nothing -> maybe NoSnapshotFile DoesntExist <$> sfiSnapshotFileGetter
   -- Add new snapshot to the database, when necessary
   case mKeySnapFile of
-    Just (Just (Entity snapKey snap), sf@SnapshotFile {sfCompiler, sfPublishDate})
+    NothingToDo -> Nothing <$ logInfo (mkLogMsg "already exists and is up to date.")
+    NoSnapshotFile -> Nothing <$ logWarn (mkLogMsg "has no (readable?) snapshot file.")
+    NeedsUpdate (Entity oldSnapKey oldSnap) sf@SnapshotFile {sfCompiler, sfPublishDate}
       | Just publishDate <- sfPublishDate -> do
         let updatedSnap =
-              Snapshot sfiSnapName sfCompiler publishDate (snapshotUpdatedOn snap)
-        run $ replace snapKey updatedSnap
-        pure $ Just (snapKey, sf)
-    Just (Nothing, sf@SnapshotFile {sfCompiler, sfPublishDate})
-      | Just publishDate <- sfPublishDate ->
+              Snapshot sfiSnapName sfCompiler publishDate (snapshotUpdatedOn oldSnap)
+        run $ replace oldSnapKey updatedSnap
+        pure $ Just (oldSnapKey, sf)
+      | otherwise -> return Nothing
+    DoesntExist sf@SnapshotFile {sfCompiler, sfPublishDate}
+      | Just publishDate <- sfPublishDate -> do
+        logInfo $ mkLogMsg "is new, adding to the database."
         fmap (, sf) <$>
           run (insertUnique (Snapshot sfiSnapName sfCompiler publishDate Nothing))
-    _ -> return Nothing
+      | otherwise -> Nothing <$ logWarn (mkLogMsg "has no publish date, skipping.")
 
 type CorePackageGetter
    = RIO StackageCron ( Either CabalFileIds (Entity Tree)
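The new DecisionResult type replaces the previous Maybe-of-tuple encoding, which had to cram four distinct outcomes into nested Maybes. A toy illustration of why named constructors help the consumer (all names besides DecisionResult's constructors are invented):

    data DecisionResult a e
      = NothingToDo
      | NoSnapshotFile
      | NeedsUpdate a e
      | DoesntExist e
      deriving Show

    describe :: DecisionResult Int String -> String
    describe NothingToDo       = "up to date"
    describe NoSnapshotFile    = "snapshot file missing or unreadable"
    describe (NeedsUpdate k f) = "update snapshot " ++ show k ++ " from " ++ f
    describe (DoesntExist f)   = "create a new snapshot from " ++ f

    main :: IO ()
    main = mapM_ (putStrLn . describe)
      [NothingToDo, NoSnapshotFile, NeedsUpdate 1 "lts-22.20.yaml", DoesntExist "nightly.yaml"]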
@@ -545,14 +582,38 @@ type CorePackageGetter
 -- current snapshot as well as an action that was passed as an argument. At the end it will return
 -- an action that should be invoked in order to mark modules that have documentation available,
 -- which in turn can be passed as an argument to the next snapshot loader.
+-- Something something ouroboros.
+--
+-- Question: When do the docs for the last snapshot get loaded?
+--
+-- Well, this binding is called as @join $ runConduit $ foldMC (createOrUpdateSnapshot corePackageInfoGetters) (pure ())@
+--
+-- So the answer: the doc-loading action for the last snapshot gets returned by @runConduit $ foldMC ...@,
+-- which means it gets executed by @join $ runConduit $ foldMC ...@.
+--
+-- Evidence:
+--
+-- Since @foldMC :: (a -> b -> m a) -> a -> ConduitT b o m a@, we see
+--
+-- @@
+-- a ~ ResourceT (RIO StackageCron) () -- this is the doc-loading action
+-- b ~ SnapshotFileInfo
+-- m ~ ResourceT (RIO StackageCron)
+-- @@
+-- and the foldMC creates a @ConduitT SnapshotFileInfo o (ResourceT (RIO StackageCron)) (ResourceT (RIO StackageCron) ())@.
+--
+-- TODO: It might be more efficient to just put all the actions (snapshot
+-- creation and documentation writing both) on a queue and let a bunch of
+-- workers chew on it. The current impl creates arbitrary synchronization points
+-- with 'runConcurrently'. Granted, I don't know what a good chunk size would
+-- actually be.
 createOrUpdateSnapshot ::
      Map CompilerP [CorePackageGetter]
   -> ResourceT (RIO StackageCron) ()
   -> SnapshotFileInfo
   -> ResourceT (RIO StackageCron) (ResourceT (RIO StackageCron) ())
-createOrUpdateSnapshot corePackageInfoGetters prevAction sfi@SnapshotFileInfo { sfiSnapName
-                                                                              , sfiUpdatedOn
-                                                                              } = do
+createOrUpdateSnapshot corePackageInfoGetters prevAction sfi@SnapshotFileInfo { sfiSnapName , sfiUpdatedOn } = do
   finishedDocs <- newIORef False
   runConcurrently
     (Concurrently (prevAction >> writeIORef finishedDocs True) *>
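The ouroboros comment above can be checked with a small runnable toy: each fold step first runs the follow-up action it was handed, then returns its own follow-up, and the final follow-up is executed by the leading join. A sketch, assuming the conduit package:

    import Conduit
    import Control.Monad (join)

    step :: IO () -> Int -> IO (IO ())
    step prevFollowUp n = do
      prevFollowUp                                   -- run element (n-1)'s follow-up
      putStrLn ("processing " ++ show n)
      pure (putStrLn ("follow-up for " ++ show n))   -- handed to the next step

    main :: IO ()
    main = join $ runConduit $ yieldMany [1 .. 3 :: Int] .| foldMC step (pure ())

Running it interleaves "processing N" with "follow-up for N-1", and the last follow-up prints only because of the outer join.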
@@ -561,6 +622,7 @@ createOrUpdateSnapshot corePackageInfoGetters prevAction sfi@SnapshotFileInfo {
     loadCurrentSnapshot finishedDocs = do
       loadDocs <-
         decideOnSnapshotUpdate sfi >>= \case
+          -- Nothing to do, and thus no docs to process
           Nothing -> return $ pure ()
           Just (snapshotId, snapshotFile) ->
             updateSnapshot
@@ -575,22 +637,30 @@ createOrUpdateSnapshot corePackageInfoGetters prevAction sfi@SnapshotFileInfo {
         logSticky "Still loading the docs for previous snapshot ..."
       pure loadDocs
 
--- | Updates all packages in the snapshot. If any missing they will be created. Returns an action
--- that will check for available documentation for modules that are known to exist and mark as
--- documented when haddock is present on AWS S3. Only after documentation has been checked this
--- snapshot will be marked as completely updated. This is required in case something goes wrong and
--- process is interrupted
+-- | Creates the Lts or Nightly entity [Question(bryan): Why not do this when
+-- creating the snapshot? Why is this a separate table anyway?] and updates all
+-- packages in the snapshot. If any packages are missing they will be created.
+-- Returns an action that will (a) check for available documentation for the
+-- packages' modules and (b) mark the packages as documented when haddock is
+-- present on AWS S3.
+--
+-- (Only after documentation has been checked will this snapshot be marked as
+-- completely updated. This is required in case something goes wrong and the
+-- process is interrupted.)
 updateSnapshot ::
      Map CompilerP [CorePackageGetter]
   -> SnapshotId
   -> SnapName
   -> UTCTime
   -> SnapshotFile
-  -> RIO StackageCron (ResourceT (RIO StackageCron) ())
+  -> RIO StackageCron (ResourceT (RIO StackageCron) ()) -- ^ Returns the action for processing docs
 updateSnapshot corePackageGetters snapshotId snapName updatedOn SnapshotFile {..} = do
   insertSnapshotName snapshotId snapName
   loadedPackageCountRef <- newIORef (0 :: Int)
   let totalPackages = length sfPackages
+      -- A wrapper for 'addPantryPackage' that extracts the package info from
+      -- snapshot info, increments the count of loaded packages, and reports
+      -- success as a Bool.
       addPantryPackageWithReport pp = do
         let PantryCabal {pcPackageName} = ppPantryCabal pp
             isHidden = fromMaybe False (Map.lookup pcPackageName sfHidden)
@@ -695,6 +765,9 @@ uploadHoogleDB fp key =
     body <- toBody <$> readFileBinary fpgz
     uploadBucket <- scUploadBucketName <$> ask
     uploadFromRIO key $
+      -- FIXME: I should also set content encoding explicitly here. But
+      -- then I would break stackage-server, which applies an 'ungzip' in
+      -- 'newHoogleLocker'. :(
       set putObject_acl (Just ObjectCannedACL_Public_read) $ newPutObject (BucketName uploadBucket) key body
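For the FIXME above: with amazonka-s3 2.x, setting the encoding explicitly would presumably look like the sketch below. This is untested illustration, not PR code, and as the comment notes it would also require dropping the ungzip on the stackage-server side.

    -- Sketch; lens names are as published in amazonka-s3 2.x.
    import Amazonka (RequestBody)
    import Amazonka.S3
    import Control.Lens ((&), (?~))

    putObjectGzipped :: BucketName -> ObjectKey -> RequestBody -> PutObject
    putObjectGzipped bucket key body =
      newPutObject bucket key body
        & putObject_acl ?~ ObjectCannedACL_Public_read
        & putObject_contentEncoding ?~ "gzip"  -- tell clients the payload is gzip-compressed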
@@ -708,17 +781,31 @@ uploadFromRIO key po = do
       logError $ "Couldn't upload " <> displayShow key <> " to S3 becuase " <> displayShow e
     Right _ -> logInfo $ "Successfully uploaded " <> displayShow key <> " to S3"
 
-buildAndUploadHoogleDB :: Bool -> RIO StackageCron ()
-buildAndUploadHoogleDB doNotUpload = do
+-- | As the name says, build and upload Hoogle DBs.
+--
+-- Which DBs? The last 5 LTS and the last 5 Nightlies that are missing their
+-- Hoogle DBs.
+--
+-- How? It downloads the Hoogle inputs that were previously generated alongside
+-- the Haddocks, runs @hoogle@ on them, and uploads the result back to the same
+-- bucket. Those inputs were generated by snapshot curation.
+--
+-- Why? I feel like this should be a short Bash script using curl and hoogle, and
+-- maybe one day it will be.
+--
+-- This action is only run by stackage-server-cron.
+buildAndUploadHoogleDBs :: Bool -> RIO StackageCron ()
+buildAndUploadHoogleDBs doNotUpload = do
   snapshots <- lastLtsNightlyWithoutHoogleDb 5 5
+  -- currentHoogleVersionId <- scHoogleVersionId <$> ask
   env <- ask
   awsEnv <- asks scEnvAWS
   bucketUrl <- asks scDownloadBucketUrl
   -- locker is an action that returns the path to a hoogle db, if one exists
-  -- in the haddock bucket already.
+  -- in the haddock bucket already. It takes the SnapName as an argument.
+  -- I think it might be overkill.
   locker <- newHoogleLocker (env ^. logFuncL) (awsEnv ^. env_manager) bucketUrl
-  let insertH = checkInsertSnapshotHoogleDb True
+  let -- These bindings undo a questionable conflation of operations
+      insertH = checkInsertSnapshotHoogleDb True
       checkH = checkInsertSnapshotHoogleDb False
   for_ snapshots $ \(snapshotId, snapName) ->
     -- Even though we just got a list of snapshots that don't have hoogle
@@ -727,72 +814,82 @@ buildAndUploadHoogleDBs doNotUpload = do
     -- checkInsertSnapshotHoogleDb just check against SnapshotHoogleDb.
     -- Perhaps the check can be removed.
     unlessM (checkH snapshotId) $ do
+      withSystemTempDirectory ("hoogle-" ++ T.unpack (textDisplay snapName)) $ \tmpdir -> do
         logInfo $ "Starting Hoogle database download: " <> display (hoogleKey snapName)
+        -- Check if the database already exists (by downloading it).
+        -- FIXME: Why not just send a HEAD?
+        -- Perhaps the idea was to put the hoogle database somewhere the
+        -- main Stackage server process can find it? But nowadays
+        -- stackage-server downloads its own version separately.
         mfp <- singleRun locker snapName
         case mfp of
-          Just _ -> do
-            logInfo $ "Current hoogle database exists for: " <> display snapName
+          Just fp -> do
+            -- Something bad must have happened: we created the hoogle db
+            -- previously, but didn't get to record it as available.
+            logWarn $ "Unregistered hoogle database found for: " <> display snapName
+                      <> ". Registering now."
             void $ insertH snapshotId
+            -- FIXME: For now we need to delete this file we just
+            -- downloaded. We probably shouldn't download it in the
+            -- first place, though (just use HEAD).
+            liftIO $ removeFile fp
           Nothing -> do
             logInfo $ "Current hoogle database does not yet exist in the bucket for: " <> display snapName
-            mfp' <- createHoogleDB snapshotId snapName
-            forM_ mfp' $ \fp -> do
-              let key = hoogleKey snapName
-                  dest = T.unpack key
-              createDirectoryIfMissing True $ takeDirectory dest
-              renamePath fp dest
-              unless doNotUpload $ do
-                uploadHoogleDB dest (ObjectKey key)
-                void $ insertH snapshotId
+            -- NB: createHoogleDB will fail if something goes wrong.
+            fp <- createHoogleDB tmpdir snapshotId snapName
+            let key = hoogleKey snapName
+            unless doNotUpload $ do
+              uploadHoogleDB fp (ObjectKey key)
+              void $ insertH snapshotId
 
--- | Create a hoogle db from haddocks for the given snapshot, and upload it to
--- the haddock bucket.
-createHoogleDB :: SnapshotId -> SnapName -> RIO StackageCron (Maybe FilePath)
-createHoogleDB snapshotId snapName =
-  handleAny logException $ do
-    logInfo $ "Creating Hoogle DB for " <> display snapName
-    downloadBucketUrl <- scDownloadBucketUrl <$> ask
-    let root = "hoogle-gen"
-        bindir = root </> "bindir"
-        outname = root </> "output.hoo"
-        tarKey = toPathPiece snapName <> "/hoogle/orig.tar"
-        tarUrl = downloadBucketUrl <> "/" <> tarKey
-        tarFP = root </> T.unpack tarKey
-    -- When tarball is downloaded it is saved with durability and atomicity, so if it
-    -- is present it is not in a corrupted state
-    unlessM (doesFileExist tarFP) $ do
-      req <- parseRequest $ T.unpack tarUrl
-      env <- asks scEnvAWS
-      let man = env ^. env_manager
-      withResponseUnliftIO req {decompress = const True} man $ \res -> do
-        throwErrorStatusCodes req res
-        createDirectoryIfMissing True $ takeDirectory tarFP
-        withBinaryFileDurableAtomic tarFP WriteMode $ \tarHandle ->
-          runConduitRes $ bodyReaderSource (responseBody res) .| sinkHandle tarHandle
-    void $ tryIO $ removeDirectoryRecursive bindir
-    void $ tryIO $ removeFile outname
-    createDirectoryIfMissing True bindir
+-- | Create a hoogle db from haddocks for the given snapshot.
+--
+-- Haddocks are downloaded from the documentation bucket, where they were
+-- uploaded as a tar file.
+--
+-- Returns the path to the .hoo database, which will be found in the first
+-- argument. It will look like @<rootDir>/hoogle-gen/output.hoo@.
+createHoogleDB :: FilePath -> SnapshotId -> SnapName -> RIO StackageCron FilePath
+createHoogleDB rootDir snapshotId snapName = do
+  logInfo $ "Creating Hoogle DB for " <> display snapName
+  downloadBucketUrl <- scDownloadBucketUrl <$> ask
+  let root = rootDir </> "hoogle-gen"
+      outname = root </> "output.hoo"
+      inputTarKey = toPathPiece snapName <> "/hoogle/orig.tar"
+      inputTarUrl = downloadBucketUrl <> "/" <> inputTarKey
+      outputTarFP = root </> T.unpack inputTarKey
+  -- Fetch the tarball with Hoogle inputs
+  req <- parseRequest $ T.unpack inputTarUrl
+  env <- asks scEnvAWS
+  let man = env ^. env_manager
+  withResponseUnliftIO req {decompress = const True} man $ \res -> do
+    throwErrorStatusCodes req res
+    createDirectoryIfMissing True $ takeDirectory outputTarFP
+    withBinaryFileDurableAtomic outputTarFP WriteMode $ \tarHandle ->
+      runConduitRes $ bodyReaderSource (responseBody res) .| sinkHandle tarHandle
+  -- Extract the Hoogle inputs from the tarball into a separate temp dir, then
+  -- generate the hoogle database.
   withSystemTempDirectory ("hoogle-" ++ T.unpack (textDisplay snapName)) $ \tmpdir -> do
     Any hasRestored <-
       runConduitRes $
-      sourceFile tarFP .|
+      sourceFile outputTarFP .|
       untar (restoreHoogleTxtFileWithCabal tmpdir snapshotId snapName) .|
       foldMapC Any
+    -- We just check if we have any Hoogle .txt file at all.
     unless hasRestored $ error "No Hoogle .txt files found"
+    -- Generate the hoogle database
     let args = ["generate", "--database=" ++ outname, "--local=" ++ tmpdir]
     logInfo $
       mconcat
        [ "Merging databases... ("
        , foldMap fromString $ L.intersperse " " ("hoogle" : args)
        , ")"
        ]
+    -- 'Hoogle.hoogle' expects to run as an app, and crashes if something
+    -- goes wrong. That's good.
     liftIO $ Hoogle.hoogle args
     logInfo "Merge done"
-    return $ Just outname
-  where
-    logException exc =
-      logError ("Problem creating hoogle db for " <> display snapName <> ": " <> displayShow exc) $>
-      Nothing
+    pure outname
 
 -- | Grabs hoogle txt file from the tarball and a matching cabal file from pantry. Writes
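On the repeated "just use HEAD" FIXMEs above: an existence probe without downloading the body could presumably be as small as this http-client sketch (illustrative URL handling, not PR code):

    import Network.HTTP.Client
    import Network.HTTP.Types.Status (statusCode)

    hoogleDbExists :: Manager -> String -> IO Bool
    hoogleDbExists man url = do
      req <- parseRequest url
      -- httpNoBody discards the response body, so nothing is downloaded.
      res <- httpNoBody req { method = "HEAD" } man
      pure (statusCode (responseStatus res) == 200)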
@@ -824,7 +921,10 @@ restoreHoogleTxtFileWithCabal tmpdir snapshotId snapName fileInfo =
         _ -> yield False
 
-pathToPackageModule :: Text -> Maybe (PackageIdentifierP, ModuleNameP)
+pathToPackageModule
+  :: Text
+  -- ^ Input is like @ace-0.6/ACE-Combinators@
+  -> Maybe (PackageIdentifierP, ModuleNameP)
 pathToPackageModule txt =
   case T.split (== '/') txt of
     [pkgIdentifier, moduleNameDashes] -> do

View File

@@ -4,7 +4,7 @@ module Stackage.Database.Github
   ( cloneOrUpdate
   , lastGitFileUpdate
   , getStackageContentDir
-  , getCoreCabalFilesDir
+  , getBackupCoreCabalFilesDir
   , GithubRepo(..)
   ) where
@@ -81,9 +81,9 @@ getStackageContentDir rootDir =
   cloneOrUpdate rootDir (GithubRepo "commercialhaskell" "stackage-content")
 
 -- | Use backup location with cabal files, hackage doesn't have all of them.
-getCoreCabalFilesDir ::
+getBackupCoreCabalFilesDir ::
      (MonadReader env m, HasLogFunc env, HasProcessContext env, MonadIO m)
   => FilePath
   -> m FilePath
-getCoreCabalFilesDir rootDir =
+getBackupCoreCabalFilesDir rootDir =
   cloneOrUpdate rootDir (GithubRepo "commercialhaskell" "core-cabal-files")

View File

@@ -173,7 +173,7 @@ ltsBefore x y = do
 lastLtsNightlyWithoutHoogleDb :: Int -> Int -> RIO StackageCron [(SnapshotId, SnapName)]
 lastLtsNightlyWithoutHoogleDb ltsCount nightlyCount = do
   currentHoogleVersionId <- scHoogleVersionId <$> ask
-  let getSnapshotsWithoutHoogeDb snapId snapCount =
+  let getSnapshotsWithoutHoogleDb snapId snapCount =
         map (unValue *** unValue) <$>
         select
           -- "snap" is either Lts or Nightly, while "snapshot" is indeed
@@ -206,12 +206,12 @@ lastLtsNightlyWithoutHoogleDb ltsCount nightlyCount = do
           --   order by snapshot.created desc
           --   limit $snapCount
           --
-          -- So it returns a list of snapshots where there is no
+          -- So it returns a limited list of snapshots where there is no
           -- corresponding entry in the snapshot_hoogle_db table for the
           -- current hoogle version.
   run $ do
-    lts <- getSnapshotsWithoutHoogeDb LtsSnap ltsCount
-    nightly <- getSnapshotsWithoutHoogeDb NightlySnap nightlyCount
+    lts <- getSnapshotsWithoutHoogleDb LtsSnap ltsCount
+    nightly <- getSnapshotsWithoutHoogleDb NightlySnap nightlyCount
     pure $ lts ++ nightly
@@ -1100,6 +1100,8 @@ getHackageCabalByKey (PackageIdentifierP pname ver) (BlobKey sha size) =
         return (hc ^. HackageCabalId, hc ^. HackageCabalTree)
 
+-- | Gets the id for the SnapshotPackage that corresponds to the given Snapshot
+-- and PackageIdentifier.
 getSnapshotPackageId ::
      SnapshotId
   -> PackageIdentifierP
@@ -1114,6 +1116,18 @@ getSnapshotPackageId snapshotId (PackageIdentifierP pname ver) =
            (pn ^. PackageNameName ==. val pname) &&.
            (v ^. VersionVersion ==. val ver))
         return (sp ^. SnapshotPackageId)
+  --
+  -- i.e.
+  --
+  -- select sp.id
+  -- from snapshot_package sp
+  -- join version v
+  --   on v.id = sp.version
+  -- join package_name pn
+  --   on pn.id = sp.package_name
+  -- where sp.snapshot = $snapshot_id
+  --   and pn.name = $name
+  --   and v.version = $version
 
 getSnapshotPackageCabalBlob ::
@@ -1127,6 +1141,16 @@ getSnapshotPackageCabalBlob snapshotId pname =
           ((sp ^. SnapshotPackageSnapshot ==. val snapshotId) &&.
            (pn ^. PackageNameName ==. val pname))
         return (blob ^. BlobContents)
+  -- i.e.
+  --
+  -- select blob.content
+  -- from snapshot_package sp
+  -- join package_name pn
+  --   on pn.id = sp.package_name
+  -- join blob
+  --   on blob.id = sp.cabal
+  -- where sp.snapshot = $snapshotId
+  --   and pn.name = $name
 
 -- | Idempotent and thread safe way of adding a new module.
 insertModuleSafe :: ModuleNameP -> ReaderT SqlBackend (RIO env) ModuleNameId
@@ -1164,6 +1188,7 @@ markModuleHasDocs snapshotId pid mSnapshotPackageId modName =
             \AND snapshot_package_module.snapshot_package = ?"
             [toPersistValue modName, toPersistValue snapshotPackageId]
           return $ Just snapshotPackageId
+        -- FIXME: The Nothing case seems like it should not happen.
         Nothing -> return Nothing
@@ -1199,9 +1224,10 @@ checkInsertSnapshotHoogleDb shouldInsert snapshotId = do
             lift $
               logInfo $
                 "Marking hoogle database for version " <> display hver <> " as available."
-          -- whether or not the version exists, we still put it into snapshot_hoogle_db
-          -- So literally the only use of the above query is to log the
-          -- action we're taking.
+          -- whether or not the version exists, we still put it into
+          -- snapshot_hoogle_db. So literally the only use of the above
+          -- query is to log the action we're taking. Whether or not it
+          -- exists is immaterial to the following action.
           isJust <$> P.insertUniqueEntity sh
         -- if we're not inserting, we're just checking if it already exists
         -- in snapshot_hoogle_db.
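checkInsertSnapshotHoogleDb leans on persistent's insertUniqueEntity, which returns Nothing when a matching row already exists, so the insert doubles as the existence check. A hypothetical, self-contained demonstration of that idiom (schema invented, using persistent-sqlite):

    {-# LANGUAGE DataKinds, DerivingStrategies, FlexibleInstances, GADTs,
                 GeneralizedNewtypeDeriving, MultiParamTypeClasses, OverloadedStrings,
                 QuasiQuotes, StandaloneDeriving, TemplateHaskell, TypeFamilies,
                 UndecidableInstances #-}
    import Control.Monad.IO.Class (liftIO)
    import Data.Maybe (isJust)
    import Database.Persist
    import Database.Persist.Sqlite
    import Database.Persist.TH

    share [mkPersist sqlSettings, mkMigrate "migrateAll"] [persistLowerCase|
    SnapshotHoogleDbRow
      snapshotId Int
      versionId  Int
      UniqueSnapshotVersion snapshotId versionId
    |]

    main :: IO ()
    main = runSqlite ":memory:" $ do
      runMigration migrateAll
      let row = SnapshotHoogleDbRow 1 1
      first  <- isJust <$> insertUniqueEntity row  -- True: row created
      second <- isJust <$> insertUniqueEntity row  -- False: already present
      liftIO $ print (first, second)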

View File

@@ -117,6 +117,7 @@ data SnapshotFile = SnapshotFile
   } deriving (Show)
 
+-- Is this a reference to a cabal file stored in Pantry?
 data PantryCabal = PantryCabal
   { pcPackageName :: !PackageNameP
   , pcVersion :: !VersionP
@@ -131,6 +132,7 @@ instance Display PantryCabal where
 instance ToMarkup PantryCabal where
   toMarkup = toMarkup . textDisplay
 
+-- A Cabal file (package name, version, blob) and source tree
 data PantryPackage = PantryPackage
   { ppPantryCabal :: !PantryCabal
   , ppPantryKey :: !TreeKey

View File

@@ -34,5 +34,4 @@ nix:
     - zlib
     - postgresql
     - pkg-config
-    - haskell-language-server
     - cacert