Merge pull request #343 from commercialhaskell/b/disk-cleanup

Post Hoogle generation disk cleanup
This commit is contained in:
Bryan Richter 2025-03-18 10:45:03 +02:00 committed by GitHub
commit a384248d68
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 302 additions and 175 deletions

View File

@ -58,7 +58,7 @@ optsParser =
T.unpack defHaddockBucketName)) <*>
switch
(long "do-not-upload" <>
help "Stop from hoogle db and snapshots.json from being generated and uploaded") <*>
help "Disable upload of Hoogle database and snapshots.json") <*>
option
readLogLevel
(long "log-level" <> metavar "LOG_LEVEL" <> short 'l' <> value LevelInfo <>

View File

@ -104,11 +104,15 @@ getStackageSnapshotsDir = do
withResponseUnliftIO :: MonadUnliftIO m => Request -> Manager -> (Response BodyReader -> m b) -> m b
withResponseUnliftIO req man f = withRunInIO $ \ runInIO -> withResponse req man (runInIO . f)
-- | Under the SingleRun wrapper that ensures only one thing at a time is
-- writing the file in question, ensure that a Hoogle database exists on the
-- filesystem for the given SnapName. But only going so far as downloading it
-- from the haddock bucket. See 'createHoogleDB' for the function that puts it
-- there in the first place.
-- | Returns an action that, under the SingleRun wrapper that ensures only one
-- thing at a time is writing the file in question, ensure that a Hoogle
-- database exists on the filesystem for the given SnapName. But only going so
-- far as downloading it from the haddock bucket. See 'buildAndUploadHoogleDBs' for the
-- function that puts it there in the first place. If no db exists in the
-- bucket, the action will return 'Nothing'.
--
-- The location searched is $PWD/hoogle/<snapshot>/<hoogle-version>.hoo
-- E.g. in production, ~stackage-update/hoogle/lts-22.20/5.0.18.4.hoo (for stackage-update).
newHoogleLocker ::
(HasLogFunc env, MonadIO m) => env -> Manager -> Text -> m (SingleRun SnapName (Maybe FilePath))
newHoogleLocker env man bucketUrl = mkSingleRun hoogleLocker
@ -232,38 +236,48 @@ runStackageUpdate doNotUpload = do
corePackageGetters <- makeCorePackageGetters
runResourceT $
join $
-- @createOrUpdateSnapshot@ processes package N while processing docs for
-- package N-1. This @pure ()@ is the "documentation processing action"
-- for the -1'th package.
runConduit $ sourceSnapshots .| foldMC (createOrUpdateSnapshot corePackageGetters) (pure ())
unless doNotUpload uploadSnapshotsJSON
buildAndUploadHoogleDB doNotUpload
buildAndUploadHoogleDBs doNotUpload
logInfo "Finished building and uploading Hoogle DBs"
-- | This will look at 'global-hints.yaml' and will create core package getters that are reused
-- later for adding those package to individual snapshot.
-- later for adding those package to individual snapshots.
makeCorePackageGetters ::
RIO StackageCron (Map CompilerP [CorePackageGetter])
makeCorePackageGetters = do
rootDir <- scStackageRoot <$> ask
contentDir <- getStackageContentDir rootDir
coreCabalFiles <- getCoreCabalFiles rootDir
backupCoreCabalFiles <- getBackupCoreCabalFiles rootDir
liftIO (decodeFileEither (contentDir </> "stack" </> "global-hints.yaml")) >>= \case
Right (hints :: Map CompilerP (Map PackageNameP VersionP)) ->
Map.traverseWithKey
(\compiler ->
fmap Map.elems .
Map.traverseMaybeWithKey (makeCorePackageGetter compiler coreCabalFiles))
Map.traverseMaybeWithKey (makeCorePackageGetter compiler backupCoreCabalFiles))
hints
Left exc -> do
logError $
"Error parsing 'global-hints.yaml' file: " <> fromString (displayException exc)
return mempty
getCoreCabalFiles ::
-- | Packages distributed with GHC aren't taken from Hackage like normal
-- packages. Release managers do upload them, however, so that their docs are
-- available.
--
-- Or at least, they should. The release process was fragile, and some packages
-- weren't uploaded. This mechanism gives us a chance to fill in missing
-- packages.
getBackupCoreCabalFiles ::
FilePath
-> RIO StackageCron (Map PackageIdentifierP (GenericPackageDescription, CabalFileIds))
getCoreCabalFiles rootDir = do
coreCabalFilesDir <- getCoreCabalFilesDir rootDir
cabalFileNames <- getDirectoryContents coreCabalFilesDir
getBackupCoreCabalFiles rootDir = do
backupCoreCabalFilesDir <- getBackupCoreCabalFilesDir rootDir
cabalFileNames <- getDirectoryContents backupCoreCabalFilesDir
cabalFiles <-
forM (filter (isExtensionOf ".cabal") cabalFileNames) $ \cabalFileName ->
let pidTxt = T.pack (dropExtension (takeFileName cabalFileName))
@ -272,15 +286,17 @@ getCoreCabalFiles rootDir = do
logError $ "Invalid package identifier: " <> fromString cabalFileName
pure Nothing
Just pid -> do
cabalBlob <- readFileBinary (coreCabalFilesDir </> cabalFileName)
cabalBlob <- readFileBinary (backupCoreCabalFilesDir </> cabalFileName)
mCabalInfo <- run $ addCabalFile pid cabalBlob
pure ((,) pid <$> mCabalInfo)
pure $ Map.fromList $ catMaybes cabalFiles
-- | Core package info rarely changes between the snapshots, therefore it would be wasteful to
-- load, parse and update all packages from gloabl-hints for each snapshot, instead we produce
-- a memoized version that will do it once initiall and then return information aboat a
-- load, parse and update all packages from gloabl-hints for each snapshot. Instead we produce
-- a memoized version that will do it once initially and then return information about a
-- package on subsequent invocations.
--
-- FIXME: The compiler argument is unused (and never has been). Should it be used?
makeCorePackageGetter ::
CompilerP
-> Map PackageIdentifierP (GenericPackageDescription, CabalFileIds)
@ -309,6 +325,9 @@ makeCorePackageGetter _compiler fallbackCabalFileMap pname ver =
Nothing -> do
whenM (scReportProgress <$> ask) $
logSticky $ "Loading core package: " <> display pid
-- I have no idea what's happening here. I guess I
-- don't know what it means to "load" a package.
-- What is actually going on?
htr <- getHackageTarball pir Nothing
case htrFreshPackageInfo htr of
Just (gpd, treeId) -> do
@ -336,64 +355,70 @@ makeCorePackageGetter _compiler fallbackCabalFileMap pname ver =
PackageIdentifierRevision (unPackageNameP pname) (unVersionP ver) (CFIRevision (Revision 0))
-- TODO: for now it is only from hackage, PantryPackage needs an update to use other origins
-- | A pantry package is being added to a particular snapshot. Extra information like compiler and
-- flags are passed on in order to properly figure out dependencies and modules
-- | Populates the database with information about a package?
--
-- Specifically, a pantry package is being added to a particular snapshot.
--
-- Extra information like compiler and flags are passed on in order to properly
-- figure out dependencies and modules.
--
-- TODO: for now it is only from hackage. PantryPackage needs an update to use other origins
addPantryPackage ::
SnapshotId -> CompilerP -> Bool -> Map FlagNameP Bool -> PantryPackage -> RIO StackageCron Bool
addPantryPackage sid compiler isHidden flags (PantryPackage pc treeKey) = do
addPantryPackage snapId compiler isHidden flags (PantryPackage pcabal pTreeKey) = do
env <- ask
let gpdCachedRef = scCachedGPD env
cache = scCacheCabalFiles env
let pkgDescCache = scCachedGPD env
cacheP = scCacheCabalFiles env
let blobKeyToInt = fromIntegral . unSqlBackendKey . unBlobKey
let updateCacheGPD blobId gpd =
gpd `deepseq`
atomicModifyIORef' gpdCachedRef (\cacheMap -> (IntMap.insert blobId gpd cacheMap, gpd))
let getCachedGPD treeCabal =
let cachedPkgDesc cabalBlobId pkgDesc =
pkgDesc `deepseq`
atomicModifyIORef' pkgDescCache (\cacheMap -> (IntMap.insert cabalBlobId pkgDesc cacheMap, pkgDesc))
let getPkgDesc cabalBlobId =
\case
Just gpd | cache -> updateCacheGPD (blobKeyToInt treeCabal) gpd
Just gpd -> pure gpd
Nothing | cache -> do
cacheMap <- readIORef gpdCachedRef
case IntMap.lookup (blobKeyToInt treeCabal) cacheMap of
Just gpd -> pure gpd
Just pkgDesc | cacheP -> cachedPkgDesc (blobKeyToInt cabalBlobId) pkgDesc
Just pkgDesc -> pure pkgDesc
Nothing | cacheP -> do
cacheMap <- readIORef pkgDescCache
case IntMap.lookup (blobKeyToInt cabalBlobId) cacheMap of
Just pkgDesc -> pure pkgDesc
Nothing ->
loadBlobById treeCabal >>=
updateCacheGPD (blobKeyToInt treeCabal) . parseCabalBlob
Nothing -> parseCabalBlob <$> loadBlobById treeCabal
let storeHackageSnapshotPackage hcid mtid mgpd =
getTreeForKey treeKey >>= \case
Just (Entity treeId _)
| Just tid <- mtid
, tid /= treeId -> do
lift $ logError $ "Pantry Tree Key mismatch for: " <> display pc
loadBlobById cabalBlobId >>=
cachedPkgDesc (blobKeyToInt cabalBlobId) . parseCabalBlob
Nothing -> parseCabalBlob <$> loadBlobById cabalBlobId
let storeHackageSnapshotPackage hackageCabalId mTreeId mpkgDesc =
getTreeForKey pTreeKey >>= \case
-- error case #1
Just (Entity treeId' _)
| Just treeId <- mTreeId
, treeId /= treeId' -> do
lift $ logError $ "Pantry Tree Key mismatch for: " <> display pcabal
pure False
Just tree@(Entity _ Tree {treeCabal})
| Just treeCabal' <- treeCabal -> do
gpd <- getCachedGPD treeCabal' mgpd
let mhcid = Just hcid
eTree = Right tree
addSnapshotPackage sid compiler Hackage eTree mhcid isHidden flags pid gpd
-- happy case
Just pkgTree@(Entity _ Tree {treeCabal})
| Just cabalBlobId <- treeCabal -> do
pkgDesc <- getPkgDesc cabalBlobId mpkgDesc
addSnapshotPackage snapId compiler Hackage (Right pkgTree) (Just hackageCabalId) isHidden flags packageId pkgDesc
pure True
-- error case #2
_ -> do
lift $ logError $ "Pantry is missing the source tree for " <> display pc
lift $ logError $ "Pantry is missing the source tree for " <> display pcabal
pure False
mHackageCabalInfo <- run $ getHackageCabalByKey pid (pcCabalKey pc)
mHackageCabalInfo <- run $ getHackageCabalByKey packageId (pcCabalKey pcabal)
case mHackageCabalInfo of
Nothing -> do
logError $ "Could not find the cabal file for: " <> display pc
logError $ "Could not find the cabal file for: " <> display pcabal
pure False
Just (hcid, Nothing) -> do
Just (hackageCabalId, Nothing) -> do
mHPI <-
htrFreshPackageInfo <$>
getHackageTarball (toPackageIdentifierRevision pc) (Just treeKey)
getHackageTarball (toPackageIdentifierRevision pcabal) (Just pTreeKey)
run $
case mHPI of
Just (gpd, treeId) -> storeHackageSnapshotPackage hcid (Just treeId) (Just gpd)
Nothing -> storeHackageSnapshotPackage hcid Nothing Nothing
Just (hcid, mtid) -> run $ storeHackageSnapshotPackage hcid mtid Nothing
Just (pkgDesc, treeId) -> storeHackageSnapshotPackage hackageCabalId (Just treeId) (Just pkgDesc)
Nothing -> storeHackageSnapshotPackage hackageCabalId Nothing Nothing
Just (hackageCabalId, mTreeId) -> run $ storeHackageSnapshotPackage hackageCabalId mTreeId Nothing
where
pid = PackageIdentifierP (pcPackageName pc) (pcVersion pc)
packageId = PackageIdentifierP (pcPackageName pcabal) (pcVersion pcabal)
@ -404,33 +429,33 @@ checkForDocs :: SnapshotId -> SnapName -> ResourceT (RIO StackageCron) ()
checkForDocs snapshotId snapName = do
bucketName <- lift (scDownloadBucketName <$> ask)
env <- asks scEnvAWS
mods <-
-- it is faster to download all modules in this snapshot separately, rather
-- than process them with a conduit all the way to the database.
packageModules <-
runConduit $
paginate env (req bucketName) .| concatMapC (fromMaybe [] . (^. listObjectsV2Response_contents)) .|
mapC (\obj -> toText (obj ^. object_key)) .|
concatMapC (T.stripSuffix ".html" >=> T.stripPrefix prefix >=> pathToPackageModule) .|
sinkList
-- it is faster to download all modules in this snapshot, than process them with a conduit all
-- the way to the database.
paginate env (listSnapshotObjects bucketName)
.| concatMapC (fromMaybe [] . (^. listObjectsV2Response_contents))
.| mapC (\obj -> toText (obj ^. object_key))
.| concatMapC (T.stripSuffix ".html" >=> T.stripPrefix prefix >=> pathToPackageModule)
.| sinkList
-- Cache SnapshotPackageId rather than look it up many times for each module in the package.
sidsCacheRef <- newIORef Map.empty
-- Cache is for SnapshotPackageId, there will be many modules per peckage, no need to look into
-- the database for each one of them.
-- The other half of the cores are used in 'updateSnapshot'
n <- max 1 . (`div` 2) <$> getNumCapabilities
unexpectedPackages <- lift $ pooledMapConcurrentlyN n (markModules sidsCacheRef) mods
unexpectedPackages <- lift $ pooledMapConcurrentlyN n (markModule sidsCacheRef) packageModules
forM_ (Set.fromList $ catMaybes unexpectedPackages) $ \pid ->
lift $
logWarn $
"Documentation found for package '" <> display pid <>
"', which does not exist in this snapshot: " <>
lift $ logWarn $
"Documentation found for package '" <> display pid <>
"', which does not exist in this snapshot: " <>
display snapName
where
prefix = textDisplay snapName <> "/"
req bucketName = newListObjectsV2 (BucketName bucketName) & listObjectsV2_prefix ?~ prefix
listSnapshotObjects bucketName = newListObjectsV2 (BucketName bucketName) & listObjectsV2_prefix ?~ prefix
-- | This function records all package modules that have documentation available, the ones
-- that are not found in the snapshot reported back as an error. Besides being run
-- concurrently this function optimizes the SnapshotPackageId lookup as well, since that can
-- be shared amongst many modules of one package.
markModules sidsCacheRef (pid, modName) = do
markModule sidsCacheRef (pid, modName) = do
sidsCache <- readIORef sidsCacheRef
let mSnapshotPackageId = Map.lookup pid sidsCache
mFound <- run $ markModuleHasDocs snapshotId pid mSnapshotPackageId modName
@ -456,8 +481,7 @@ sourceSnapshots :: ConduitT a SnapshotFileInfo (ResourceT (RIO StackageCron)) ()
sourceSnapshots = do
snapshotsDir <- lift $ lift getStackageSnapshotsDir
sourceDirectoryDeep False (snapshotsDir </> "lts") .| concatMapMC (getLtsParser snapshotsDir)
sourceDirectoryDeep False (snapshotsDir </> "nightly") .|
concatMapMC (getNightlyParser snapshotsDir)
sourceDirectoryDeep False (snapshotsDir </> "nightly") .| concatMapMC (getNightlyParser snapshotsDir)
where
makeSnapshotFileInfo gitDir fp mFileNameDate snapName = do
let parseSnapshot updatedOn = do
@ -500,39 +524,52 @@ sourceSnapshots = do
return Nothing
-- | Creates a new `Snapshot` if it is not yet present in the database and decides if update
data DecisionResult a e = NothingToDo | NoSnapshotFile | NeedsUpdate a e | DoesntExist e
-- | Creates a new `Snapshot` if it is not yet present in the database, and decides if update
-- is necessary when it already exists.
--
-- TODO: Silently ignoring snapshots where the getter returns Nothing seems like
-- a potential problem. Anyway I'd rather run it beforehand!
decideOnSnapshotUpdate :: SnapshotFileInfo -> RIO StackageCron (Maybe (SnapshotId, SnapshotFile))
decideOnSnapshotUpdate SnapshotFileInfo {sfiSnapName, sfiUpdatedOn, sfiSnapshotFileGetter} = do
forceUpdate <- scForceFullUpdate <$> ask
let mkLogMsg rest = "Snapshot with name: " <> display sfiSnapName <> " " <> rest
mKeySnapFile <-
run (getBy (UniqueSnapshot sfiSnapName)) >>= \case
-- exists, up to date, no force-updated requested; nothing to do
Just (Entity _key snap)
| snapshotUpdatedOn snap == Just sfiUpdatedOn && not forceUpdate -> do
logInfo $ mkLogMsg "already exists and is up to date."
return Nothing
| snapshotUpdatedOn snap == Just sfiUpdatedOn && not forceUpdate ->
return NothingToDo
-- exists but updatedOn was not previously set.
Just entity@(Entity _key snap)
| Nothing <- snapshotUpdatedOn snap -> do
logWarn $ mkLogMsg "did not finish updating last time."
fmap (Just entity, ) <$> sfiSnapshotFileGetter
maybe NoSnapshotFile (NeedsUpdate entity) <$> sfiSnapshotFileGetter
-- exists, but updatedOn does not match or force-update was requested.
Just entity -> do
unless forceUpdate $ logWarn $ mkLogMsg "was updated, applying new patch."
fmap (Just entity, ) <$> sfiSnapshotFileGetter
Nothing -> fmap (Nothing, ) <$> sfiSnapshotFileGetter
maybe NoSnapshotFile (NeedsUpdate entity) <$> sfiSnapshotFileGetter
-- does not exist
Nothing -> maybe NoSnapshotFile DoesntExist <$> sfiSnapshotFileGetter
-- Add new snapshot to the database, when necessary
case mKeySnapFile of
Just (Just (Entity snapKey snap), sf@SnapshotFile {sfCompiler, sfPublishDate})
NothingToDo -> Nothing <$ logInfo (mkLogMsg "already exists and is up to date.")
NoSnapshotFile -> Nothing <$ logWarn (mkLogMsg "has no (readable?) snapshot file.")
NeedsUpdate (Entity oldSnapKey oldSnap) sf@SnapshotFile {sfCompiler, sfPublishDate}
| Just publishDate <- sfPublishDate -> do
let updatedSnap =
Snapshot sfiSnapName sfCompiler publishDate (snapshotUpdatedOn snap)
run $ replace snapKey updatedSnap
pure $ Just (snapKey, sf)
Just (Nothing, sf@SnapshotFile {sfCompiler, sfPublishDate})
| Just publishDate <- sfPublishDate ->
Snapshot sfiSnapName sfCompiler publishDate (snapshotUpdatedOn oldSnap)
run $ replace oldSnapKey updatedSnap
pure $ Just (oldSnapKey, sf)
| otherwise -> return Nothing
DoesntExist sf@SnapshotFile {sfCompiler, sfPublishDate}
| Just publishDate <- sfPublishDate -> do
logInfo $ mkLogMsg "is new, adding to the database."
fmap (, sf) <$>
run (insertUnique (Snapshot sfiSnapName sfCompiler publishDate Nothing))
_ -> return Nothing
run (insertUnique (Snapshot sfiSnapName sfCompiler publishDate Nothing))
| otherwise -> Nothing <$ logWarn (mkLogMsg "has no publish date, skipping.")
type CorePackageGetter
= RIO StackageCron ( Either CabalFileIds (Entity Tree)
@ -545,14 +582,38 @@ type CorePackageGetter
-- current snapshot as well as an action that was passed as an argument. At the end it will return
-- an action that should be invoked in order to mark modules that have documentation available,
-- which in turn can be passed as an argument to the next snapshot loader.
-- Something something ouroboros.
--
-- Question: When do the docs for the last snapshot get loaded?
--
-- Well, this binding is called as @join $ runConduit $ foldMC (createOrUpdateSnapshot corePackageInfoGetters) (pure ())@
--
-- So the answer: the doc-loading action for the last snapshot gets returned by @runConduit $ foldMC ...@,
-- which means it gets executed by @join $ runConduit $ foldMC ...@.
--
-- Evidence:
--
-- Since @foldMC :: (a -> b -> m a) -> a -> ConduitT b o m a@, we see
--
-- @@
-- a ~ ResourceT (RIO Stackage Cron) () -- this is the doc-loading action
-- b ~ SnapshotFileInfo
-- m ~ ResourceT (RIO StackageCron)
-- @@
-- and the foldMC creates a @ConduitT SnapshotFileInfo o (ResourceT (RIO StackageCron)) (ResourceT (RIO StackageCron) ())@
--
-- TODO: It might be more efficient to just put all the actions (snapshot
-- creation and documentation writing both) on a queue and let a bunch of
-- workers chew on it. The current impl creates arbitrary synchronization points
-- with 'runConcurrently'. Granted, I don't know what a good chunk size would
-- actually be.
createOrUpdateSnapshot ::
Map CompilerP [CorePackageGetter]
-> ResourceT (RIO StackageCron) ()
-> SnapshotFileInfo
-> ResourceT (RIO StackageCron) (ResourceT (RIO StackageCron) ())
createOrUpdateSnapshot corePackageInfoGetters prevAction sfi@SnapshotFileInfo { sfiSnapName
, sfiUpdatedOn
} = do
createOrUpdateSnapshot corePackageInfoGetters prevAction sfi@SnapshotFileInfo { sfiSnapName , sfiUpdatedOn } = do
finishedDocs <- newIORef False
runConcurrently
(Concurrently (prevAction >> writeIORef finishedDocs True) *>
@ -561,6 +622,7 @@ createOrUpdateSnapshot corePackageInfoGetters prevAction sfi@SnapshotFileInfo {
loadCurrentSnapshot finishedDocs = do
loadDocs <-
decideOnSnapshotUpdate sfi >>= \case
-- Nothing to do, and thus no docs to process
Nothing -> return $ pure ()
Just (snapshotId, snapshotFile) ->
updateSnapshot
@ -575,22 +637,30 @@ createOrUpdateSnapshot corePackageInfoGetters prevAction sfi@SnapshotFileInfo {
logSticky "Still loading the docs for previous snapshot ..."
pure loadDocs
-- | Updates all packages in the snapshot. If any missing they will be created. Returns an action
-- that will check for available documentation for modules that are known to exist and mark as
-- documented when haddock is present on AWS S3. Only after documentation has been checked this
-- snapshot will be marked as completely updated. This is required in case something goes wrong and
-- process is interrupted
-- | Creates Lts or Nightly entity [Question(bryan): Why not do this when
-- creating the snapshot? Why is this a separate table anyway?] and updates all
-- packages in the snapshot. If any packages are missing they will be created.
-- Returns an action that will (a) check for available documentation for the
-- packages' modules and (b) mark the packages as documented when haddock is
-- present on AWS S3.
--
-- (Only after documentation has been checked will this snapshot be marked as
-- completely updated. This is required in case something goes wrong and process
-- is interrupted.)
updateSnapshot ::
Map CompilerP [CorePackageGetter]
-> SnapshotId
-> SnapName
-> UTCTime
-> SnapshotFile
-> RIO StackageCron (ResourceT (RIO StackageCron) ())
-> RIO StackageCron (ResourceT (RIO StackageCron) ()) -- ^ Returns the action for processing docs
updateSnapshot corePackageGetters snapshotId snapName updatedOn SnapshotFile {..} = do
insertSnapshotName snapshotId snapName
loadedPackageCountRef <- newIORef (0 :: Int)
let totalPackages = length sfPackages
-- A wrapper for 'addPantryPackage' that extracts the package info from
-- snapshot info, increments the count of loaded packages, and reports success
-- as a Bool.
addPantryPackageWithReport pp = do
let PantryCabal {pcPackageName} = ppPantryCabal pp
isHidden = fromMaybe False (Map.lookup pcPackageName sfHidden)
@ -695,6 +765,9 @@ uploadHoogleDB fp key =
body <- toBody <$> readFileBinary fpgz
uploadBucket <- scUploadBucketName <$> ask
uploadFromRIO key $
-- FIXME: I should also set content encoding explicitly here. But
-- then I would break stackage-server, which applies an 'ungzip' in
-- 'newHoogleLocker'. :(
set putObject_acl (Just ObjectCannedACL_Public_read) $ newPutObject (BucketName uploadBucket) key body
@ -708,17 +781,31 @@ uploadFromRIO key po = do
logError $ "Couldn't upload " <> displayShow key <> " to S3 becuase " <> displayShow e
Right _ -> logInfo $ "Successfully uploaded " <> displayShow key <> " to S3"
buildAndUploadHoogleDB :: Bool -> RIO StackageCron ()
buildAndUploadHoogleDB doNotUpload = do
-- | As the name says, build and upload Hoogle DBs.
--
-- Which DBs? The last 5 LTS and the last 5 Nightlies that are missing their
-- Hoogle DBs.
--
-- How? It downloads the Hoogle inputs that were previously generated alongside
-- the Haddocks, runs @hoogle@ on them, and uploads the result back to the same
-- bucket. Those inputs were generated by snapshot curation.
--
-- Why? I feel like this should be a short Bash script using curl and hoogle, and
-- maybe one day it will be.
--
-- This action is only run by stackage-server-cron.
buildAndUploadHoogleDBs :: Bool -> RIO StackageCron ()
buildAndUploadHoogleDBs doNotUpload = do
snapshots <- lastLtsNightlyWithoutHoogleDb 5 5
-- currentHoogleVersionId <- scHoogleVersionId <$> ask
env <- ask
awsEnv <- asks scEnvAWS
bucketUrl <- asks scDownloadBucketUrl
-- locker is an action that returns the path to a hoogle db, if one exists
-- in the haddock bucket already.
-- in the haddock bucket already. It takes the SnapName as an argument.
-- I think it might be overkill.
locker <- newHoogleLocker (env ^. logFuncL) (awsEnv ^. env_manager) bucketUrl
let insertH = checkInsertSnapshotHoogleDb True
let -- These bindings undo a questionable conflation of operations
insertH = checkInsertSnapshotHoogleDb True
checkH = checkInsertSnapshotHoogleDb False
for_ snapshots $ \(snapshotId, snapName) ->
-- Even though we just got a list of snapshots that don't have hoogle
@ -727,72 +814,82 @@ buildAndUploadHoogleDB doNotUpload = do
-- checkInsertSnapshotHoogleDb just check against SnapshotHoogleDb.
-- Perhaps the check can be removed.
unlessM (checkH snapshotId) $ do
withSystemTempDirectory ("hoogle-" ++ T.unpack (textDisplay snapName)) $ \tmpdir -> do
logInfo $ "Starting Hoogle database download: " <> display (hoogleKey snapName)
-- Check if the database already exists (by downloading it).
-- FIXME: Why not just send a HEAD?
-- Perhaps the idea was to put the hoogle database somewhere the
-- main Stackage server process can find it? But nowadays
-- stackage-server downloads its own version separately.
mfp <- singleRun locker snapName
case mfp of
Just _ -> do
logInfo $ "Current hoogle database exists for: " <> display snapName
Just fp -> do
-- Something bad must have happened: we created the hoogle db
-- previously, but didn't get to record it as available.
logWarn $ "Unregistered hoogle database found for: " <> display snapName
<> ". Registering now."
void $ insertH snapshotId
-- FIXME: For now we need to delete this file we just
-- downloaded. We probably shouldn't download it in the
-- first place, though (just use HEAD).
liftIO $ removeFile fp
Nothing -> do
logInfo $ "Current hoogle database does not yet exist in the bucket for: " <> display snapName
mfp' <- createHoogleDB snapshotId snapName
forM_ mfp' $ \fp -> do
let key = hoogleKey snapName
dest = T.unpack key
createDirectoryIfMissing True $ takeDirectory dest
renamePath fp dest
unless doNotUpload $ do
uploadHoogleDB dest (ObjectKey key)
void $ insertH snapshotId
-- NB: createHoogleDB will fail if something goes wrong.
fp <- createHoogleDB tmpdir snapshotId snapName
let key = hoogleKey snapName
unless doNotUpload $ do
uploadHoogleDB fp (ObjectKey key)
void $ insertH snapshotId
-- | Create a hoogle db from haddocks for the given snapshot, and upload it to
-- the haddock bucket.
createHoogleDB :: SnapshotId -> SnapName -> RIO StackageCron (Maybe FilePath)
createHoogleDB snapshotId snapName =
handleAny logException $ do
logInfo $ "Creating Hoogle DB for " <> display snapName
downloadBucketUrl <- scDownloadBucketUrl <$> ask
let root = "hoogle-gen"
bindir = root </> "bindir"
outname = root </> "output.hoo"
tarKey = toPathPiece snapName <> "/hoogle/orig.tar"
tarUrl = downloadBucketUrl <> "/" <> tarKey
tarFP = root </> T.unpack tarKey
-- When tarball is downloaded it is saved with durability and atomicity, so if it
-- is present it is not in a corrupted state
unlessM (doesFileExist tarFP) $ do
req <- parseRequest $ T.unpack tarUrl
env <- asks scEnvAWS
let man = env ^. env_manager
withResponseUnliftIO req {decompress = const True} man $ \res -> do
throwErrorStatusCodes req res
createDirectoryIfMissing True $ takeDirectory tarFP
withBinaryFileDurableAtomic tarFP WriteMode $ \tarHandle ->
runConduitRes $ bodyReaderSource (responseBody res) .| sinkHandle tarHandle
void $ tryIO $ removeDirectoryRecursive bindir
void $ tryIO $ removeFile outname
createDirectoryIfMissing True bindir
withSystemTempDirectory ("hoogle-" ++ T.unpack (textDisplay snapName)) $ \tmpdir -> do
Any hasRestored <-
runConduitRes $
sourceFile tarFP .|
untar (restoreHoogleTxtFileWithCabal tmpdir snapshotId snapName) .|
foldMapC Any
unless hasRestored $ error "No Hoogle .txt files found"
let args = ["generate", "--database=" ++ outname, "--local=" ++ tmpdir]
logInfo $
mconcat
[ "Merging databases... ("
, foldMap fromString $ L.intersperse " " ("hoogle" : args)
, ")"
]
liftIO $ Hoogle.hoogle args
logInfo "Merge done"
return $ Just outname
where
logException exc =
logError ("Problem creating hoogle db for " <> display snapName <> ": " <> displayShow exc) $>
Nothing
-- | Create a hoogle db from haddocks for the given snapshot.
--
-- Haddocks are downloaded from the documentation bucket, where they were
-- uploaded as a tar file.
--
-- Returns the path to the .hoo database, which will be found in the first
-- argument. It will look like @<rootDir>/hoogle-gen/output.hoo@.
createHoogleDB :: FilePath -> SnapshotId -> SnapName -> RIO StackageCron FilePath
createHoogleDB rootDir snapshotId snapName = do
logInfo $ "Creating Hoogle DB for " <> display snapName
downloadBucketUrl <- scDownloadBucketUrl <$> ask
let root = rootDir </> "hoogle-gen"
outname = root </> "output.hoo"
inputTarKey = toPathPiece snapName <> "/hoogle/orig.tar"
inputTarUrl = downloadBucketUrl <> "/" <> inputTarKey
outputTarFP = root </> T.unpack inputTarKey
-- Fetch the tarball with Hoogle inputs
req <- parseRequest $ T.unpack inputTarUrl
env <- asks scEnvAWS
let man = env ^. env_manager
withResponseUnliftIO req {decompress = const True} man $ \res -> do
throwErrorStatusCodes req res
createDirectoryIfMissing True $ takeDirectory outputTarFP
withBinaryFileDurableAtomic outputTarFP WriteMode $ \tarHandle ->
runConduitRes $ bodyReaderSource (responseBody res) .| sinkHandle tarHandle
-- Extract the Hoogle inputs from the tarball into a separate temp dir, then
-- generate the hoogle database.
withSystemTempDirectory ("hoogle-" ++ T.unpack (textDisplay snapName)) $ \tmpdir -> do
Any hasRestored <-
runConduitRes $
sourceFile outputTarFP .|
untar (restoreHoogleTxtFileWithCabal tmpdir snapshotId snapName) .|
foldMapC Any
-- We just check if we have any Hoogle .txt file at all.
unless hasRestored $ error "No Hoogle .txt files found"
-- Generate the hoogle database
let args = ["generate", "--database=" ++ outname, "--local=" ++ tmpdir]
logInfo $
mconcat
[ "Merging databases... ("
, foldMap fromString $ L.intersperse " " ("hoogle" : args)
, ")"
]
-- 'Hoogle.hoogle' expects to run as an app, and crashes if something
-- goes wrong. That's good.
liftIO $ Hoogle.hoogle args
logInfo "Merge done"
pure outname
-- | Grabs hoogle txt file from the tarball and a matching cabal file from pantry. Writes
@ -824,7 +921,10 @@ restoreHoogleTxtFileWithCabal tmpdir snapshotId snapName fileInfo =
_ -> yield False
pathToPackageModule :: Text -> Maybe (PackageIdentifierP, ModuleNameP)
pathToPackageModule
:: Text
-- ^ Input is like @ace-0.6/ACE-Combinators@
-> Maybe (PackageIdentifierP, ModuleNameP)
pathToPackageModule txt =
case T.split (== '/') txt of
[pkgIdentifier, moduleNameDashes] -> do

View File

@ -4,7 +4,7 @@ module Stackage.Database.Github
( cloneOrUpdate
, lastGitFileUpdate
, getStackageContentDir
, getCoreCabalFilesDir
, getBackupCoreCabalFilesDir
, GithubRepo(..)
) where
@ -81,9 +81,9 @@ getStackageContentDir rootDir =
cloneOrUpdate rootDir (GithubRepo "commercialhaskell" "stackage-content")
-- | Use backup location with cabal files, hackage doesn't have all of them.
getCoreCabalFilesDir ::
getBackupCoreCabalFilesDir ::
(MonadReader env m, HasLogFunc env, HasProcessContext env, MonadIO m)
=> FilePath
-> m FilePath
getCoreCabalFilesDir rootDir =
getBackupCoreCabalFilesDir rootDir =
cloneOrUpdate rootDir (GithubRepo "commercialhaskell" "core-cabal-files")

View File

@ -173,7 +173,7 @@ ltsBefore x y = do
lastLtsNightlyWithoutHoogleDb :: Int -> Int -> RIO StackageCron [(SnapshotId, SnapName)]
lastLtsNightlyWithoutHoogleDb ltsCount nightlyCount = do
currentHoogleVersionId <- scHoogleVersionId <$> ask
let getSnapshotsWithoutHoogeDb snapId snapCount =
let getSnapshotsWithoutHoogleDb snapId snapCount =
map (unValue *** unValue) <$>
select
-- "snap" is either Lts or Nightly, while "snapshot" is indeed
@ -206,12 +206,12 @@ lastLtsNightlyWithoutHoogleDb ltsCount nightlyCount = do
-- order by snapshot.created desc
-- limit $snapCount
--
-- So it returns a list of snapshots where there is no
-- So it returns a limited list of snapshots where there is no
-- corresponding entry in the snapshot_hoogle_db table for the
-- current hoogle version.
run $ do
lts <- getSnapshotsWithoutHoogeDb LtsSnap ltsCount
nightly <- getSnapshotsWithoutHoogeDb NightlySnap nightlyCount
lts <- getSnapshotsWithoutHoogleDb LtsSnap ltsCount
nightly <- getSnapshotsWithoutHoogleDb NightlySnap nightlyCount
pure $ lts ++ nightly
@ -1100,6 +1100,8 @@ getHackageCabalByKey (PackageIdentifierP pname ver) (BlobKey sha size) =
return (hc ^. HackageCabalId, hc ^. HackageCabalTree)
-- | Gets the id for the SnapshotPackage that corresponds to the given Snapshot
-- and PackageIdentifier.
getSnapshotPackageId ::
SnapshotId
-> PackageIdentifierP
@ -1114,6 +1116,18 @@ getSnapshotPackageId snapshotId (PackageIdentifierP pname ver) =
(pn ^. PackageNameName ==. val pname) &&.
(v ^. VersionVersion ==. val ver))
return (sp ^. SnapshotPackageId)
--
-- i.e.
--
-- select sp.id
-- from snapshot_package sp
-- join version
-- on version.id = sp.version
-- join package_name pn
-- on pn.id = sp.package_name
-- where sp.snapshot = $snapshot_id
-- and pn.name = $name
-- and v.version = $version
getSnapshotPackageCabalBlob ::
@ -1127,6 +1141,16 @@ getSnapshotPackageCabalBlob snapshotId pname =
((sp ^. SnapshotPackageSnapshot ==. val snapshotId) &&.
(pn ^. PackageNameName ==. val pname))
return (blob ^. BlobContents)
-- i.e.
--
-- select blob.content
-- from snapshot_package sp
-- join package_name pn
-- on pn.id = sp.package_name
-- join blob
-- on blob.id = sp.cabal
-- where sp.snapshot = $snapshotId
-- and pn.name = $name
-- | Idempotent and thread safe way of adding a new module.
insertModuleSafe :: ModuleNameP -> ReaderT SqlBackend (RIO env) ModuleNameId
@ -1164,6 +1188,7 @@ markModuleHasDocs snapshotId pid mSnapshotPackageId modName =
\AND snapshot_package_module.snapshot_package = ?"
[toPersistValue modName, toPersistValue snapshotPackageId]
return $ Just snapshotPackageId
-- FIXME: The Nothing case seems like it should not happen.
Nothing -> return Nothing
@ -1199,9 +1224,10 @@ checkInsertSnapshotHoogleDb shouldInsert snapshotId = do
lift $
logInfo $
"Marking hoogle database for version " <> display hver <> " as available."
-- whether or not the version exists, we still put it into snapshot_hoogle_db
-- So literally the only use of the above query is to log the
-- action we're taking.
-- whether or not the version exists, we still put it into
-- snapshot_hoogle_db. So literally the only use of the above
-- query is to log the action we're taking. Whether or not it
-- exists is immaterial to the following action.
isJust <$> P.insertUniqueEntity sh
-- if we're not inserting, we're just checking if it already exists
-- in snapshot_hoogle_db.

View File

@ -117,6 +117,7 @@ data SnapshotFile = SnapshotFile
} deriving (Show)
-- Is this a reference to a cabal file stored in Pantry?
data PantryCabal = PantryCabal
{ pcPackageName :: !PackageNameP
, pcVersion :: !VersionP
@ -131,6 +132,7 @@ instance Display PantryCabal where
instance ToMarkup PantryCabal where
toMarkup = toMarkup . textDisplay
-- A Cabal file (package name, version, blob) and source tree
data PantryPackage = PantryPackage
{ ppPantryCabal :: !PantryCabal
, ppPantryKey :: !TreeKey

View File

@ -34,5 +34,4 @@ nix:
- zlib
- postgresql
- pkg-config
- haskell-language-server
- cacert