-- SPDX-FileCopyrightText: 2022 Gregor Kleen ,Steffen Jost -- -- SPDX-License-Identifier: AGPL-3.0-or-later module Handler.Utils.TermCandidates where import Import -- import Handler.Utils -- Import this module as Candidates -- import Utils.Lens -- import Data.Time -- import qualified Data.Text as T -- import Data.Function ((&)) -- import Yesod.Form.Bootstrap3 -- import Colonnade hiding (fromMaybe) -- import Yesod.Colonnade -- import qualified Data.UUID.Cryptographic as UUID -- import Control.Monad.Trans.Writer (mapWriterT) -- import Database.Persist.Sql (fromSqlKey) import qualified Data.Set as Set import qualified Data.Map as Map import qualified Database.Esqueleto.Legacy as E import qualified Database.Esqueleto.Internal.Internal as E {-# ANN module ("HLint: ignore Use newtype instead of data"::String) #-} type STKey = Int -- for convenience, assmued identical to field StudyTermNameCandidateKey data FailedCandidateInference = FailedCandidateInference [Entity StudyTerms] deriving (Typeable, Show) instance Exception FailedCandidateInference -- Default Instance -- -- | Just an heuristik to fill in defaults -- shortenStudyTerm :: Text -> Text -- shortenStudyTerm = concatMap (take 4) . splitCamel -- | Attempt to identify new StudyTerms based on observations, returning: -- * list of ambiguous instances that were discarded outright (identical names for differents keys observed in single incidences) -- * list of problems, ie. StudyTerms that contradict observed incidences -- * list of redundants, i.e. redundant observed incidences -- * list of accepted, i.e. newly accepted key/name pairs inferNamesHandler :: Handler ([Entity StudyTerms],[TermCandidateIncidence],[Entity StudyTermNameCandidate],[(StudyTermsId,Text)]) inferNamesHandler = runDB $ inferAcc mempty where inferAcc (accAmbiguous, accRedundants, accAccepted) = handle (\(FailedCandidateInference fails) -> (fails, accAmbiguous, accRedundants, accAccepted') <$ E.transactionUndo) $ do (infAmbis, infReds, infAccs) <- inferStep if | null infAccs -> return ([], accAmbiguous, infReds <> accRedundants, accAccepted') | otherwise -> do E.transactionSave -- commit transaction if there are no problems inferAcc (infAmbis <> accAmbiguous, infReds <> accRedundants, infAccs <> accAccepted) where accAccepted' = over (traversed . _1) StudyTermsKey' accAccepted inferStep = do ambiguous <- removeAmbiguousNames redundants <- removeRedundantNames accepted <- acceptSingletonNames problems <- nameConflicts unless (null problems) $ throwM $ FailedCandidateInference problems return (ambiguous, redundants, accepted) inferParentsHandler :: Handler ([Entity StudySubTermParentCandidate], [Entity StudySubTerms]) inferParentsHandler = runDB $ inferAcc mempty where inferAcc (infReds', infAccs') = do (infReds, infAccs) <- inferStep if | null infAccs -> return (infReds' <> infReds, infAccs') | otherwise -> inferAcc (infReds' <> infReds, infAccs' <> infAccs) inferStep = do redundants <- removeRedundantParents accepted <- acceptSingletonParents return (redundants, accepted) {- Candidate 1 11 "A" Candidate 1 11 "B" Candidate 1 12 "A" Candidate 1 12 "B" Candidate 2 12 "B" Candidate 2 12 "C" Candidate 2 13 "B" Candidate 2 13 "C" should readily yield 11/A, 12/B 13/C: it can infer due to overlab that 12/B must be true, then eliminating B identifies A and C; this rests on the assumption that the Names are unique, which is NOT TRUE; as a fix we simply eliminate all observations that have the same name twice, see removeInconsistent -} -- | remove candidates with ambiguous observations, -- ie. candidates that have duplicated term names with differing keys -- which may happen in rare cases removeAmbiguousNames :: DB [TermCandidateIncidence] removeAmbiguousNames = do ambiList <- E.select $ E.from $ \candidate -> do E.groupBy ( candidate E.^. StudyTermNameCandidateIncidence , candidate E.^. StudyTermNameCandidateKey , candidate E.^. StudyTermNameCandidateName ) E.having $ E.countRows E.!=. E.val (1 :: Int64) return $ candidate E.^. StudyTermNameCandidateIncidence let ambiSet = E.unValue <$> nubOrd ambiList -- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps. deleteWhere [StudyTermNameCandidateIncidence <-. ambiSet] return ambiSet -- | remove known StudyTerm from candidates that have the _exact_ name, -- ie. if a candidate contains a known key, we remove it and its associated fullname -- only save if ambiguous candidates haven been removed removeRedundantNames :: DB [Entity StudyTermNameCandidate] removeRedundantNames = do redundants <- E.select $ E.distinct $ E.from $ \(candidate `E.InnerJoin` sterm) -> do E.on $ candidate E.^. StudyTermNameCandidateKey E.==. sterm E.^. StudyTermsKey E.&&. ( E.just (candidate E.^. StudyTermNameCandidateName) E.==. sterm E.^. StudyTermsName E.||. E.exists (E.from $ \(subTerm `E.InnerJoin` sterm2) -> do E.on $ subTerm E.^. StudySubTermsParent E.==. sterm E.^. StudyTermsId E.&&. subTerm E.^. StudySubTermsChild E.==. sterm2 E.^. StudyTermsId E.where_ $ E.just (candidate E.^. StudyTermNameCandidateName) E.==. sterm2 E.^. StudyTermsName ) ) return candidate -- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps. forM_ redundants $ \Entity{entityVal=StudyTermNameCandidate{..}} -> deleteWhere $ ( StudyTermNameCandidateIncidence ==. studyTermNameCandidateIncidence ) : ([ StudyTermNameCandidateKey ==. studyTermNameCandidateKey ] ||. [ StudyTermNameCandidateName ==. studyTermNameCandidateName ]) return redundants removeRedundantParents :: DB [Entity StudySubTermParentCandidate] removeRedundantParents = do redundants <- E.select . E.distinct . E.from $ \(candidate `E.InnerJoin` subTerm) -> do E.on $ candidate E.^. StudySubTermParentCandidateKey E.==. E.veryUnsafeCoerceSqlExprValue (subTerm E.^. StudySubTermsChild) E.&&. candidate E.^. StudySubTermParentCandidateParent E.==. E.veryUnsafeCoerceSqlExprValue (subTerm E.^. StudySubTermsParent) return candidate forM_ redundants $ \(Entity _ StudySubTermParentCandidate{..}) -> E.delete . E.from $ \candidate -> E.where_ $ candidate E.^. StudySubTermParentCandidateIncidence E.==. E.val studySubTermParentCandidateIncidence E.&&. ( candidate E.^. StudySubTermParentCandidateParent `E.in_` E.valList [studySubTermParentCandidateParent, studySubTermParentCandidateKey] E.||. candidate E.^. StudySubTermParentCandidateKey `E.in_` E.valList [studySubTermParentCandidateParent, studySubTermParentCandidateKey] ) return redundants removeRedundantStandalone :: DB [Entity StudyTermStandaloneCandidate] removeRedundantStandalone = do redundants <- E.select . E.distinct . E.from $ \(candidate `E.InnerJoin` sterm) -> do E.on $ candidate E.^. StudyTermStandaloneCandidateKey E.==. sterm E.^. StudyTermsKey E.&&. E.not_ (E.isNothing $ sterm E.^. StudyTermsDefaultDegree) E.&&. E.not_ (E.isNothing $ sterm E.^. StudyTermsDefaultType) return candidate deleteWhere [ StudyTermStandaloneCandidateId <-. map entityKey redundants ] return redundants -- | Search for single candidates and memorize them as StudyTerms. -- Should be called after @removeRedundant@ to increase success chances and reduce cost; otherwise memory heavy! -- Does not delete the used candidates, user @removeRedundant@ for this later on. -- Esqueleto does not provide the INTERESECT operator, thus -- we load the table into Haskell and operate there. Memory usage problem? StudyTermsCandidate may become huge. acceptSingletonNames :: DB [(STKey,Text)] acceptSingletonNames = do knownKeys <- fmap unStudyTermsKey <$> selectKeysList [StudyTermsName !=. Nothing] [Asc StudyTermsKey] -- let knownKeysSet = Set.fromAscList knownKeys -- In case of memory problems, change next lines to conduit proper: incidences <- fmap entityVal <$> selectList [StudyTermNameCandidateKey /<-. knownKeys] [] -- LimitTo might be dangerous here, if we get a partial incidence. Possibly first select N incidences, then retrieving all those only. -- incidences <- E.select $ E.from $ \candidate -> do -- E.where_ $ candidate E.^. StudyTermCandidayeKey `E.notIn` E.valList knownKeys -- return candidate -- Possibly expensive pure computations follows. Break runDB to shorten transaction? let groupedCandidates :: Map STKey (Map UUID (Set Text)) groupedCandidates = foldl' groupFun mempty incidences -- given a key, map each incidence to set of possible names for this key groupFun :: Map STKey (Map TermCandidateIncidence (Set Text)) -> StudyTermNameCandidate -> Map STKey (Map TermCandidateIncidence (Set Text)) groupFun m StudyTermNameCandidate{..} = insertWith (Map.unionWith Set.union) studyTermNameCandidateKey (Map.singleton studyTermNameCandidateIncidence $ Set.singleton studyTermNameCandidateName) m -- pointwise intersection per incidence gives possible candidates per key keyCandidates :: Map STKey (Set Text) keyCandidates = Map.map (setIntersections . Map.elems) groupedCandidates -- filter candidates having a unique possibility left fixedKeys :: [(STKey,Text)] fixedKeys = fst $ Map.foldlWithKey' combFixed mempty keyCandidates where combFixed :: ([(STKey,Text)], Set STKey) -> STKey -> Set Text -> ([(STKey,Text)], Set STKey) combFixed (acc, bad) k s | Set.member k bad = (acc, bad) | maybe False (`Set.notMember` s) (lookup k acc) = (filter (\(k', _) -> k /= k') acc, Set.insert k bad) | [n] <- Set.elems s = ((k,n) : acc, bad) | otherwise = (acc, bad) -- registerFixed :: (STKey, Text) -> DB (Key StudyTerms) registerFixed :: (STKey, Text) -> DB () registerFixed (key, name) = repsert (StudyTermsKey' key) $ StudyTerms key Nothing (Just name) Nothing Nothing -- register newly fixed candidates forM_ fixedKeys registerFixed return fixedKeys acceptSingletonParents :: DB [Entity StudySubTerms] acceptSingletonParents = do candidates <- map entityVal <$> selectList [] [] let groupedCandidates :: Map STKey (Map UUID (Set STKey)) groupedCandidates = foldl' groupFun mempty candidates where groupFun :: Map STKey (Map UUID (Set STKey)) -> StudySubTermParentCandidate -> Map STKey (Map UUID (Set STKey)) groupFun m StudySubTermParentCandidate{..} = Map.insertWith (Map.unionWith Set.union) studySubTermParentCandidateKey (Map.singleton studySubTermParentCandidateIncidence $ Set.singleton studySubTermParentCandidateParent) m parentCandidates :: Map STKey (Set STKey) parentCandidates = Map.map (setIntersections . Map.elems) groupedCandidates fixedParents :: [(STKey, STKey)] fixedParents = fst $ Map.foldlWithKey' combFixed mempty parentCandidates where combFixed :: ([(STKey, STKey)], Set STKey) -> STKey -> Set STKey -> ([(STKey, STKey)], Set STKey) combFixed (acc, bad) k s | Set.member k bad = (acc, bad) | maybe False (`Set.notMember` s) (lookup k acc) = (filter (\(k', _) -> k /= k') acc, Set.insert k bad) | [p] <- Set.elems s = ((k, p) : acc, bad) | otherwise = (acc, bad) inserted <- forM fixedParents $ \(key, parent) -> do unlessM (existsKey $ StudyTermsKey' key) $ insert_ (StudyTerms key Nothing Nothing Nothing Nothing) unlessM (existsKey $ StudyTermsKey' parent) $ insert_ (StudyTerms parent Nothing Nothing Nothing Nothing) insertUnique $ StudySubTerms { studySubTermsChild = StudyTermsKey' key , studySubTermsParent = StudyTermsKey' parent } mapM getJustEntity $ catMaybes inserted -- | all existing StudyTerms that are contradiced by current observations nameConflicts :: DB [Entity StudyTerms] nameConflicts = E.select $ E.from $ \studyTerms -> do E.where_ $ E.not_ $ E.isNothing $ studyTerms E.^. StudyTermsName E.where_ $ E.exists $ E.from $ \candidateOne -> do E.where_ $ candidateOne E.^. StudyTermNameCandidateKey E.==. studyTerms E.^. StudyTermsKey E.where_ $ E.notExists . E.from $ \candidateTwo -> do E.where_ $ candidateTwo E.^. StudyTermNameCandidateIncidence E.==. candidateOne E.^. StudyTermNameCandidateIncidence E.where_ $ studyTerms E.^. StudyTermsName E.==. E.just (candidateTwo E.^. StudyTermNameCandidateName) E.||. E.exists ( E.from $ \(pCandidate `E.LeftOuterJoin` termsTwo) -> do E.on $ pCandidate E.^. StudySubTermParentCandidateParent E.==. studyTerms E.^. StudyTermsKey E.&&. E.just (pCandidate E.^. StudySubTermParentCandidateKey) E.==. termsTwo E.?. StudyTermsKey E.where_ $ E.joinV (termsTwo E.?. StudyTermsName) E.==. E.just (candidateTwo E.^. StudyTermNameCandidateName) E.||. E.isNothing (E.joinV $ termsTwo E.?. StudyTermsName) ) E.||. E.exists ( E.from $ \(subTerms `E.InnerJoin` termsTwo) -> do E.on $ subTerms E.^. StudySubTermsParent E.==. studyTerms E.^. StudyTermsId E.&&. subTerms E.^. StudySubTermsChild E.==. termsTwo E.^. StudyTermsId E.where_ $ termsTwo E.^. StudyTermsName E.==. E.just (candidateTwo E.^. StudyTermNameCandidateName) E.||. E.isNothing (termsTwo E.^. StudyTermsName) ) return studyTerms -- | retrieve all incidence keys having containing a certain @StudyTermKey @ getNameIncidencesFor :: [StudyTermsId] -> DB [E.Value TermCandidateIncidence] getNameIncidencesFor stks = E.select $ E.distinct $ E.from $ \candidate -> do E.where_ $ candidate E.^. StudyTermNameCandidateKey `E.in_` E.valList stks' return $ candidate E.^. StudyTermNameCandidateIncidence where stks' = stks <&> unStudyTermsKey