297 lines
14 KiB
Haskell
297 lines
14 KiB
Haskell
module Handler.Utils.TermCandidates where
|
|
|
|
import Import
|
|
-- import Handler.Utils
|
|
|
|
|
|
-- Import this module as Candidates
|
|
|
|
-- import Utils.Lens
|
|
|
|
-- import Data.Time
|
|
-- import qualified Data.Text as T
|
|
-- import Data.Function ((&))
|
|
-- import Yesod.Form.Bootstrap3
|
|
-- import Colonnade hiding (fromMaybe)
|
|
-- import Yesod.Colonnade
|
|
-- import qualified Data.UUID.Cryptographic as UUID
|
|
-- import Control.Monad.Trans.Writer (mapWriterT)
|
|
-- import Database.Persist.Sql (fromSqlKey)
|
|
import qualified Data.Set as Set
|
|
import qualified Data.List as List
|
|
import qualified Data.Map as Map
|
|
|
|
|
|
import qualified Database.Esqueleto as E
|
|
import qualified Database.Esqueleto.Internal.Internal as E
|
|
|
|
{-# ANN module ("HLint: ignore Use newtype instead of data"::String) #-}
|
|
|
|
type STKey = Int -- for convenience, assmued identical to field StudyTermNameCandidateKey
|
|
|
|
data FailedCandidateInference = FailedCandidateInference [Entity StudyTerms]
|
|
deriving (Typeable, Show)
|
|
|
|
instance Exception FailedCandidateInference
|
|
-- Default Instance
|
|
|
|
-- -- | Just an heuristik to fill in defaults
|
|
-- shortenStudyTerm :: Text -> Text
|
|
-- shortenStudyTerm = concatMap (take 4) . splitCamel
|
|
|
|
-- | Attempt to identify new StudyTerms based on observations, returning:
|
|
-- * list of ambiguous instances that were discarded outright (identical names for differents keys observed in single incidences)
|
|
-- * list of problems, ie. StudyTerms that contradict observed incidences
|
|
-- * list of redundants, i.e. redundant observed incidences
|
|
-- * list of accepted, i.e. newly accepted key/name pairs
|
|
inferNamesHandler :: Handler ([Entity StudyTerms],[TermCandidateIncidence],[Entity StudyTermNameCandidate],[(StudyTermsId,Text)])
|
|
inferNamesHandler = runDB $ inferAcc mempty
|
|
where
|
|
inferAcc (accAmbiguous, accRedundants, accAccepted) =
|
|
handle (\(FailedCandidateInference fails) -> (fails, accAmbiguous, accRedundants, accAccepted') <$ E.transactionUndo) $ do
|
|
(infAmbis, infReds, infAccs) <- inferStep
|
|
if
|
|
| null infAccs ->
|
|
return ([], accAmbiguous, infReds <> accRedundants, accAccepted')
|
|
| otherwise -> do
|
|
E.transactionSave -- commit transaction if there are no problems
|
|
inferAcc (infAmbis <> accAmbiguous, infReds <> accRedundants, infAccs <> accAccepted)
|
|
where
|
|
accAccepted' = over (traversed . _1) StudyTermsKey' accAccepted
|
|
|
|
inferStep = do
|
|
ambiguous <- removeAmbiguousNames
|
|
redundants <- removeRedundantNames
|
|
accepted <- acceptSingletonNames
|
|
problems <- nameConflicts
|
|
unless (null problems) $ throwM $ FailedCandidateInference problems
|
|
return (ambiguous, redundants, accepted)
|
|
|
|
inferParentsHandler :: Handler ([Entity StudySubTermParentCandidate], [Entity StudySubTerms])
|
|
inferParentsHandler = runDB $ inferAcc mempty
|
|
where
|
|
inferAcc (infReds', infAccs') = do
|
|
(infReds, infAccs) <- inferStep
|
|
if
|
|
| null infAccs ->
|
|
return (infReds' <> infReds, infAccs')
|
|
| otherwise ->
|
|
inferAcc (infReds' <> infReds, infAccs' <> infAccs)
|
|
inferStep = do
|
|
redundants <- removeRedundantParents
|
|
accepted <- acceptSingletonParents
|
|
return (redundants, accepted)
|
|
|
|
{-
|
|
Candidate 1 11 "A"
|
|
Candidate 1 11 "B"
|
|
Candidate 1 12 "A"
|
|
Candidate 1 12 "B"
|
|
Candidate 2 12 "B"
|
|
Candidate 2 12 "C"
|
|
Candidate 2 13 "B"
|
|
Candidate 2 13 "C"
|
|
|
|
should readily yield 11/A, 12/B 13/C:
|
|
|
|
it can infer due to overlab that 12/B must be true, then eliminating B identifies A and C;
|
|
this rests on the assumption that the Names are unique, which is NOT TRUE;
|
|
as a fix we simply eliminate all observations that have the same name twice, see removeInconsistent
|
|
|
|
-}
|
|
|
|
-- | remove candidates with ambiguous observations,
|
|
-- ie. candidates that have duplicated term names with differing keys
|
|
-- which may happen in rare cases
|
|
removeAmbiguousNames :: DB [TermCandidateIncidence]
|
|
removeAmbiguousNames = do
|
|
ambiList <- E.select $ E.from $ \candidate -> do
|
|
E.groupBy ( candidate E.^. StudyTermNameCandidateIncidence
|
|
, candidate E.^. StudyTermNameCandidateKey
|
|
, candidate E.^. StudyTermNameCandidateName
|
|
)
|
|
E.having $ E.countRows E.!=. E.val (1 :: Int64)
|
|
return $ candidate E.^. StudyTermNameCandidateIncidence
|
|
let ambiSet = E.unValue <$> List.nub ambiList
|
|
-- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps.
|
|
deleteWhere [StudyTermNameCandidateIncidence <-. ambiSet]
|
|
return ambiSet
|
|
|
|
|
|
-- | remove known StudyTerm from candidates that have the _exact_ name,
|
|
-- ie. if a candidate contains a known key, we remove it and its associated fullname
|
|
-- only save if ambiguous candidates haven been removed
|
|
removeRedundantNames :: DB [Entity StudyTermNameCandidate]
|
|
removeRedundantNames = do
|
|
redundants <- E.select $ E.distinct $ E.from $ \(candidate `E.InnerJoin` sterm) -> do
|
|
E.on $ candidate E.^. StudyTermNameCandidateKey E.==. sterm E.^. StudyTermsKey
|
|
E.&&. ( E.just (candidate E.^. StudyTermNameCandidateName) E.==. sterm E.^. StudyTermsName
|
|
E.||. E.exists (E.from $ \(subTerm `E.InnerJoin` sterm2) -> do
|
|
E.on $ subTerm E.^. StudySubTermsParent E.==. sterm E.^. StudyTermsId
|
|
E.&&. subTerm E.^. StudySubTermsChild E.==. sterm2 E.^. StudyTermsId
|
|
E.where_ $ E.just (candidate E.^. StudyTermNameCandidateName) E.==. sterm2 E.^. StudyTermsName
|
|
)
|
|
)
|
|
return candidate
|
|
-- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps.
|
|
forM_ redundants $ \Entity{entityVal=StudyTermNameCandidate{..}} ->
|
|
deleteWhere $ ( StudyTermNameCandidateIncidence ==. studyTermNameCandidateIncidence )
|
|
: ([ StudyTermNameCandidateKey ==. studyTermNameCandidateKey ]
|
|
||. [ StudyTermNameCandidateName ==. studyTermNameCandidateName ])
|
|
return redundants
|
|
|
|
removeRedundantParents :: DB [Entity StudySubTermParentCandidate]
|
|
removeRedundantParents = do
|
|
redundants <- E.select . E.distinct . E.from $ \(candidate `E.InnerJoin` subTerm) -> do
|
|
E.on $ candidate E.^. StudySubTermParentCandidateKey E.==. E.veryUnsafeCoerceSqlExprValue (subTerm E.^. StudySubTermsChild)
|
|
E.&&. candidate E.^. StudySubTermParentCandidateParent E.==. E.veryUnsafeCoerceSqlExprValue (subTerm E.^. StudySubTermsParent)
|
|
return candidate
|
|
forM_ redundants $ \(Entity _ StudySubTermParentCandidate{..}) ->
|
|
E.delete . E.from $ \candidate ->
|
|
E.where_ $ candidate E.^. StudySubTermParentCandidateIncidence E.==. E.val studySubTermParentCandidateIncidence
|
|
E.&&. ( candidate E.^. StudySubTermParentCandidateParent `E.in_` E.valList [studySubTermParentCandidateParent, studySubTermParentCandidateKey]
|
|
E.||. candidate E.^. StudySubTermParentCandidateKey `E.in_` E.valList [studySubTermParentCandidateParent, studySubTermParentCandidateKey]
|
|
)
|
|
return redundants
|
|
|
|
removeRedundantStandalone :: DB [Entity StudyTermStandaloneCandidate]
|
|
removeRedundantStandalone = do
|
|
redundants <- E.select . E.distinct . E.from $ \(candidate `E.InnerJoin` sterm) -> do
|
|
E.on $ candidate E.^. StudyTermStandaloneCandidateKey E.==. sterm E.^. StudyTermsKey
|
|
E.&&. E.not_ (E.isNothing $ sterm E.^. StudyTermsDefaultDegree)
|
|
E.&&. E.not_ (E.isNothing $ sterm E.^. StudyTermsDefaultType)
|
|
return candidate
|
|
deleteWhere [ StudyTermStandaloneCandidateId <-. map entityKey redundants ]
|
|
return redundants
|
|
|
|
|
|
-- | Search for single candidates and memorize them as StudyTerms.
|
|
-- Should be called after @removeRedundant@ to increase success chances and reduce cost; otherwise memory heavy!
|
|
-- Does not delete the used candidates, user @removeRedundant@ for this later on.
|
|
-- Esqueleto does not provide the INTERESECT operator, thus
|
|
-- we load the table into Haskell and operate there. Memory usage problem? StudyTermsCandidate may become huge.
|
|
acceptSingletonNames :: DB [(STKey,Text)]
|
|
acceptSingletonNames = do
|
|
knownKeys <- fmap unStudyTermsKey <$> selectKeysList [StudyTermsName !=. Nothing] [Asc StudyTermsKey]
|
|
-- let knownKeysSet = Set.fromAscList knownKeys
|
|
-- In case of memory problems, change next lines to conduit proper:
|
|
incidences <- fmap entityVal <$> selectList [StudyTermNameCandidateKey /<-. knownKeys] [] -- LimitTo might be dangerous here, if we get a partial incidence. Possibly first select N incidences, then retrieving all those only.
|
|
-- incidences <- E.select $ E.from $ \candidate -> do
|
|
-- E.where_ $ candidate E.^. StudyTermCandidayeKey `E.notIn` E.valList knownKeys
|
|
-- return candidate
|
|
|
|
-- Possibly expensive pure computations follows. Break runDB to shorten transaction?
|
|
let groupedCandidates :: Map STKey (Map UUID (Set Text))
|
|
groupedCandidates = foldl' groupFun mempty incidences
|
|
|
|
-- given a key, map each incidence to set of possible names for this key
|
|
groupFun :: Map STKey (Map TermCandidateIncidence (Set Text)) -> StudyTermNameCandidate -> Map STKey (Map TermCandidateIncidence (Set Text))
|
|
groupFun m StudyTermNameCandidate{..} =
|
|
insertWith (Map.unionWith Set.union)
|
|
studyTermNameCandidateKey
|
|
(Map.singleton studyTermNameCandidateIncidence $ Set.singleton studyTermNameCandidateName)
|
|
m
|
|
|
|
-- pointwise intersection per incidence gives possible candidates per key
|
|
keyCandidates :: Map STKey (Set Text)
|
|
keyCandidates = Map.map (setIntersections . Map.elems) groupedCandidates
|
|
|
|
-- filter candidates having a unique possibility left
|
|
fixedKeys :: [(STKey,Text)]
|
|
fixedKeys = fst $ Map.foldlWithKey' combFixed mempty keyCandidates
|
|
where
|
|
combFixed :: ([(STKey,Text)], Set STKey) -> STKey -> Set Text -> ([(STKey,Text)], Set STKey)
|
|
combFixed (acc, bad) k s
|
|
| Set.member k bad
|
|
= (acc, bad)
|
|
| maybe False (`Set.notMember` s) (lookup k acc)
|
|
= (filter (\(k', _) -> k /= k') acc, Set.insert k bad)
|
|
| [n] <- Set.elems s
|
|
= ((k,n) : acc, bad)
|
|
| otherwise = (acc, bad)
|
|
|
|
-- registerFixed :: (STKey, Text) -> DB (Key StudyTerms)
|
|
registerFixed :: (STKey, Text) -> DB ()
|
|
registerFixed (key, name) =
|
|
repsert (StudyTermsKey' key) $ StudyTerms key Nothing (Just name) Nothing Nothing
|
|
|
|
-- register newly fixed candidates
|
|
forM_ fixedKeys registerFixed
|
|
return fixedKeys
|
|
|
|
acceptSingletonParents :: DB [Entity StudySubTerms]
|
|
acceptSingletonParents = do
|
|
candidates <- map entityVal <$> selectList [] []
|
|
|
|
let
|
|
groupedCandidates :: Map STKey (Map UUID (Set STKey))
|
|
groupedCandidates = foldl' groupFun mempty candidates
|
|
where
|
|
groupFun :: Map STKey (Map UUID (Set STKey)) -> StudySubTermParentCandidate -> Map STKey (Map UUID (Set STKey))
|
|
groupFun m StudySubTermParentCandidate{..} =
|
|
Map.insertWith (Map.unionWith Set.union)
|
|
studySubTermParentCandidateKey
|
|
(Map.singleton studySubTermParentCandidateIncidence $ Set.singleton studySubTermParentCandidateParent)
|
|
m
|
|
|
|
parentCandidates :: Map STKey (Set STKey)
|
|
parentCandidates = Map.map (setIntersections . Map.elems) groupedCandidates
|
|
|
|
fixedParents :: [(STKey, STKey)]
|
|
fixedParents = fst $ Map.foldlWithKey' combFixed mempty parentCandidates
|
|
where
|
|
combFixed :: ([(STKey, STKey)], Set STKey) -> STKey -> Set STKey -> ([(STKey, STKey)], Set STKey)
|
|
combFixed (acc, bad) k s
|
|
| Set.member k bad
|
|
= (acc, bad)
|
|
| maybe False (`Set.notMember` s) (lookup k acc)
|
|
= (filter (\(k', _) -> k /= k') acc, Set.insert k bad)
|
|
| [p] <- Set.elems s
|
|
= ((k, p) : acc, bad)
|
|
| otherwise = (acc, bad)
|
|
|
|
inserted <- forM fixedParents $ \(key, parent) -> do
|
|
unlessM (existsKey $ StudyTermsKey' key) $
|
|
insert_ (StudyTerms key Nothing Nothing Nothing Nothing)
|
|
unlessM (existsKey $ StudyTermsKey' parent) $
|
|
insert_ (StudyTerms parent Nothing Nothing Nothing Nothing)
|
|
insertUnique $ StudySubTerms
|
|
{ studySubTermsChild = StudyTermsKey' key
|
|
, studySubTermsParent = StudyTermsKey' parent
|
|
}
|
|
|
|
mapM getJustEntity $ catMaybes inserted
|
|
|
|
|
|
-- | all existing StudyTerms that are contradiced by current observations
|
|
nameConflicts :: DB [Entity StudyTerms]
|
|
nameConflicts = E.select $ E.from $ \studyTerms -> do
|
|
E.where_ $ E.not_ $ E.isNothing $ studyTerms E.^. StudyTermsName
|
|
E.where_ $ E.exists $ E.from $ \candidateOne -> do
|
|
E.where_ $ candidateOne E.^. StudyTermNameCandidateKey E.==. studyTerms E.^. StudyTermsKey
|
|
E.where_ $ E.notExists . E.from $ \candidateTwo -> do
|
|
E.where_ $ candidateTwo E.^. StudyTermNameCandidateIncidence E.==. candidateOne E.^. StudyTermNameCandidateIncidence
|
|
E.where_ $ studyTerms E.^. StudyTermsName E.==. E.just (candidateTwo E.^. StudyTermNameCandidateName)
|
|
E.||. E.exists ( E.from $ \(pCandidate `E.LeftOuterJoin` termsTwo) -> do
|
|
E.on $ pCandidate E.^. StudySubTermParentCandidateParent E.==. studyTerms E.^. StudyTermsKey
|
|
E.&&. E.just (pCandidate E.^. StudySubTermParentCandidateKey) E.==. termsTwo E.?. StudyTermsKey
|
|
E.where_ $ E.joinV (termsTwo E.?. StudyTermsName) E.==. E.just (candidateTwo E.^. StudyTermNameCandidateName)
|
|
E.||. E.isNothing (E.joinV $ termsTwo E.?. StudyTermsName)
|
|
)
|
|
E.||. E.exists ( E.from $ \(subTerms `E.InnerJoin` termsTwo) -> do
|
|
E.on $ subTerms E.^. StudySubTermsParent E.==. studyTerms E.^. StudyTermsId
|
|
E.&&. subTerms E.^. StudySubTermsChild E.==. termsTwo E.^. StudyTermsId
|
|
E.where_ $ termsTwo E.^. StudyTermsName E.==. E.just (candidateTwo E.^. StudyTermNameCandidateName)
|
|
E.||. E.isNothing (termsTwo E.^. StudyTermsName)
|
|
)
|
|
return studyTerms
|
|
|
|
|
|
-- | retrieve all incidence keys having containing a certain @StudyTermKey @
|
|
getNameIncidencesFor :: [StudyTermsId] -> DB [E.Value TermCandidateIncidence]
|
|
getNameIncidencesFor stks = E.select $ E.distinct $ E.from $ \candidate -> do
|
|
E.where_ $ candidate E.^. StudyTermNameCandidateKey `E.in_` E.valList stks'
|
|
return $ candidate E.^. StudyTermNameCandidateIncidence
|
|
where
|
|
stks' = stks <&> unStudyTermsKey
|