fradrive/src/Handler/Utils/TermCandidates.hs
2019-03-20 14:09:46 +01:00

185 lines
8.1 KiB
Haskell

module Handler.Utils.TermCandidates where
import Import
-- import Handler.Utils
-- Import this module as Candidates
-- import Utils.Lens
-- import Data.Time
-- import qualified Data.Text as T
-- import Data.Function ((&))
-- import Yesod.Form.Bootstrap3
-- import Colonnade hiding (fromMaybe)
-- import Yesod.Colonnade
-- import qualified Data.UUID.Cryptographic as UUID
-- import Control.Monad.Trans.Writer (mapWriterT)
-- import Database.Persist.Sql (fromSqlKey)
import Data.Set (Set)
import qualified Data.Set as Set
import qualified Data.List as List
import Data.Map (Map)
import qualified Data.Map as Map
import qualified Database.Esqueleto as E
-- import Database.Esqueleto.Utils as E
{-# ANN module ("HLint: ignore Use newtype instead of data"::String) #-}
type STKey = Int -- for convenience, assmued identical to field StudyTermCandidateKey
data FailedCandidateInference = FailedCandidateInference [Entity StudyTerms]
deriving (Typeable, Show)
instance Exception FailedCandidateInference
-- Default Instance
-- -- | Just an heuristik to fill in defaults
-- shortenStudyTerm :: Text -> Text
-- shortenStudyTerm = concatMap (take 4) . splitCamel
-- | Attempt to identify new StudyTerms based on observations, returning:
-- * list of ambiguous instances that were discarded outright (identical names for differents keys observed in single incidences)
-- * list of problems, ie. StudyTerms that contradict observed incidences
-- * list of redundants, i.e. redundant observed incidences
-- * list of accepted, i.e. newly accepted key/name pairs
inferHandler :: Handler ([Entity StudyTerms],[TermCandidateIncidence],[Entity StudyTermCandidate],[(STKey,Text)])
inferHandler = runDB $ inferAcc ([],[],[])
where
inferAcc (accAmbiguous, accRedundants, accAccepted) =
handle (\(FailedCandidateInference fails) -> (fails,accAmbiguous,accRedundants,accAccepted) <$ E.transactionUndo) $ do
(infAmbis, infReds,infAccs) <- inferStep
if null infAccs
then return ([], accAmbiguous, infReds ++ accRedundants, accAccepted)
else do
E.transactionSave -- commit transaction if there are no problems
inferAcc (infAmbis ++ accAmbiguous, infReds ++ accRedundants, infAccs ++ accAccepted)
inferStep = do
ambiguous <- removeAmbiguous
redundants <- removeRedundant
accepted <- acceptSingletons
problems <- conflicts
unless (null problems) $ throwM $ FailedCandidateInference problems
return (ambiguous, redundants, accepted)
{-
Candidate 1 11 "A"
Candidate 1 11 "B"
Candidate 1 12 "A"
Candidate 1 12 "B"
Candidate 2 12 "B"
Candidate 2 12 "C"
Candidate 2 13 "B"
Candidate 2 13 "C"
should readily yield 11/A, 12/B 13/C:
it can infer due to overlab that 12/B must be true, then eliminating B identifies A and C;
this rests on the assumption that the Names are unique, which is NOT TRUE;
as a fix we simply eliminate all observations that have the same name twice, see removeInconsistent
-}
-- | remove candidates with ambiguous observations,
-- ie. candidates that have duplicated term names with differing keys
-- which may happen in rare cases
removeAmbiguous :: DB [TermCandidateIncidence]
removeAmbiguous = do
ambiList <- E.select $ E.from $ \candidate -> do
E.groupBy ( candidate E.^. StudyTermCandidateIncidence
, candidate E.^. StudyTermCandidateKey
, candidate E.^. StudyTermCandidateName
)
E.having $ E.countRows E.!=. E.val (1 :: Int64)
return $ candidate E.^. StudyTermCandidateIncidence
let ambiSet = E.unValue <$> List.nub ambiList
-- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps.
deleteWhere [StudyTermCandidateIncidence <-. ambiSet]
return ambiSet
-- | remove known StudyTerm from candidates that have the _exact_ name,
-- ie. if a candidate contains a known key, we remove it and its associated fullname
-- only save if ambiguous candidates haven been removed
removeRedundant :: DB [Entity StudyTermCandidate]
removeRedundant = do
redundants <- E.select $ E.distinct $ E.from $ \(candidate `E.InnerJoin` sterm) -> do
E.on $ candidate E.^. StudyTermCandidateKey E.==. sterm E.^. StudyTermsKey
E.&&. E.just (candidate E.^. StudyTermCandidateName) E.==. sterm E.^. StudyTermsName
return candidate
-- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps.
forM_ redundants $ \Entity{entityVal=StudyTermCandidate{..}} ->
deleteWhere $ ( StudyTermCandidateIncidence ==. studyTermCandidateIncidence )
: ([ StudyTermCandidateKey ==. studyTermCandidateKey ]
||. [ StudyTermCandidateName ==. studyTermCandidateName ])
return redundants
-- | Search for single candidates and memorize them as StudyTerms.
-- Should be called after @removeRedundant@ to increase success chances and reduce cost; otherwise memory heavy!
-- Does not delete the used candidates, user @removeRedundant@ for this later on.
-- Esqueleto does not provide the INTERESECT operator, thus
-- we load the table into Haskell and operate there. Memory usage problem? StudyTermsCandidate may become huge.
acceptSingletons :: DB [(STKey,Text)]
acceptSingletons = do
knownKeys <- fmap unStudyTermsKey <$> selectKeysList [StudyTermsName !=. Nothing] [Asc StudyTermsKey]
-- let knownKeysSet = Set.fromAscList knownKeys
-- In case of memory problems, change next lines to conduit proper:
incidences <- fmap entityVal <$> selectList [StudyTermCandidateKey /<-. knownKeys] [] -- LimitTo might be dangerous here, if we get a partial incidence. Possibly first select N incidences, then retrieving all those only.
-- incidences <- E.select $ E.from $ \candidate -> do
-- E.where_ $ candidate E.^. StudyTermCandidayeKey `E.notIn` E.valList knownKeys
-- return candidate
-- Possibly expensive pure computations follows. Break runDB to shorten transaction?
let groupedCandidates :: Map STKey (Map UUID (Set Text))
groupedCandidates = foldl' groupFun mempty incidences
-- given a key, map each incidence to set of possible names for this key
groupFun :: Map STKey (Map TermCandidateIncidence (Set Text)) -> StudyTermCandidate -> Map STKey (Map TermCandidateIncidence (Set Text))
groupFun m StudyTermCandidate{..} =
insertWith (Map.unionWith Set.union)
studyTermCandidateKey
(Map.singleton studyTermCandidateIncidence $ Set.singleton studyTermCandidateName)
m
-- pointwise intersection per incidence gives possible candidates per key
keyCandidates :: Map STKey (Set Text)
keyCandidates = Map.map (setIntersections . Map.elems) groupedCandidates
-- filter candidates having a unique possibility left
fixedKeys :: [(STKey,Text)]
fixedKeys = Map.foldlWithKey' combFixed [] keyCandidates
combFixed :: [(STKey,Text)] -> STKey -> Set Text -> [(STKey,Text)]
combFixed acc k s | Set.size s == 1 -- possibly redundant
, [n] <- Set.elems s = (k,n):acc
-- empty sets should not occur here , if LDAP is consistent. Maybe raise a warning?!
| otherwise = acc
-- registerFixed :: (STKey, Text) -> DB (Key StudyTerms)
registerFixed :: (STKey, Text) -> DB ()
registerFixed (key, name) = repsert (StudyTermsKey' key) $ StudyTerms key Nothing (Just name)
-- register newly fixed candidates
forM_ fixedKeys registerFixed
return fixedKeys
-- | all existing StudyTerms that are contradiced by current observations
conflicts :: DB [Entity StudyTerms]
conflicts = E.select $ E.from $ \studyTerms -> do
E.where_ $ E.not_ $ E.isNothing $ studyTerms E.^. StudyTermsName
E.where_ $ E.exists $ E.from $ \candidateOne -> do
E.where_ $ candidateOne E.^. StudyTermCandidateKey E.==. studyTerms E.^. StudyTermsKey
E.where_ $ E.notExists . E.from $ \candidateTwo -> do
E.where_ $ candidateTwo E.^. StudyTermCandidateIncidence E.==. candidateOne E.^. StudyTermCandidateIncidence
E.where_ $ studyTerms E.^. StudyTermsName E.==. E.just (candidateTwo E.^. StudyTermCandidateName)
return studyTerms