module Handler.Utils.TermCandidates where import Import -- import Handler.Utils -- Import this module as Candidates -- import Utils.Lens -- import Data.Time -- import qualified Data.Text as T -- import Data.Function ((&)) -- import Yesod.Form.Bootstrap3 -- import Colonnade hiding (fromMaybe) -- import Yesod.Colonnade -- import qualified Data.UUID.Cryptographic as UUID -- import Control.Monad.Trans.Writer (mapWriterT) -- import Database.Persist.Sql (fromSqlKey) import Data.Set (Set) import qualified Data.Set as Set import qualified Data.List as List import Data.Map (Map) import qualified Data.Map as Map import qualified Database.Esqueleto as E -- import Database.Esqueleto.Utils as E {-# ANN module ("HLint: ignore Use newtype instead of data"::String) #-} type STKey = Int -- for convenience, assmued identical to field StudyTermCandidateKey data FailedCandidateInference = FailedCandidateInference [Entity StudyTerms] deriving (Typeable, Show) instance Exception FailedCandidateInference -- Default Instance -- -- | Just an heuristik to fill in defaults -- shortenStudyTerm :: Text -> Text -- shortenStudyTerm = concatMap (take 4) . splitCamel -- | Attempt to identify new StudyTerms based on observations, returning: -- * list of ambiguous instances that were discarded outright (identical names for differents keys observed in single incidences) -- * list of problems, ie. StudyTerms that contradict observed incidences -- * list of redundants, i.e. redundant observed incidences -- * list of accepted, i.e. newly accepted key/name pairs inferHandler :: Handler ([Entity StudyTerms],[TermCandidateIncidence],[Entity StudyTermCandidate],[(STKey,Text)]) inferHandler = runDB $ inferAcc ([],[],[]) where inferAcc (accAmbiguous, accRedundants, accAccepted) = handle (\(FailedCandidateInference fails) -> (fails,accAmbiguous,accRedundants,accAccepted) <$ E.transactionUndo) $ do (infAmbis, infReds,infAccs) <- inferStep if null infAccs then return ([], accAmbiguous, infReds ++ accRedundants, accAccepted) else do E.transactionSave -- commit transaction if there are no problems inferAcc (infAmbis ++ accAmbiguous, infReds ++ accRedundants, infAccs ++ accAccepted) inferStep = do ambiguous <- removeAmbiguous redundants <- removeRedundant accepted <- acceptSingletons problems <- conflicts unless (null problems) $ throwM $ FailedCandidateInference problems return (ambiguous, redundants, accepted) {- Candidate 1 11 "A" Candidate 1 11 "B" Candidate 1 12 "A" Candidate 1 12 "B" Candidate 2 12 "B" Candidate 2 12 "C" Candidate 2 13 "B" Candidate 2 13 "C" should readily yield 11/A, 12/B 13/C: it can infer due to overlab that 12/B must be true, then eliminating B identifies A and C; this rests on the assumption that the Names are unique, which is NOT TRUE; as a fix we simply eliminate all observations that have the same name twice, see removeInconsistent -} -- | remove candidates with ambiguous observations, -- ie. candidates that have duplicated term names with differing keys -- which may happen in rare cases removeAmbiguous :: DB [TermCandidateIncidence] removeAmbiguous = do ambiList <- E.select $ E.from $ \candidate -> do E.groupBy ( candidate E.^. StudyTermCandidateIncidence , candidate E.^. StudyTermCandidateKey , candidate E.^. StudyTermCandidateName ) E.having $ E.countRows E.!=. E.val (1 :: Int64) return $ candidate E.^. StudyTermCandidateIncidence let ambiSet = E.unValue <$> List.nub ambiList -- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps. deleteWhere [StudyTermCandidateIncidence <-. ambiSet] return ambiSet -- | remove known StudyTerm from candidates that have the _exact_ name, -- ie. if a candidate contains a known key, we remove it and its associated fullname -- only save if ambiguous candidates haven been removed removeRedundant :: DB [Entity StudyTermCandidate] removeRedundant = do redundants <- E.select $ E.distinct $ E.from $ \(candidate `E.InnerJoin` sterm) -> do E.on $ candidate E.^. StudyTermCandidateKey E.==. sterm E.^. StudyTermsKey E.&&. E.just (candidate E.^. StudyTermCandidateName) E.==. sterm E.^. StudyTermsName return candidate -- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps. forM_ redundants $ \Entity{entityVal=StudyTermCandidate{..}} -> deleteWhere $ ( StudyTermCandidateIncidence ==. studyTermCandidateIncidence ) : ([ StudyTermCandidateKey ==. studyTermCandidateKey ] ||. [ StudyTermCandidateName ==. studyTermCandidateName ]) return redundants -- | Search for single candidates and memorize them as StudyTerms. -- Should be called after @removeRedundant@ to increase success chances and reduce cost; otherwise memory heavy! -- Does not delete the used candidates, user @removeRedundant@ for this later on. -- Esqueleto does not provide the INTERESECT operator, thus -- we load the table into Haskell and operate there. Memory usage problem? StudyTermsCandidate may become huge. acceptSingletons :: DB [(STKey,Text)] acceptSingletons = do knownKeys <- fmap unStudyTermsKey <$> selectKeysList [StudyTermsName !=. Nothing] [Asc StudyTermsKey] -- let knownKeysSet = Set.fromAscList knownKeys -- In case of memory problems, change next lines to conduit proper: incidences <- fmap entityVal <$> selectList [StudyTermCandidateKey /<-. knownKeys] [] -- LimitTo might be dangerous here, if we get a partial incidence. Possibly first select N incidences, then retrieving all those only. -- incidences <- E.select $ E.from $ \candidate -> do -- E.where_ $ candidate E.^. StudyTermCandidayeKey `E.notIn` E.valList knownKeys -- return candidate -- Possibly expensive pure computations follows. Break runDB to shorten transaction? let groupedCandidates :: Map STKey (Map UUID (Set Text)) groupedCandidates = foldl' groupFun mempty incidences -- given a key, map each incidence to set of possible names for this key groupFun :: Map STKey (Map TermCandidateIncidence (Set Text)) -> StudyTermCandidate -> Map STKey (Map TermCandidateIncidence (Set Text)) groupFun m StudyTermCandidate{..} = insertWith (Map.unionWith Set.union) studyTermCandidateKey (Map.singleton studyTermCandidateIncidence $ Set.singleton studyTermCandidateName) m -- pointwise intersection per incidence gives possible candidates per key keyCandidates :: Map STKey (Set Text) keyCandidates = Map.map (setIntersections . Map.elems) groupedCandidates -- filter candidates having a unique possibility left fixedKeys :: [(STKey,Text)] fixedKeys = Map.foldlWithKey' combFixed [] keyCandidates combFixed :: [(STKey,Text)] -> STKey -> Set Text -> [(STKey,Text)] combFixed acc k s | Set.size s == 1 -- possibly redundant , [n] <- Set.elems s = (k,n):acc -- empty sets should not occur here , if LDAP is consistent. Maybe raise a warning?! | otherwise = acc -- registerFixed :: (STKey, Text) -> DB (Key StudyTerms) registerFixed :: (STKey, Text) -> DB () registerFixed (key, name) = repsert (StudyTermsKey' key) $ StudyTerms key Nothing (Just name) -- register newly fixed candidates forM_ fixedKeys registerFixed return fixedKeys -- | all existing StudyTerms that are contradiced by current observations conflicts :: DB [Entity StudyTerms] conflicts = E.select $ E.from $ \studyTerms -> do E.where_ $ E.not_ $ E.isNothing $ studyTerms E.^. StudyTermsName E.where_ $ E.exists $ E.from $ \candidateOne -> do E.where_ $ candidateOne E.^. StudyTermCandidateKey E.==. studyTerms E.^. StudyTermsKey E.where_ $ E.notExists . E.from $ \candidateTwo -> do E.where_ $ candidateTwo E.^. StudyTermCandidateIncidence E.==. candidateOne E.^. StudyTermCandidateIncidence E.where_ $ studyTerms E.^. StudyTermsName E.==. E.just (candidateTwo E.^. StudyTermCandidateName) return studyTerms -- | retrieve all incidence keys having containing a certain @StudyTermKey @ getIncidencesFor :: [Key StudyTerms] -> DB [E.Value TermCandidateIncidence] getIncidencesFor stks = E.select $ E.distinct $ E.from $ \candidate -> do E.where_ $ candidate E.^. StudyTermCandidateKey `E.in_` E.valList (unStudyTermsKey <$> stks) return $ candidate E.^. StudyTermCandidateIncidence