From 9b944d70b88c24c1b2700553ba7a607abdd9f031 Mon Sep 17 00:00:00 2001 From: Steffen Jost Date: Mon, 18 Mar 2019 18:57:36 +0100 Subject: [PATCH] StudyTermCandidate inference implemented needs tests --- models/users | 1 + src/Handler/Course.hs | 2 +- src/Handler/Utils/TermCandidates.hs | 202 ++++++++++++++++++++++++++++ src/Utils.hs | 11 ++ 4 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 src/Handler/Utils/TermCandidates.hs diff --git a/models/users b/models/users index 9b9df02ff..adc672252 100644 --- a/models/users +++ b/models/users @@ -54,6 +54,7 @@ StudyTerms -- Studiengang shorthand Text Maybe -- admin determined shorthand name Text Maybe -- description given by LDAP Primary key -- column key is used as actual DB row key + -- newtype Key StudyTerms = StudyTermsKey { unStudyTermsKey :: Int } StudyTermCandidate -- No one at LMU is willing and able to tell us the meaning of the keys for StudyDegrees and StudyTerms. -- Each LDAP login provides an unordered set of keys and an unordered set of plain text description with an unknown 1-1 correspondence. -- This table helps us to infer which key belongs to which plain text by recording possible combinations at login. diff --git a/src/Handler/Course.hs b/src/Handler/Course.hs index 1175bf6e0..fd49acdce 100644 --- a/src/Handler/Course.hs +++ b/src/Handler/Course.hs @@ -803,7 +803,7 @@ getCUsersR tid ssh csh = do , colUserEmail , colUserMatriclenr , colUserDegreeShort - , colUserFieldShort + , colUserField , colUserSemester , sortable (Just "course-registration") (i18nCell MsgRegisteredHeader) (dateCell . view _userTableRegistration) , colUserComment tid ssh csh diff --git a/src/Handler/Utils/TermCandidates.hs b/src/Handler/Utils/TermCandidates.hs new file mode 100644 index 000000000..8997969c8 --- /dev/null +++ b/src/Handler/Utils/TermCandidates.hs @@ -0,0 +1,202 @@ +module Handler.Utils.TermCandidates where + +import Import +-- import Handler.Utils + + +-- Import this module as Candidates + +-- import Utils.Lens + +-- import Data.Time +-- import qualified Data.Text as T +-- import Data.Function ((&)) +-- import Yesod.Form.Bootstrap3 +-- import Colonnade hiding (fromMaybe) +-- import Yesod.Colonnade +-- import qualified Data.UUID.Cryptographic as UUID +-- import Control.Monad.Trans.Writer (mapWriterT) +-- import Database.Persist.Sql (fromSqlKey) +import Data.Set (Set) +import qualified Data.Set as Set +import qualified Data.List as List +import Data.Map (Map) +import qualified Data.Map as Map + + +import qualified Database.Esqueleto as E +-- import Database.Esqueleto.Utils as E + + +type STKey = Int -- Key StudyTerms -- for convenience, assmued identical to field StudyTermCandidateKey + +-- | Just an heuristik to fill in defaults +shortenStudyTerm :: Text -> Text +shortenStudyTerm = concatMap (take 4) . splitCamel + +-- | Attempt to identify new StudyTerms based on observations +-- infer :: MonadHandler m => m ([Entity StudyTerms],[Entity StudyTerms]) +infer :: DB ([Entity StudyTerms],[(STKey, Text)]) +infer = do + void removeAmbiguous -- TODO: show result + inferAcc [] + where + inferAcc prevSet = do + problems <- conflicts + if null problems + then do + void removeRedundant -- TODO: show result + newSet <- acceptSingletons + if null newSet + then -- inference complete + return ([],prevSet) + else + inferAcc (newSet ++ prevSet) + else --abort + return (problems,prevSet) + + +{- +Candidate 1 11 "A" +Candidate 1 11 "B" +Candidate 1 12 "A" +Candidate 1 12 "B" +Candidate 2 12 "B" +Candidate 2 12 "C" +Candidate 2 13 "B" +Candidate 2 13 "C" + +should readily yield 11/A, 12/B 13/C: + +it can infer due to overlab that 12/B must be true, then eliminating B identifies A and C; +this rests on the assumption that the Names are unique, which is NOT TRUE; +as a fix we simply eliminate all observations that have the same name twice, see removeInconsistent + +-} + +-- | remove candidates with ambiguous observations, +-- ie. candidates that have duplicated term names with differing keys +-- which may happen in rare cases +removeAmbiguous :: DB [UUID] +removeAmbiguous = do + ambiList <- E.select $ E.from $ \(candA `E.InnerJoin` candB) -> do + -- Either an innerJoin with itself or an exists-sub-select + E.on $ (candA E.^. StudyTermCandidateIncidence E.==. candB E.^. StudyTermCandidateIncidence) + E.&&. (candA E.^. StudyTermCandidateKey E.!=. candB E.^. StudyTermCandidateKey) + E.&&. (candA E.^. StudyTermCandidateName E.==. candB E.^. StudyTermCandidateName) + E.&&. (candA E.^. StudyTermCandidateId E.!=. candB E.^. StudyTermCandidateId) -- should not be needed, but does not hurt either + return $ candA E.^. StudyTermCandidateIncidence + let ambiSet = E.unValue <$> List.nub ambiList + -- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps. + deleteWhere [StudyTermCandidateIncidence <-. ambiSet] + return ambiSet + + +-- | remove known StudyTerm from candidates that have the _exact_ name, +-- ie. if a candidate contains a known key, we remove it and its associated fullname +-- only save if ambiguous candidates haven been removed +removeRedundant :: DB [Entity StudyTermCandidate] +removeRedundant = do + redundants <- E.select $ E.distinct $ E.from $ \(candidate `E.InnerJoin` sterm) -> do + E.on $ candidate E.^. StudyTermCandidateKey E.==. sterm E.^. StudyTermsKey + E.&&. E.just (candidate E.^. StudyTermCandidateName) E.==. sterm E.^. StudyTermsName + return candidate + -- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps. + forM_ redundants $ \Entity{entityVal=StudyTermCandidate{..}} -> + deleteWhere $ ( StudyTermCandidateIncidence ==. studyTermCandidateIncidence ) + : ([ StudyTermCandidateKey ==. studyTermCandidateKey ] + ||. [ StudyTermCandidateName ==. studyTermCandidateName ]) + return redundants + + +-- | Search for single candidates and memorize them as StudyTerms. +-- Should be called after @removeRedundant@ to increase success chances and reduce cost; otherwise memory heavy! +-- Does not delete the used candidates, user @removeRedundant@ for this later on. +-- Esqueleto does not provide the INTERESECT operator, thus +-- we load the table into Haskell and operate there. Memory usage problem? StudyTermsCandidate may become huge. +acceptSingletons :: DB [(STKey,Text)] +acceptSingletons = do + knownKeys <- fmap unStudyTermsKey <$> selectKeysList [] [Asc StudyTermsKey] + -- let knownKeysSet = Set.fromAscList knownKeys + -- In case of memory problems, change next lines to conduit proper: + incidences <- fmap entityVal <$> selectList [StudyTermCandidateKey /<-. knownKeys] [] -- LimitTo might be dangerous here, if we get a partial incidence. Possibly first select N incidences, then retrieving all those only. + -- incidences <- E.select $ E.from $ \candidate -> do + -- E.where_ $ candidate E.^. StudyTermCandidayeKey `E.notIn` E.valList knownKeys + -- return candidate + + -- Possibly expensive pure computations follows. Break runDB to shorten transaction? + let groupedCandidates :: Map STKey (Map UUID (Set Text)) + groupedCandidates = foldl' groupFun mempty incidences + + -- given a key, map each incidence to set of possible names for this key + groupFun :: Map STKey (Map UUID (Set Text)) -> StudyTermCandidate -> Map STKey (Map UUID (Set Text)) + groupFun m StudyTermCandidate{..} = + insertWith (Map.unionWith Set.union) + studyTermCandidateKey + (Map.singleton studyTermCandidateIncidence $ Set.singleton studyTermCandidateName) + m + + -- pointwise intersection per incidence gives possible candidates per key + keyCandidates :: Map STKey (Set Text) + keyCandidates = Map.map (setIntersections . Map.elems) groupedCandidates + + -- filter candidates having a unique possibility left + fixedKeys :: [(STKey,Text)] + fixedKeys = Map.foldlWithKey' combFixed [] keyCandidates + + combFixed :: [(STKey,Text)] -> STKey -> Set Text -> [(STKey,Text)] + combFixed acc k s | Set.size s == 1 -- possibly redundant + , [n] <- Set.elems s = (k,n):acc + -- empty sets should not occur here , if LDAP is consistent. Maybe raise a warning?! + | otherwise = acc + + -- registerFixed :: (STKey, Text) -> DB (Key StudyTerms) + registerFixed :: (STKey, Text) -> DB () + registerFixed (key, name) = + -- insertKey (StudyTermsKey key) $ StudyTerms key (Just $ shortenStudyTerm name) (Just name) -- name clash! + void . insert $ StudyTerms key (Just $ shortenStudyTerm name) (Just name) + + + -- register newly fixed candidates + forM_ fixedKeys registerFixed + return fixedKeys + + + -- SOME EARLIER ATTEMPTS FOLLOW: + -- + -- unknownKeys <- E.select $ E.distinct $ E.from $ \candidate -> do + -- E.where_ $ E.notExists $ E.from $ \sterm -> + -- E.where_ $ candidate E.^. StudyTermCandidateKey E.==. sterm E.^. StudyTermKey + -- return $ candidate E.^. StudyTermCandidateKey + -- forM unknownKeys $ \(E.Value key) -> do + -- incidences <- E.select $ E.from $ \candidate -> do + -- E.where_ $ + -- + -- -- DON'T KNOW HOW TO DO IN SQL :( BUT WE NEED THE ENTIRE TABLE ANYHOW + -- candidates <- entityVal <$> selectList [] [] -- load entire candidate table + -- -- create map from UUID to set of candidates for efficiency + -- let collectCandidates m stc@StudyTermCandidate{studyTermCandidateIncidence=inci} + -- = insertWith Set.union inci stc + -- incidences = foldl collectCandidates Map.empty candidates + -- + -- collectKeys m + -- keySets = foldl collectKeys Map.empty candidates + -- + -- -- StudyTermCandidateKey -> Set StudyTermCandidateName + + + + +-- | all existing StudyTerms that are contradiced by current observations +conflicts :: DB [Entity StudyTerms] +conflicts = E.select $ E.from $ \studyTerms -> do + E.where_ $ E.not_ $ E.isNothing $ studyTerms E.^. StudyTermsName + E.where_ $ E.exists $ E.from $ \candidateOne -> do + E.where_ $ candidateOne E.^. StudyTermCandidateKey E.==. studyTerms E.^. StudyTermsKey + E.where_ $ E.notExists . E.from $ \candidateTwo -> do + E.where_ $ candidateTwo E.^. StudyTermCandidateIncidence E.==. candidateOne E.^. StudyTermCandidateIncidence + E.where_ $ studyTerms E.^. StudyTermsName E.==. E.just (candidateTwo E.^. StudyTermCandidateName) + return studyTerms + + + diff --git a/src/Utils.hs b/src/Utils.hs index cd735a6c0..73debb0e8 100644 --- a/src/Utils.hs +++ b/src/Utils.hs @@ -321,6 +321,17 @@ mergeAttrs = mergeAttrs' `on` sort +---------- +-- Sets -- +---------- + +-- | Intersection of multiple sets. Returns empty set for empty input list +setIntersections :: Ord a => [Set a] -> Set a +setIntersections [] = Set.empty +setIntersections (h:t) = foldl' Set.intersection h t + + + ---------- -- Maps -- ----------