StudyTermCandidate inference implemented needs tests

2019-03-18 18:57:36 +01:00 · 2019-03-18 18:57:36 +01:00 · 9b944d70b8
commit 9b944d70b8
parent 2d1f74b4a4
4 changed files with 215 additions and 1 deletions
--- a/models/users
+++ b/models/users
@ -54,6 +54,7 @@ StudyTerms    -- Studiengang
    shorthand     Text Maybe     -- admin determined shorthand
    name          Text Maybe     -- description given by LDAP
    Primary key                  -- column key is used as actual DB row key
+                                 -- newtype Key StudyTerms = StudyTermsKey { unStudyTermsKey :: Int }
 StudyTermCandidate    -- No one at LMU is willing and able to tell us the meaning of the keys for StudyDegrees and StudyTerms.
    -- Each LDAP login provides an unordered set of keys and an unordered set of plain text description with an unknown 1-1 correspondence.
    -- This table helps us to infer which key belongs to which plain text by recording possible combinations at login.
--- a/src/Handler/Course.hs
+++ b/src/Handler/Course.hs
@ -803,7 +803,7 @@ getCUsersR tid ssh csh = do
        , colUserEmail
        , colUserMatriclenr
        , colUserDegreeShort
-        , colUserFieldShort
+        , colUserField
        , colUserSemester
        , sortable (Just "course-registration") (i18nCell MsgRegisteredHeader) (dateCell . view _userTableRegistration)
        , colUserComment tid ssh csh
--- a/src/Handler/Utils/TermCandidates.hs
+++ b/src/Handler/Utils/TermCandidates.hs
@ -0,0 +1,202 @@
+module Handler.Utils.TermCandidates where
+
+import Import
+-- import Handler.Utils
+
+
+-- Import this module as Candidates
+
+-- import Utils.Lens
+
+-- import Data.Time
+-- import qualified Data.Text as T
+-- import Data.Function ((&))
+-- import Yesod.Form.Bootstrap3
+-- import Colonnade hiding (fromMaybe)
+-- import Yesod.Colonnade
+-- import qualified Data.UUID.Cryptographic as UUID
+-- import Control.Monad.Trans.Writer (mapWriterT)
+-- import Database.Persist.Sql (fromSqlKey)
+import Data.Set (Set)
+import qualified Data.Set as Set
+import qualified Data.List as List
+import Data.Map (Map)
+import qualified Data.Map as Map
+
+
+import qualified Database.Esqueleto as E
+-- import Database.Esqueleto.Utils as E
+
+
+type STKey = Int -- Key StudyTerms -- for convenience, assmued identical to field StudyTermCandidateKey
+
+-- | Just an heuristik to fill in defaults
+shortenStudyTerm :: Text -> Text
+shortenStudyTerm = concatMap (take 4) . splitCamel
+
+-- | Attempt to identify new StudyTerms based on observations
+-- infer :: MonadHandler m => m ([Entity StudyTerms],[Entity StudyTerms])
+infer :: DB ([Entity StudyTerms],[(STKey, Text)])
+infer = do
+    void removeAmbiguous -- TODO: show result
+    inferAcc []
+  where
+    inferAcc prevSet = do
+      problems <- conflicts
+      if null problems
+        then do
+          void removeRedundant -- TODO: show result
+          newSet <- acceptSingletons
+          if null newSet
+            then -- inference complete
+              return ([],prevSet)
+            else
+              inferAcc (newSet ++ prevSet)
+        else --abort
+          return (problems,prevSet)
+
+
+{-
+Candidate 1 11 "A"
+Candidate 1 11 "B"
+Candidate 1 12 "A"
+Candidate 1 12 "B"
+Candidate 2 12 "B"
+Candidate 2 12 "C"
+Candidate 2 13 "B"
+Candidate 2 13 "C"
+
+should readily yield 11/A, 12/B 13/C:
+
+it can infer due to overlab that 12/B must be true, then eliminating B identifies A and C;
+this rests on the assumption that the Names are unique, which is NOT TRUE;
+as a fix we simply eliminate all observations that have the same name twice, see removeInconsistent
+
+-}
+
+-- | remove candidates with ambiguous observations,
+--   ie. candidates that have duplicated term names with differing keys
+--   which may happen in rare cases
+removeAmbiguous :: DB [UUID]
+removeAmbiguous = do
+  ambiList <- E.select $ E.from $ \(candA `E.InnerJoin` candB) -> do
+    -- Either an innerJoin with itself or an exists-sub-select
+    E.on $    (candA E.^. StudyTermCandidateIncidence E.==. candB E.^. StudyTermCandidateIncidence)
+        E.&&. (candA E.^. StudyTermCandidateKey       E.!=. candB E.^. StudyTermCandidateKey)
+        E.&&. (candA E.^. StudyTermCandidateName      E.==. candB E.^. StudyTermCandidateName)
+        E.&&. (candA E.^. StudyTermCandidateId        E.!=. candB E.^. StudyTermCandidateId) -- should not be needed, but does not hurt either
+    return $ candA E.^. StudyTermCandidateIncidence
+  let ambiSet = E.unValue <$> List.nub ambiList
+  -- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps.
+  deleteWhere [StudyTermCandidateIncidence <-. ambiSet]
+  return ambiSet
+
+
+-- | remove known StudyTerm from candidates that have the _exact_ name,
+--   ie. if a candidate contains a known key, we remove it and its associated fullname
+--   only save if ambiguous candidates haven been removed
+removeRedundant :: DB [Entity StudyTermCandidate]
+removeRedundant = do
+  redundants <- E.select $ E.distinct $ E.from $ \(candidate `E.InnerJoin` sterm) -> do
+    E.on $  candidate E.^. StudyTermCandidateKey  E.==. sterm E.^. StudyTermsKey
+      E.&&. E.just (candidate E.^. StudyTermCandidateName) E.==. sterm E.^. StudyTermsName
+    return candidate
+  -- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps.
+  forM_ redundants $ \Entity{entityVal=StudyTermCandidate{..}} ->
+    deleteWhere $   ( StudyTermCandidateIncidence ==. studyTermCandidateIncidence )
+                :  ([ StudyTermCandidateKey       ==. studyTermCandidateKey       ]
+                ||. [ StudyTermCandidateName      ==. studyTermCandidateName      ])
+  return redundants
+
+
+-- | Search for single candidates and memorize them as StudyTerms.
+--   Should be called after @removeRedundant@ to increase success chances and reduce cost; otherwise memory heavy!
+--   Does not delete the used candidates, user @removeRedundant@ for this later on.
+--   Esqueleto does not provide the INTERESECT operator, thus
+--   we load the table into Haskell and operate there. Memory usage problem? StudyTermsCandidate may become huge.
+acceptSingletons :: DB [(STKey,Text)]
+acceptSingletons = do
+  knownKeys <- fmap unStudyTermsKey <$> selectKeysList [] [Asc StudyTermsKey]
+  -- let knownKeysSet = Set.fromAscList knownKeys
+  -- In case of memory problems, change next lines to conduit proper:
+  incidences <- fmap entityVal <$> selectList [StudyTermCandidateKey /<-. knownKeys] [] -- LimitTo might be dangerous here, if we get a partial incidence. Possibly first select N incidences, then retrieving all those only.
+  -- incidences <- E.select $ E.from $ \candidate -> do
+  --   E.where_ $ candidate E.^. StudyTermCandidayeKey `E.notIn` E.valList knownKeys
+  --   return candidate
+
+  -- Possibly expensive pure computations follows. Break runDB to shorten transaction?
+  let groupedCandidates :: Map STKey (Map UUID (Set Text))
+      groupedCandidates = foldl' groupFun mempty incidences
+
+      -- given a key, map each incidence to set of possible names for this key
+      groupFun :: Map STKey (Map UUID (Set Text)) -> StudyTermCandidate -> Map STKey (Map UUID (Set Text))
+      groupFun m StudyTermCandidate{..} =
+        insertWith (Map.unionWith Set.union)
+                   studyTermCandidateKey
+                   (Map.singleton studyTermCandidateIncidence $ Set.singleton studyTermCandidateName)
+                   m
+
+      -- pointwise intersection per incidence gives possible candidates per key
+      keyCandidates :: Map STKey (Set Text)
+      keyCandidates = Map.map (setIntersections . Map.elems) groupedCandidates
+
+      -- filter candidates having a unique possibility left
+      fixedKeys :: [(STKey,Text)]
+      fixedKeys = Map.foldlWithKey' combFixed [] keyCandidates
+
+      combFixed :: [(STKey,Text)] -> STKey -> Set Text ->  [(STKey,Text)]
+      combFixed acc k s | Set.size s == 1 -- possibly redundant
+                        , [n] <- Set.elems s = (k,n):acc
+                        -- empty sets should not occur here , if LDAP is consistent. Maybe raise a warning?!
+                        | otherwise = acc
+
+      -- registerFixed :: (STKey, Text) -> DB (Key StudyTerms)
+      registerFixed :: (STKey, Text) -> DB ()
+      registerFixed (key, name) =
+        -- insertKey (StudyTermsKey key) $ StudyTerms key (Just $ shortenStudyTerm name) (Just name) -- name clash!
+        void . insert $ StudyTerms key (Just $ shortenStudyTerm name) (Just name)
+
+
+  -- register newly fixed candidates
+  forM_ fixedKeys registerFixed
+  return fixedKeys
+
+
+  -- SOME EARLIER ATTEMPTS FOLLOW:
+  --
+  -- unknownKeys <- E.select $ E.distinct $ E.from $ \candidate -> do
+  --    E.where_ $ E.notExists $ E.from $ \sterm ->
+  --      E.where_ $ candidate E.^. StudyTermCandidateKey E.==. sterm E.^. StudyTermKey
+  --    return $ candidate E.^. StudyTermCandidateKey
+  -- forM unknownKeys $ \(E.Value key) -> do
+  --   incidences <- E.select $ E.from $ \candidate -> do
+  --     E.where_ $
+  --
+  -- -- DON'T KNOW HOW TO DO IN SQL :( BUT WE NEED THE ENTIRE TABLE ANYHOW
+  -- candidates <- entityVal <$> selectList [] [] -- load entire candidate table
+  -- -- create map from UUID to set of candidates for efficiency
+  -- let collectCandidates m stc@StudyTermCandidate{studyTermCandidateIncidence=inci}
+  --       = insertWith Set.union inci stc
+  --     incidences = foldl collectCandidates Map.empty candidates
+  --
+  --     collectKeys m
+  --     keySets = foldl collectKeys Map.empty candidates
+  --
+  -- -- StudyTermCandidateKey -> Set StudyTermCandidateName
+
+
+
+
+-- | all existing StudyTerms that are contradiced by current observations
+conflicts :: DB [Entity StudyTerms]
+conflicts = E.select $ E.from $ \studyTerms -> do
+  E.where_ $ E.not_ $ E.isNothing $ studyTerms E.^. StudyTermsName
+  E.where_ $ E.exists $ E.from $ \candidateOne -> do
+    E.where_ $ candidateOne E.^. StudyTermCandidateKey E.==. studyTerms E.^. StudyTermsKey
+    E.where_ $ E.notExists . E.from $ \candidateTwo ->  do
+        E.where_ $ candidateTwo E.^. StudyTermCandidateIncidence E.==. candidateOne E.^. StudyTermCandidateIncidence
+        E.where_ $ studyTerms E.^. StudyTermsName E.==. E.just (candidateTwo E.^. StudyTermCandidateName)
+  return studyTerms
+
+
+
--- a/src/Utils.hs
+++ b/src/Utils.hs
@ -321,6 +321,17 @@ mergeAttrs = mergeAttrs' `on` sort



+----------
+-- Sets --
+----------
+
+-- | Intersection of multiple sets. Returns empty set for empty input list
+setIntersections :: Ord a => [Set a] -> Set a
+setIntersections []    = Set.empty
+setIntersections (h:t) = foldl' Set.intersection h t
+
+
+
 ----------
 -- Maps --
 ----------