StudyTermCandidate inference implemented needs tests
This commit is contained in:
parent
2d1f74b4a4
commit
9b944d70b8
@ -54,6 +54,7 @@ StudyTerms -- Studiengang
|
||||
shorthand Text Maybe -- admin determined shorthand
|
||||
name Text Maybe -- description given by LDAP
|
||||
Primary key -- column key is used as actual DB row key
|
||||
-- newtype Key StudyTerms = StudyTermsKey { unStudyTermsKey :: Int }
|
||||
StudyTermCandidate -- No one at LMU is willing and able to tell us the meaning of the keys for StudyDegrees and StudyTerms.
|
||||
-- Each LDAP login provides an unordered set of keys and an unordered set of plain text description with an unknown 1-1 correspondence.
|
||||
-- This table helps us to infer which key belongs to which plain text by recording possible combinations at login.
|
||||
|
||||
@ -803,7 +803,7 @@ getCUsersR tid ssh csh = do
|
||||
, colUserEmail
|
||||
, colUserMatriclenr
|
||||
, colUserDegreeShort
|
||||
, colUserFieldShort
|
||||
, colUserField
|
||||
, colUserSemester
|
||||
, sortable (Just "course-registration") (i18nCell MsgRegisteredHeader) (dateCell . view _userTableRegistration)
|
||||
, colUserComment tid ssh csh
|
||||
|
||||
202
src/Handler/Utils/TermCandidates.hs
Normal file
202
src/Handler/Utils/TermCandidates.hs
Normal file
@ -0,0 +1,202 @@
|
||||
module Handler.Utils.TermCandidates where
|
||||
|
||||
import Import
|
||||
-- import Handler.Utils
|
||||
|
||||
|
||||
-- Import this module as Candidates
|
||||
|
||||
-- import Utils.Lens
|
||||
|
||||
-- import Data.Time
|
||||
-- import qualified Data.Text as T
|
||||
-- import Data.Function ((&))
|
||||
-- import Yesod.Form.Bootstrap3
|
||||
-- import Colonnade hiding (fromMaybe)
|
||||
-- import Yesod.Colonnade
|
||||
-- import qualified Data.UUID.Cryptographic as UUID
|
||||
-- import Control.Monad.Trans.Writer (mapWriterT)
|
||||
-- import Database.Persist.Sql (fromSqlKey)
|
||||
import Data.Set (Set)
|
||||
import qualified Data.Set as Set
|
||||
import qualified Data.List as List
|
||||
import Data.Map (Map)
|
||||
import qualified Data.Map as Map
|
||||
|
||||
|
||||
import qualified Database.Esqueleto as E
|
||||
-- import Database.Esqueleto.Utils as E
|
||||
|
||||
|
||||
type STKey = Int -- Key StudyTerms -- for convenience, assmued identical to field StudyTermCandidateKey
|
||||
|
||||
-- | Just an heuristik to fill in defaults
|
||||
shortenStudyTerm :: Text -> Text
|
||||
shortenStudyTerm = concatMap (take 4) . splitCamel
|
||||
|
||||
-- | Attempt to identify new StudyTerms based on observations
|
||||
-- infer :: MonadHandler m => m ([Entity StudyTerms],[Entity StudyTerms])
|
||||
infer :: DB ([Entity StudyTerms],[(STKey, Text)])
|
||||
infer = do
|
||||
void removeAmbiguous -- TODO: show result
|
||||
inferAcc []
|
||||
where
|
||||
inferAcc prevSet = do
|
||||
problems <- conflicts
|
||||
if null problems
|
||||
then do
|
||||
void removeRedundant -- TODO: show result
|
||||
newSet <- acceptSingletons
|
||||
if null newSet
|
||||
then -- inference complete
|
||||
return ([],prevSet)
|
||||
else
|
||||
inferAcc (newSet ++ prevSet)
|
||||
else --abort
|
||||
return (problems,prevSet)
|
||||
|
||||
|
||||
{-
|
||||
Candidate 1 11 "A"
|
||||
Candidate 1 11 "B"
|
||||
Candidate 1 12 "A"
|
||||
Candidate 1 12 "B"
|
||||
Candidate 2 12 "B"
|
||||
Candidate 2 12 "C"
|
||||
Candidate 2 13 "B"
|
||||
Candidate 2 13 "C"
|
||||
|
||||
should readily yield 11/A, 12/B 13/C:
|
||||
|
||||
it can infer due to overlab that 12/B must be true, then eliminating B identifies A and C;
|
||||
this rests on the assumption that the Names are unique, which is NOT TRUE;
|
||||
as a fix we simply eliminate all observations that have the same name twice, see removeInconsistent
|
||||
|
||||
-}
|
||||
|
||||
-- | remove candidates with ambiguous observations,
|
||||
-- ie. candidates that have duplicated term names with differing keys
|
||||
-- which may happen in rare cases
|
||||
removeAmbiguous :: DB [UUID]
|
||||
removeAmbiguous = do
|
||||
ambiList <- E.select $ E.from $ \(candA `E.InnerJoin` candB) -> do
|
||||
-- Either an innerJoin with itself or an exists-sub-select
|
||||
E.on $ (candA E.^. StudyTermCandidateIncidence E.==. candB E.^. StudyTermCandidateIncidence)
|
||||
E.&&. (candA E.^. StudyTermCandidateKey E.!=. candB E.^. StudyTermCandidateKey)
|
||||
E.&&. (candA E.^. StudyTermCandidateName E.==. candB E.^. StudyTermCandidateName)
|
||||
E.&&. (candA E.^. StudyTermCandidateId E.!=. candB E.^. StudyTermCandidateId) -- should not be needed, but does not hurt either
|
||||
return $ candA E.^. StudyTermCandidateIncidence
|
||||
let ambiSet = E.unValue <$> List.nub ambiList
|
||||
-- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps.
|
||||
deleteWhere [StudyTermCandidateIncidence <-. ambiSet]
|
||||
return ambiSet
|
||||
|
||||
|
||||
-- | remove known StudyTerm from candidates that have the _exact_ name,
|
||||
-- ie. if a candidate contains a known key, we remove it and its associated fullname
|
||||
-- only save if ambiguous candidates haven been removed
|
||||
removeRedundant :: DB [Entity StudyTermCandidate]
|
||||
removeRedundant = do
|
||||
redundants <- E.select $ E.distinct $ E.from $ \(candidate `E.InnerJoin` sterm) -> do
|
||||
E.on $ candidate E.^. StudyTermCandidateKey E.==. sterm E.^. StudyTermsKey
|
||||
E.&&. E.just (candidate E.^. StudyTermCandidateName) E.==. sterm E.^. StudyTermsName
|
||||
return candidate
|
||||
-- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps.
|
||||
forM_ redundants $ \Entity{entityVal=StudyTermCandidate{..}} ->
|
||||
deleteWhere $ ( StudyTermCandidateIncidence ==. studyTermCandidateIncidence )
|
||||
: ([ StudyTermCandidateKey ==. studyTermCandidateKey ]
|
||||
||. [ StudyTermCandidateName ==. studyTermCandidateName ])
|
||||
return redundants
|
||||
|
||||
|
||||
-- | Search for single candidates and memorize them as StudyTerms.
|
||||
-- Should be called after @removeRedundant@ to increase success chances and reduce cost; otherwise memory heavy!
|
||||
-- Does not delete the used candidates, user @removeRedundant@ for this later on.
|
||||
-- Esqueleto does not provide the INTERESECT operator, thus
|
||||
-- we load the table into Haskell and operate there. Memory usage problem? StudyTermsCandidate may become huge.
|
||||
acceptSingletons :: DB [(STKey,Text)]
|
||||
acceptSingletons = do
|
||||
knownKeys <- fmap unStudyTermsKey <$> selectKeysList [] [Asc StudyTermsKey]
|
||||
-- let knownKeysSet = Set.fromAscList knownKeys
|
||||
-- In case of memory problems, change next lines to conduit proper:
|
||||
incidences <- fmap entityVal <$> selectList [StudyTermCandidateKey /<-. knownKeys] [] -- LimitTo might be dangerous here, if we get a partial incidence. Possibly first select N incidences, then retrieving all those only.
|
||||
-- incidences <- E.select $ E.from $ \candidate -> do
|
||||
-- E.where_ $ candidate E.^. StudyTermCandidayeKey `E.notIn` E.valList knownKeys
|
||||
-- return candidate
|
||||
|
||||
-- Possibly expensive pure computations follows. Break runDB to shorten transaction?
|
||||
let groupedCandidates :: Map STKey (Map UUID (Set Text))
|
||||
groupedCandidates = foldl' groupFun mempty incidences
|
||||
|
||||
-- given a key, map each incidence to set of possible names for this key
|
||||
groupFun :: Map STKey (Map UUID (Set Text)) -> StudyTermCandidate -> Map STKey (Map UUID (Set Text))
|
||||
groupFun m StudyTermCandidate{..} =
|
||||
insertWith (Map.unionWith Set.union)
|
||||
studyTermCandidateKey
|
||||
(Map.singleton studyTermCandidateIncidence $ Set.singleton studyTermCandidateName)
|
||||
m
|
||||
|
||||
-- pointwise intersection per incidence gives possible candidates per key
|
||||
keyCandidates :: Map STKey (Set Text)
|
||||
keyCandidates = Map.map (setIntersections . Map.elems) groupedCandidates
|
||||
|
||||
-- filter candidates having a unique possibility left
|
||||
fixedKeys :: [(STKey,Text)]
|
||||
fixedKeys = Map.foldlWithKey' combFixed [] keyCandidates
|
||||
|
||||
combFixed :: [(STKey,Text)] -> STKey -> Set Text -> [(STKey,Text)]
|
||||
combFixed acc k s | Set.size s == 1 -- possibly redundant
|
||||
, [n] <- Set.elems s = (k,n):acc
|
||||
-- empty sets should not occur here , if LDAP is consistent. Maybe raise a warning?!
|
||||
| otherwise = acc
|
||||
|
||||
-- registerFixed :: (STKey, Text) -> DB (Key StudyTerms)
|
||||
registerFixed :: (STKey, Text) -> DB ()
|
||||
registerFixed (key, name) =
|
||||
-- insertKey (StudyTermsKey key) $ StudyTerms key (Just $ shortenStudyTerm name) (Just name) -- name clash!
|
||||
void . insert $ StudyTerms key (Just $ shortenStudyTerm name) (Just name)
|
||||
|
||||
|
||||
-- register newly fixed candidates
|
||||
forM_ fixedKeys registerFixed
|
||||
return fixedKeys
|
||||
|
||||
|
||||
-- SOME EARLIER ATTEMPTS FOLLOW:
|
||||
--
|
||||
-- unknownKeys <- E.select $ E.distinct $ E.from $ \candidate -> do
|
||||
-- E.where_ $ E.notExists $ E.from $ \sterm ->
|
||||
-- E.where_ $ candidate E.^. StudyTermCandidateKey E.==. sterm E.^. StudyTermKey
|
||||
-- return $ candidate E.^. StudyTermCandidateKey
|
||||
-- forM unknownKeys $ \(E.Value key) -> do
|
||||
-- incidences <- E.select $ E.from $ \candidate -> do
|
||||
-- E.where_ $
|
||||
--
|
||||
-- -- DON'T KNOW HOW TO DO IN SQL :( BUT WE NEED THE ENTIRE TABLE ANYHOW
|
||||
-- candidates <- entityVal <$> selectList [] [] -- load entire candidate table
|
||||
-- -- create map from UUID to set of candidates for efficiency
|
||||
-- let collectCandidates m stc@StudyTermCandidate{studyTermCandidateIncidence=inci}
|
||||
-- = insertWith Set.union inci stc
|
||||
-- incidences = foldl collectCandidates Map.empty candidates
|
||||
--
|
||||
-- collectKeys m
|
||||
-- keySets = foldl collectKeys Map.empty candidates
|
||||
--
|
||||
-- -- StudyTermCandidateKey -> Set StudyTermCandidateName
|
||||
|
||||
|
||||
|
||||
|
||||
-- | all existing StudyTerms that are contradiced by current observations
|
||||
conflicts :: DB [Entity StudyTerms]
|
||||
conflicts = E.select $ E.from $ \studyTerms -> do
|
||||
E.where_ $ E.not_ $ E.isNothing $ studyTerms E.^. StudyTermsName
|
||||
E.where_ $ E.exists $ E.from $ \candidateOne -> do
|
||||
E.where_ $ candidateOne E.^. StudyTermCandidateKey E.==. studyTerms E.^. StudyTermsKey
|
||||
E.where_ $ E.notExists . E.from $ \candidateTwo -> do
|
||||
E.where_ $ candidateTwo E.^. StudyTermCandidateIncidence E.==. candidateOne E.^. StudyTermCandidateIncidence
|
||||
E.where_ $ studyTerms E.^. StudyTermsName E.==. E.just (candidateTwo E.^. StudyTermCandidateName)
|
||||
return studyTerms
|
||||
|
||||
|
||||
|
||||
11
src/Utils.hs
11
src/Utils.hs
@ -321,6 +321,17 @@ mergeAttrs = mergeAttrs' `on` sort
|
||||
|
||||
|
||||
|
||||
----------
|
||||
-- Sets --
|
||||
----------
|
||||
|
||||
-- | Intersection of multiple sets. Returns empty set for empty input list
|
||||
setIntersections :: Ord a => [Set a] -> Set a
|
||||
setIntersections [] = Set.empty
|
||||
setIntersections (h:t) = foldl' Set.intersection h t
|
||||
|
||||
|
||||
|
||||
----------
|
||||
-- Maps --
|
||||
----------
|
||||
|
||||
Loading…
Reference in New Issue
Block a user