fradrive/src/Handler/Utils/TermCandidates.hs

module Handler.Utils.TermCandidates where

import Import
-- import Handler.Utils


-- Import this module as Candidates

-- import Utils.Lens

-- import Data.Time
-- import qualified Data.Text as T
-- import Data.Function ((&))
-- import Yesod.Form.Bootstrap3
-- import Colonnade hiding (fromMaybe)
-- import Yesod.Colonnade
-- import qualified Data.UUID.Cryptographic as UUID
-- import Control.Monad.Trans.Writer (mapWriterT)
-- import Database.Persist.Sql (fromSqlKey)
import qualified Data.Set as Set
import qualified Data.List as List
import qualified Data.Map as Map


import qualified Database.Esqueleto as E
import qualified Database.Esqueleto.Internal.Internal as E

{-# ANN module ("HLint: ignore Use newtype instead of data"::String) #-}

type STKey = Int -- for convenience, assmued identical to field StudyTermNameCandidateKey

data FailedCandidateInference = FailedCandidateInference [Entity StudyTerms]
  deriving (Typeable, Show)

instance Exception FailedCandidateInference
  -- Default Instance

-- -- | Just an heuristik to fill in defaults
-- shortenStudyTerm :: Text -> Text
-- shortenStudyTerm = concatMap (take 4) . splitCamel

-- | Attempt to identify new StudyTerms based on observations, returning:
--   * list of ambiguous instances that were discarded outright (identical names for differents keys observed in single incidences)
--   * list of problems, ie. StudyTerms that contradict observed incidences
--   * list of redundants, i.e. redundant observed incidences
--   * list of accepted, i.e. newly accepted key/name pairs
inferNamesHandler :: Handler ([Entity StudyTerms],[TermCandidateIncidence],[Entity StudyTermNameCandidate],[(StudyTermsId,Text)])
inferNamesHandler = runDB $ inferAcc mempty
  where
    inferAcc (accAmbiguous, accRedundants, accAccepted) =
      handle (\(FailedCandidateInference fails) -> (fails, accAmbiguous, accRedundants, accAccepted') <$ E.transactionUndo) $ do
        (infAmbis, infReds, infAccs) <- inferStep
        if
          | null infAccs ->
            return ([],          accAmbiguous, infReds <> accRedundants,            accAccepted')
          | otherwise -> do
            E.transactionSave -- commit transaction if there are no problems
            inferAcc   (infAmbis <> accAmbiguous, infReds <> accRedundants, infAccs <> accAccepted)
      where
        accAccepted' = over (traversed . _1) StudyTermsKey' accAccepted

    inferStep = do
      ambiguous  <- removeAmbiguousNames
      redundants <- removeRedundantNames
      accepted <- acceptSingletonNames
      problems <- nameConflicts
      unless (null problems) $ throwM $ FailedCandidateInference problems
      return (ambiguous, redundants, accepted)

inferParentsHandler :: Handler ([Entity StudySubTermParentCandidate], [Entity StudySubTerms])
inferParentsHandler = runDB $ inferAcc mempty
  where
    inferAcc (infReds', infAccs') = do
      (infReds, infAccs) <- inferStep
      if
        | null infAccs ->
          return (infReds' <> infReds, infAccs')
        | otherwise ->
          inferAcc (infReds' <> infReds, infAccs' <> infAccs)
    inferStep = do
      redundants <- removeRedundantParents
      accepted <- acceptSingletonParents
      return (redundants, accepted)

{-
Candidate 1 11 "A"
Candidate 1 11 "B"
Candidate 1 12 "A"
Candidate 1 12 "B"
Candidate 2 12 "B"
Candidate 2 12 "C"
Candidate 2 13 "B"
Candidate 2 13 "C"

should readily yield 11/A, 12/B 13/C:

it can infer due to overlab that 12/B must be true, then eliminating B identifies A and C;
this rests on the assumption that the Names are unique, which is NOT TRUE;
as a fix we simply eliminate all observations that have the same name twice, see removeInconsistent

-}

-- | remove candidates with ambiguous observations,
--   ie. candidates that have duplicated term names with differing keys
--   which may happen in rare cases
removeAmbiguousNames :: DB [TermCandidateIncidence]
removeAmbiguousNames = do
  ambiList <- E.select $ E.from $ \candidate -> do
    E.groupBy ( candidate E.^. StudyTermNameCandidateIncidence
              , candidate E.^. StudyTermNameCandidateKey
              , candidate E.^. StudyTermNameCandidateName
              )
    E.having $ E.countRows E.!=. E.val (1 :: Int64)
    return $ candidate E.^. StudyTermNameCandidateIncidence
  let ambiSet = E.unValue <$> List.nub ambiList
  -- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps.
  deleteWhere [StudyTermNameCandidateIncidence <-. ambiSet]
  return ambiSet


-- | remove known StudyTerm from candidates that have the _exact_ name,
--   ie. if a candidate contains a known key, we remove it and its associated fullname
--   only save if ambiguous candidates haven been removed
removeRedundantNames :: DB [Entity StudyTermNameCandidate]
removeRedundantNames = do
  redundants <- E.select $ E.distinct $ E.from $ \(candidate `E.InnerJoin` sterm) -> do
    E.on $ candidate E.^. StudyTermNameCandidateKey E.==. sterm E.^. StudyTermsKey
     E.&&. (       E.just (candidate E.^. StudyTermNameCandidateName) E.==. sterm E.^. StudyTermsName
             E.||. E.exists (E.from $ \(subTerm `E.InnerJoin` sterm2) -> do
                                E.on $ subTerm E.^. StudySubTermsParent E.==. sterm  E.^. StudyTermsId
                                 E.&&. subTerm E.^. StudySubTermsChild  E.==. sterm2 E.^. StudyTermsId
                                E.where_ $ E.just (candidate E.^. StudyTermNameCandidateName) E.==. sterm2 E.^. StudyTermsName
                            )
           )
    return candidate
  -- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps.
  forM_ redundants $ \Entity{entityVal=StudyTermNameCandidate{..}} ->
    deleteWhere $   ( StudyTermNameCandidateIncidence ==. studyTermNameCandidateIncidence )
                :  ([ StudyTermNameCandidateKey       ==. studyTermNameCandidateKey       ]
                ||. [ StudyTermNameCandidateName      ==. studyTermNameCandidateName      ])
  return redundants

removeRedundantParents :: DB [Entity StudySubTermParentCandidate]
removeRedundantParents = do
  redundants <- E.select . E.distinct . E.from $ \(candidate `E.InnerJoin` subTerm) -> do
    E.on $ candidate E.^. StudySubTermParentCandidateKey E.==. E.veryUnsafeCoerceSqlExprValue (subTerm E.^. StudySubTermsChild)
     E.&&. candidate E.^. StudySubTermParentCandidateParent E.==. E.veryUnsafeCoerceSqlExprValue (subTerm E.^. StudySubTermsParent)
    return candidate
  forM_ redundants $ \(Entity _ StudySubTermParentCandidate{..}) ->
    E.delete . E.from $ \candidate ->
      E.where_ $ candidate E.^. StudySubTermParentCandidateIncidence E.==. E.val studySubTermParentCandidateIncidence
           E.&&. (       candidate E.^. StudySubTermParentCandidateParent `E.in_` E.valList [studySubTermParentCandidateParent, studySubTermParentCandidateKey]
                   E.||. candidate E.^. StudySubTermParentCandidateKey `E.in_` E.valList [studySubTermParentCandidateParent, studySubTermParentCandidateKey]
                 )
  return redundants

removeRedundantStandalone :: DB [Entity StudyTermStandaloneCandidate]
removeRedundantStandalone = do
  redundants <- E.select . E.distinct . E.from $ \(candidate `E.InnerJoin` sterm) -> do
    E.on $ candidate E.^. StudyTermStandaloneCandidateKey E.==. sterm E.^. StudyTermsKey
     E.&&. E.not_ (E.isNothing $ sterm E.^. StudyTermsDefaultDegree)
     E.&&. E.not_ (E.isNothing $ sterm E.^. StudyTermsDefaultType)
    return candidate
  deleteWhere [ StudyTermStandaloneCandidateId <-. map entityKey redundants ]
  return redundants


-- | Search for single candidates and memorize them as StudyTerms.
--   Should be called after @removeRedundant@ to increase success chances and reduce cost; otherwise memory heavy!
--   Does not delete the used candidates, user @removeRedundant@ for this later on.
--   Esqueleto does not provide the INTERESECT operator, thus
--   we load the table into Haskell and operate there. Memory usage problem? StudyTermsCandidate may become huge.
acceptSingletonNames :: DB [(STKey,Text)]
acceptSingletonNames = do
  knownKeys <- fmap unStudyTermsKey <$> selectKeysList [StudyTermsName !=. Nothing] [Asc StudyTermsKey]
  -- let knownKeysSet = Set.fromAscList knownKeys
  -- In case of memory problems, change next lines to conduit proper:
  incidences <- fmap entityVal <$> selectList [StudyTermNameCandidateKey /<-. knownKeys] [] -- LimitTo might be dangerous here, if we get a partial incidence. Possibly first select N incidences, then retrieving all those only.
  -- incidences <- E.select $ E.from $ \candidate -> do
  --   E.where_ $ candidate E.^. StudyTermCandidayeKey `E.notIn` E.valList knownKeys
  --   return candidate

  -- Possibly expensive pure computations follows. Break runDB to shorten transaction?
  let groupedCandidates :: Map STKey (Map UUID (Set Text))
      groupedCandidates = foldl' groupFun mempty incidences

      -- given a key, map each incidence to set of possible names for this key
      groupFun :: Map STKey (Map TermCandidateIncidence (Set Text)) -> StudyTermNameCandidate -> Map STKey (Map TermCandidateIncidence (Set Text))
      groupFun m StudyTermNameCandidate{..} =
        insertWith (Map.unionWith Set.union)
                   studyTermNameCandidateKey
                   (Map.singleton studyTermNameCandidateIncidence $ Set.singleton studyTermNameCandidateName)
                   m

      -- pointwise intersection per incidence gives possible candidates per key
      keyCandidates :: Map STKey (Set Text)
      keyCandidates = Map.map (setIntersections . Map.elems) groupedCandidates

      -- filter candidates having a unique possibility left
      fixedKeys :: [(STKey,Text)]
      fixedKeys = fst $ Map.foldlWithKey' combFixed mempty keyCandidates
        where
          combFixed :: ([(STKey,Text)], Set STKey) -> STKey -> Set Text -> ([(STKey,Text)], Set STKey)
          combFixed (acc, bad) k s
            | Set.member k bad
            = (acc, bad)
            | maybe False (`Set.notMember` s) (lookup k acc)
            = (filter (\(k', _) -> k /= k') acc, Set.insert k bad)
            | [n] <- Set.elems s
            = ((k,n) : acc, bad)
            | otherwise = (acc, bad)

      -- registerFixed :: (STKey, Text) -> DB (Key StudyTerms)
      registerFixed :: (STKey, Text) -> DB ()
      registerFixed (key, name) =
        repsert (StudyTermsKey' key) $ StudyTerms key Nothing (Just name) Nothing Nothing

  -- register newly fixed candidates
  forM_ fixedKeys registerFixed
  return fixedKeys

acceptSingletonParents :: DB [Entity StudySubTerms]
acceptSingletonParents = do
  candidates <- map entityVal <$> selectList [] []

  let
    groupedCandidates :: Map STKey (Map UUID (Set STKey))
    groupedCandidates = foldl' groupFun mempty candidates
      where
        groupFun :: Map STKey (Map UUID (Set STKey)) -> StudySubTermParentCandidate -> Map STKey (Map UUID (Set STKey))
        groupFun m StudySubTermParentCandidate{..} =
          Map.insertWith (Map.unionWith Set.union)
            studySubTermParentCandidateKey
            (Map.singleton studySubTermParentCandidateIncidence $ Set.singleton studySubTermParentCandidateParent)
            m

    parentCandidates :: Map STKey (Set STKey)
    parentCandidates = Map.map (setIntersections . Map.elems) groupedCandidates

    fixedParents :: [(STKey, STKey)]
    fixedParents = fst $ Map.foldlWithKey' combFixed mempty parentCandidates
      where
        combFixed :: ([(STKey, STKey)], Set STKey) -> STKey -> Set STKey -> ([(STKey, STKey)], Set STKey)
        combFixed (acc, bad) k s
          | Set.member k bad
          = (acc, bad)
          | maybe False (`Set.notMember` s) (lookup k acc)
          = (filter (\(k', _) -> k /= k') acc, Set.insert k bad)
          | [p] <- Set.elems s
          = ((k, p) : acc, bad)
          | otherwise = (acc, bad)

  inserted <- forM fixedParents $ \(key, parent) -> do
    unlessM (existsKey $ StudyTermsKey' key) $
      insert_ (StudyTerms key Nothing Nothing Nothing Nothing)
    unlessM (existsKey $ StudyTermsKey' parent) $
      insert_ (StudyTerms parent Nothing Nothing Nothing Nothing)
    insertUnique $ StudySubTerms
      { studySubTermsChild  = StudyTermsKey' key
      , studySubTermsParent = StudyTermsKey' parent
      }

  mapM getJustEntity $ catMaybes inserted


-- | all existing StudyTerms that are contradiced by current observations
nameConflicts :: DB [Entity StudyTerms]
nameConflicts = E.select $ E.from $ \studyTerms -> do
  E.where_ $ E.not_ $ E.isNothing $ studyTerms E.^. StudyTermsName
  E.where_ $ E.exists $ E.from $ \candidateOne -> do
    E.where_ $ candidateOne E.^. StudyTermNameCandidateKey E.==. studyTerms E.^. StudyTermsKey
    E.where_ $ E.notExists . E.from $ \candidateTwo ->  do
        E.where_ $ candidateTwo E.^. StudyTermNameCandidateIncidence E.==. candidateOne E.^. StudyTermNameCandidateIncidence
        E.where_ $ studyTerms E.^. StudyTermsName E.==. E.just (candidateTwo E.^. StudyTermNameCandidateName)
             E.||. E.exists ( E.from $ \(pCandidate `E.LeftOuterJoin` termsTwo) -> do
                                E.on $ pCandidate E.^. StudySubTermParentCandidateParent E.==. studyTerms E.^. StudyTermsKey
                                 E.&&. E.just (pCandidate E.^. StudySubTermParentCandidateKey) E.==. termsTwo E.?. StudyTermsKey
                                E.where_ $ E.joinV (termsTwo E.?. StudyTermsName) E.==. E.just (candidateTwo E.^. StudyTermNameCandidateName)
                                     E.||. E.isNothing (E.joinV $ termsTwo E.?. StudyTermsName)
                            )
             E.||. E.exists ( E.from $ \(subTerms `E.InnerJoin` termsTwo) -> do
                                E.on $ subTerms E.^. StudySubTermsParent E.==. studyTerms E.^. StudyTermsId
                                 E.&&. subTerms E.^. StudySubTermsChild  E.==. termsTwo   E.^. StudyTermsId
                                E.where_ $ termsTwo E.^. StudyTermsName E.==. E.just (candidateTwo E.^. StudyTermNameCandidateName)
                                     E.||. E.isNothing (termsTwo E.^. StudyTermsName)
                            )
  return studyTerms


-- | retrieve all incidence keys having containing a certain @StudyTermKey @
getNameIncidencesFor :: [StudyTermsId] -> DB [E.Value TermCandidateIncidence]
getNameIncidencesFor stks = E.select $ E.distinct $ E.from $ \candidate -> do
  E.where_ $ candidate E.^. StudyTermNameCandidateKey `E.in_` E.valList stks'
  return $ candidate E.^. StudyTermNameCandidateIncidence
  where
    stks' = stks <&> unStudyTermsKey