fradrive/src/Handler/Utils/TermCandidates.hs

module Handler.Utils.TermCandidates where

import Import
-- import Handler.Utils


-- Import this module as Candidates

-- import Utils.Lens

-- import Data.Time
-- import qualified Data.Text as T
-- import Data.Function ((&))
-- import Yesod.Form.Bootstrap3
-- import Colonnade hiding (fromMaybe)
-- import Yesod.Colonnade
-- import qualified Data.UUID.Cryptographic as UUID
-- import Control.Monad.Trans.Writer (mapWriterT)
-- import Database.Persist.Sql (fromSqlKey)
import Data.Set (Set)
import qualified Data.Set as Set
import qualified Data.List as List
import Data.Map (Map)
import qualified Data.Map as Map


import qualified Database.Esqueleto as E
-- import Database.Esqueleto.Utils as E


type STKey = Int -- Key StudyTerms -- for convenience, assmued identical to field StudyTermCandidateKey

data FailedCandidateInference = FailedCandidateInference [Entity StudyTerms]
  deriving (Typeable)

instance Show FailedCandidateInference where
  show (FailedCandidateInference _) = "Failed Candidate Inference" -- TODO

instance Exception FailedCandidateInference
  -- Default Instance

-- | Just an heuristik to fill in defaults
shortenStudyTerm :: Text -> Text
shortenStudyTerm = concatMap (take 4) . splitCamel

-- | Attempt to identify new StudyTerms based on observations
inferHandler :: Handler ([UUID],([Entity StudyTerms],[Entity StudyTermCandidate],[(STKey,Text)]))
inferHandler = do
    (ambiguous, problems) <- runDB $ (,) <$> removeAmbiguous <*> conflicts
    if (null problems)
      then do
        infRes <- inferAcc ([],[])
        return (ambiguous, infRes)
      else
        return (ambiguous,(problems,[],[]))

  where
    inferAcc (accRedundants, accAccepted) =
      handle (\(FailedCandidateInference fails) -> return (fails,accRedundants,accAccepted)) $ do
        (infReds,infAccs) <- runDB inferStep
        if null infAccs
          then return ([], infReds ++ accRedundants,            accAccepted)
          else inferAcc   (infReds ++ accRedundants, infAccs ++ accAccepted)

    inferStep = do
      redundants <- removeRedundant
      accepted <- acceptSingletons
      problems <- conflicts
      when (not $ null problems) $ throw $ FailedCandidateInference problems
      return (redundants, accepted)


-- | Attempt to identify new StudyTerms based on observations
-- infer :: MonadHandler m => m ([Entity StudyTerms],[Entity StudyTerms])
infer :: DB ([Entity StudyTerms],[(STKey, Text)])
infer = do
    void removeAmbiguous -- TODO: show result
    inferAcc []
  where
    inferAcc prevSet = do
      problems <- conflicts
      if null problems
        then do
          void removeRedundant -- TODO: show result
          newSet <- acceptSingletons
          if null newSet
            then -- inference complete
              return ([],prevSet)
            else
              inferAcc (newSet ++ prevSet)
        else --abort
          return (problems,prevSet)


{-
Candidate 1 11 "A"
Candidate 1 11 "B"
Candidate 1 12 "A"
Candidate 1 12 "B"
Candidate 2 12 "B"
Candidate 2 12 "C"
Candidate 2 13 "B"
Candidate 2 13 "C"

should readily yield 11/A, 12/B 13/C:

it can infer due to overlab that 12/B must be true, then eliminating B identifies A and C;
this rests on the assumption that the Names are unique, which is NOT TRUE;
as a fix we simply eliminate all observations that have the same name twice, see removeInconsistent

-}

-- | remove candidates with ambiguous observations,
--   ie. candidates that have duplicated term names with differing keys
--   which may happen in rare cases
removeAmbiguous :: DB [UUID]
removeAmbiguous = do
  ambiList <- E.select $ E.from $ \(candA `E.InnerJoin` candB) -> do
    -- Either an innerJoin with itself or an exists-sub-select
    E.on $    (candA E.^. StudyTermCandidateIncidence E.==. candB E.^. StudyTermCandidateIncidence)
        E.&&. (candA E.^. StudyTermCandidateKey       E.!=. candB E.^. StudyTermCandidateKey)
        E.&&. (candA E.^. StudyTermCandidateName      E.==. candB E.^. StudyTermCandidateName)
        E.&&. (candA E.^. StudyTermCandidateId        E.!=. candB E.^. StudyTermCandidateId) -- should not be needed, but does not hurt either
    return $ candA E.^. StudyTermCandidateIncidence
  let ambiSet = E.unValue <$> List.nub ambiList
  -- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps.
  deleteWhere [StudyTermCandidateIncidence <-. ambiSet]
  return ambiSet


-- | remove known StudyTerm from candidates that have the _exact_ name,
--   ie. if a candidate contains a known key, we remove it and its associated fullname
--   only save if ambiguous candidates haven been removed
removeRedundant :: DB [Entity StudyTermCandidate]
removeRedundant = do
  redundants <- E.select $ E.distinct $ E.from $ \(candidate `E.InnerJoin` sterm) -> do
    E.on $  candidate E.^. StudyTermCandidateKey  E.==. sterm E.^. StudyTermsKey
      E.&&. E.just (candidate E.^. StudyTermCandidateName) E.==. sterm E.^. StudyTermsName
    return candidate
  -- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps.
  forM_ redundants $ \Entity{entityVal=StudyTermCandidate{..}} ->
    deleteWhere $   ( StudyTermCandidateIncidence ==. studyTermCandidateIncidence )
                :  ([ StudyTermCandidateKey       ==. studyTermCandidateKey       ]
                ||. [ StudyTermCandidateName      ==. studyTermCandidateName      ])
  return redundants


-- | Search for single candidates and memorize them as StudyTerms.
--   Should be called after @removeRedundant@ to increase success chances and reduce cost; otherwise memory heavy!
--   Does not delete the used candidates, user @removeRedundant@ for this later on.
--   Esqueleto does not provide the INTERESECT operator, thus
--   we load the table into Haskell and operate there. Memory usage problem? StudyTermsCandidate may become huge.
acceptSingletons :: DB [(STKey,Text)]
acceptSingletons = do
  knownKeys <- fmap unStudyTermsKey <$> selectKeysList [] [Asc StudyTermsKey]
  -- let knownKeysSet = Set.fromAscList knownKeys
  -- In case of memory problems, change next lines to conduit proper:
  incidences <- fmap entityVal <$> selectList [StudyTermCandidateKey /<-. knownKeys] [] -- LimitTo might be dangerous here, if we get a partial incidence. Possibly first select N incidences, then retrieving all those only.
  -- incidences <- E.select $ E.from $ \candidate -> do
  --   E.where_ $ candidate E.^. StudyTermCandidayeKey `E.notIn` E.valList knownKeys
  --   return candidate

  -- Possibly expensive pure computations follows. Break runDB to shorten transaction?
  let groupedCandidates :: Map STKey (Map UUID (Set Text))
      groupedCandidates = foldl' groupFun mempty incidences

      -- given a key, map each incidence to set of possible names for this key
      groupFun :: Map STKey (Map UUID (Set Text)) -> StudyTermCandidate -> Map STKey (Map UUID (Set Text))
      groupFun m StudyTermCandidate{..} =
        insertWith (Map.unionWith Set.union)
                   studyTermCandidateKey
                   (Map.singleton studyTermCandidateIncidence $ Set.singleton studyTermCandidateName)
                   m

      -- pointwise intersection per incidence gives possible candidates per key
      keyCandidates :: Map STKey (Set Text)
      keyCandidates = Map.map (setIntersections . Map.elems) groupedCandidates

      -- filter candidates having a unique possibility left
      fixedKeys :: [(STKey,Text)]
      fixedKeys = Map.foldlWithKey' combFixed [] keyCandidates

      combFixed :: [(STKey,Text)] -> STKey -> Set Text ->  [(STKey,Text)]
      combFixed acc k s | Set.size s == 1 -- possibly redundant
                        , [n] <- Set.elems s = (k,n):acc
                        -- empty sets should not occur here , if LDAP is consistent. Maybe raise a warning?!
                        | otherwise = acc

      -- registerFixed :: (STKey, Text) -> DB (Key StudyTerms)
      registerFixed :: (STKey, Text) -> DB ()
      registerFixed (key, name) =
        -- insertKey (StudyTermsKey key) $ StudyTerms key (Just $ shortenStudyTerm name) (Just name) -- name clash!
        void . insert $ StudyTerms key (Just $ shortenStudyTerm name) (Just name)


  -- register newly fixed candidates
  forM_ fixedKeys registerFixed
  return fixedKeys


  -- SOME EARLIER ATTEMPTS FOLLOW:
  --
  -- unknownKeys <- E.select $ E.distinct $ E.from $ \candidate -> do
  --    E.where_ $ E.notExists $ E.from $ \sterm ->
  --      E.where_ $ candidate E.^. StudyTermCandidateKey E.==. sterm E.^. StudyTermKey
  --    return $ candidate E.^. StudyTermCandidateKey
  -- forM unknownKeys $ \(E.Value key) -> do
  --   incidences <- E.select $ E.from $ \candidate -> do
  --     E.where_ $
  --
  -- -- DON'T KNOW HOW TO DO IN SQL :( BUT WE NEED THE ENTIRE TABLE ANYHOW
  -- candidates <- entityVal <$> selectList [] [] -- load entire candidate table
  -- -- create map from UUID to set of candidates for efficiency
  -- let collectCandidates m stc@StudyTermCandidate{studyTermCandidateIncidence=inci}
  --       = insertWith Set.union inci stc
  --     incidences = foldl collectCandidates Map.empty candidates
  --
  --     collectKeys m
  --     keySets = foldl collectKeys Map.empty candidates
  --
  -- -- StudyTermCandidateKey -> Set StudyTermCandidateName


-- | all existing StudyTerms that are contradiced by current observations
conflicts :: DB [Entity StudyTerms]
conflicts = E.select $ E.from $ \studyTerms -> do
  E.where_ $ E.not_ $ E.isNothing $ studyTerms E.^. StudyTermsName
  E.where_ $ E.exists $ E.from $ \candidateOne -> do
    E.where_ $ candidateOne E.^. StudyTermCandidateKey E.==. studyTerms E.^. StudyTermsKey
    E.where_ $ E.notExists . E.from $ \candidateTwo ->  do
        E.where_ $ candidateTwo E.^. StudyTermCandidateIncidence E.==. candidateOne E.^. StudyTermCandidateIncidence
        E.where_ $ studyTerms E.^. StudyTermsName E.==. E.just (candidateTwo E.^. StudyTermCandidateName)
  return studyTerms