fradrive/src/Handler/Utils/Csv.hs
2022-10-12 09:35:16 +02:00

414 lines
16 KiB
Haskell

-- SPDX-FileCopyrightText: 2022 Gregor Kleen <gregor.kleen@ifi.lmu.de>,Steffen Jost <jost@tcs.ifi.lmu.de>
--
-- SPDX-License-Identifier: AGPL-3.0-or-later
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Handler.Utils.Csv
( decodeCsv, decodeCsvPositional, decodeCsvWith
, encodeCsv, encodeCsvWith, encodeCsvRendered, encodeCsvRenderedWith
, csvRenderedToTypedContent, csvRenderedToTypedContentWith
, expectedCsvFormat, expectedCsvContentType
, encodeDefaultOrderedCsv
, respondCsv, respondCsvDB
, respondDefaultOrderedCsv, respondDefaultOrderedCsvDB
, fileSourceCsv, fileSourceCsvPositional
, partIsAttachmentCsv, setContentDispositionCsv
, csvOptionsForFormat
, CsvParseError(..)
, ToNamedRecord(..), FromNamedRecord(..)
, DefaultOrdered(..)
, ToField(..), FromField(..)
, recodeCsv
) where
import Import hiding (Header, mapM_)
import Data.Csv
import Data.Csv.Conduit
import Handler.Utils.Form (uploadContents)
import Handler.Utils.ContentDisposition (setContentDisposition')
import Control.Monad (mapM_)
-- import qualified Data.Csv.Util as Csv
import qualified Data.Csv.Parser as Csv
import qualified Data.Conduit.List as C (mapMaybe)
import qualified Data.Conduit.Combinators as C
import qualified Data.Map as Map
import qualified Data.Vector as Vector
import qualified Data.HashMap.Strict as HashMap
import qualified Data.ByteString.Lazy as LBS
import qualified Data.Attoparsec.ByteString.Lazy as A
import Data.Time.Format (iso8601DateFormat)
import qualified Data.Char as Char
import Control.Monad.Error.Class (MonadError(..))
import Data.Time.Clock.POSIX (getPOSIXTime)
import qualified Data.Time.Format as Time
-- import qualified Codec.Archive.Zip as Zip
_haltingCsvParseError :: Prism' CsvParseError CsvStreamHaltParseError
_haltingCsvParseError = prism' (\(HaltingCsvParseError bs t) -> CsvParseError bs t) $ \case
CsvParseError bs t -> Just $ HaltingCsvParseError bs t
_other -> Nothing
_csvStreamRecordParseError :: Prism' CsvParseError CsvStreamRecordParseError
_csvStreamRecordParseError = prism' (\(CsvStreamRecordParseError t) -> IncrementalError t) $ \case
IncrementalError t -> Just $ CsvStreamRecordParseError t
_other -> Nothing
throwIncrementalErrors :: MonadError CsvParseError m => ConduitT (Either CsvStreamRecordParseError a) a m ()
throwIncrementalErrors = C.mapM $ either (throwError . review _csvStreamRecordParseError) return
newtype MaybeEmptyRecord csv = MaybeEmptyRecord { unMaybeEmptyRecord :: Maybe csv }
instance FromNamedRecord csv => FromNamedRecord (MaybeEmptyRecord csv) where
parseNamedRecord r
| all null r = pure $ MaybeEmptyRecord Nothing
| otherwise = MaybeEmptyRecord . Just <$> parseNamedRecord r
instance FromRecord csv => FromRecord (MaybeEmptyRecord csv) where
parseRecord r
| all null r = pure $ MaybeEmptyRecord Nothing
| otherwise = MaybeEmptyRecord . Just <$> parseRecord r
decodeCsv :: (MonadHandler m, HandlerSite m ~ UniWorX, MonadThrow m, FromNamedRecord csv) => ConduitT ByteString csv m ()
decodeCsv = decodeCsv' $ \opts -> fromNamedCsvStreamError opts (review _haltingCsvParseError) .| throwIncrementalErrors
decodeCsvPositional :: (MonadHandler m, HandlerSite m ~ UniWorX, MonadThrow m, FromRecord csv) => HasHeader -> ConduitT ByteString csv m ()
decodeCsvPositional hdr = decodeCsv' $ \opts -> fromCsvStreamError opts hdr (review _haltingCsvParseError) .| throwIncrementalErrors
decodeCsvWith :: (MonadHandler m, HandlerSite m ~ UniWorX, MonadThrow m, FromNamedRecord csv, FromRecord csv) => CsvOptions -> ConduitT ByteString csv m ()
decodeCsvWith opts
| csvIncludeHeader fmtOpts
= decodeCsv' $ \_ -> fromNamedCsvStreamError decOpts (review _haltingCsvParseError) .| throwIncrementalErrors
| otherwise
= decodeCsv' $ \_ -> fromCsvStreamError decOpts NoHeader (review _haltingCsvParseError) .| throwIncrementalErrors
where
fmtOpts = csvFormat opts
decOpts = DecodeOptions { decDelimiter = fromIntegral $ Char.ord $ csvDelimiter fmtOpts }
decodeCsv' :: forall csv m.
( MonadHandler m, HandlerSite m ~ UniWorX
, MonadThrow m
)
=> (forall m'. Monad m' => DecodeOptions -> ConduitT ByteString (MaybeEmptyRecord csv) (ExceptT CsvParseError m') ())
-> ConduitT ByteString csv m ()
decodeCsv' fromCsv' = do
encOpts <- maybe def (userCsvOptions . entityVal) <$> lift maybeAuth
recodeCsv encOpts False decodeCsv'' .| C.mapMaybe unMaybeEmptyRecord
where
decodeCsv'' = transPipe throwExceptT $ do
testBuffer <- accumTestBuffer LBS.empty
mapM_ leftover $ LBS.toChunks testBuffer
let decodeOptions = defaultDecodeOptions
& guessDelimiter testBuffer
$logInfoS "decodeCsv" [st|Guessed Csv.DecodeOptions from buffer of size #{tshow (LBS.length testBuffer)}/#{tshow testBufferSize}: #{tshow decodeOptions}|]
fromCsv' decodeOptions
testBufferSize :: Num a => a
testBufferSize = 4096
accumTestBuffer acc
| LBS.length acc >= testBufferSize = return acc
| otherwise = do
frag <- await
case frag of
Nothing -> return acc
Just frag' -> accumTestBuffer (acc <> LBS.fromStrict frag')
guessDelimiter testBuffer
| Just firstDQuote <- doubleQuote `LBS.elemIndex` testBuffer
= if
| firstDQuote /= 0
-> \x -> x { Csv.decDelimiter = testBuffer `LBS.index` pred firstDQuote }
| A.Done unused _ <- A.parse quotedField testBuffer
-> case A.parse endOfLine unused of
A.Fail{}
| Just (nextChar, _) <- LBS.uncons unused
-> \x -> x { Csv.decDelimiter = nextChar }
_other -> guessDelimiter $ LBS.take firstDQuote testBuffer <> unused
| otherwise
-> id -- Parsing of something, which should be a quoted field, failed; bail now
| A.Done _ ls <- A.parse (A.many1 $ A.manyTill A.anyWord8 endOfLine) testBuffer
, (h:hs) <- filter (not . Map.null) $ map (fmap getSum . Map.unionsWith mappend . map (flip Map.singleton $ Sum 1) . filter (not . isAlphaNum')) ls
, Just equals <- fromNullable $ Map.filterWithKey (\c n -> all ((== Just n) . Map.lookup c) hs) h
, let maxH = maximum equals
, [d] <- filter ((== Just maxH) . flip Map.lookup (toNullable equals)) . Map.keys $ toNullable equals
= \x -> x { Csv.decDelimiter = d }
| otherwise
= id
isAlphaNum' = Char.isAlphaNum . Char.chr . fromIntegral
quotedField :: A.Parser () -- We don't care about the return value
quotedField = void . Csv.field $ Csv.decDelimiter defaultDecodeOptions -- We can use comma as a separator, because we know that the field we're trying to parse is quoted and so does not rely on the delimiter
endOfLine :: A.Parser ()
endOfLine = asum
[ void $ A.word8 newline
, mapM_ A.word8 [cr, newline]
, void $ A.word8 cr
]
doubleQuote, newline, cr :: Word8
doubleQuote = 34
newline = 10
cr = 13
encodeCsv :: ( ToNamedRecord csv
, MonadHandler m
, HandlerSite m ~ UniWorX
, RenderMessage UniWorX msg
)
=> msg -- ^ Sheet name for .xlsx
-> Header
-> ConduitT csv ByteString m CsvFormat
-- ^ Encode a stream of records
--
-- Currently not streaming
encodeCsv sheetName hdr = do
encOpts <- maybe def (userCsvOptions . entityVal) <$> lift maybeAuth
encodeCsvWith encOpts sheetName hdr
encodeCsvWith :: ( ToNamedRecord csv
, MonadHandler m
, HandlerSite m ~ UniWorX
, RenderMessage UniWorX msg
)
=> CsvOptions
-> msg -- ^ Sheet name for .xlsx
-> Header
-> ConduitT csv ByteString m CsvFormat
-- ^ Encode a stream of records
--
-- Currently not streaming
encodeCsvWith encOpts sheetName hdr = transPipe liftHandler $ case encOpts ^. _csvFormat of
CsvFormatOptions{}
| Just csvOpts <- encOpts ^? _csvFormat . _CsvEncodeOptions, has (_csvFormat . _CsvFormat . _FormatCsv) encOpts -> do
(C.sourceLazy . encodeByNameWith csvOpts hdr =<< C.foldMap pure) .| recode'
return FormatCsv
| otherwise -> error "encOpts is CsvFormatOptions but cannot be converted via _CsvEncodeOptions or hasn't _FormatCsv"
CsvXlsxFormatOptions{}
| has (_csvFormat . _CsvFormat . _FormatXlsx) encOpts -> do
rendered <- toCsvRendered hdr <$> C.foldMap (pure @Seq)
sheetName' <- ($ sheetName) <$> getMessageRender
pNow <- liftIO getPOSIXTime
C.sourceLazy (fromXlsx pNow $ csvRenderedToXlsx sheetName' rendered) .| recode'
return FormatXlsx
| otherwise -> error "encOpts hasn't _FormatXlsx"
where recode' = recodeCsv encOpts True $ C.map id
encodeCsvRendered :: ( MonadHandler m
, HandlerSite m ~ UniWorX
, RenderMessage UniWorX msg
)
=> msg -- ^ Sheet name for .xlsx
-> CsvRendered
-> m (CsvFormat, LBS.ByteString)
encodeCsvRendered sheetName CsvRendered{..} = runConduit $ yieldMany csvRenderedData .| (encodeCsv sheetName csvRenderedHeader `fuseBoth` C.sinkLazy)
encodeCsvRenderedWith :: ( MonadHandler m
, HandlerSite m ~ UniWorX
, RenderMessage UniWorX msg
)
=> CsvOptions
-> msg -- ^ Sheet name for .xlsx
-> CsvRendered
-> m (CsvFormat, LBS.ByteString)
encodeCsvRenderedWith encOpts sheetName CsvRendered{..} = runConduit $ yieldMany csvRenderedData .| (encodeCsvWith encOpts sheetName csvRenderedHeader `fuseBoth` C.sinkLazy)
csvRenderedToTypedContent :: ( MonadHandler m
, HandlerSite m ~ UniWorX
, RenderMessage UniWorX msg
)
=> msg -- ^ Sheet name for .xlsx
-> CsvRendered
-> m TypedContent
csvRenderedToTypedContent sheetName csvRendered = do
encOpts <- maybe def (userCsvOptions . entityVal) <$> maybeAuth
csvRenderedToTypedContentWith encOpts sheetName csvRendered
csvRenderedToTypedContentWith :: ( MonadHandler m
, HandlerSite m ~ UniWorX
, RenderMessage UniWorX msg
)
=> CsvOptions
-> msg -- ^ Sheet name for .xlsx
-> CsvRendered
-> m TypedContent
csvRenderedToTypedContentWith encOpts sheetName csvRendered = do
(csvFormat, resp) <- encodeCsvRenderedWith encOpts sheetName csvRendered
let cType = case csvFormat of
FormatCsv -> typeCsv'
FormatXlsx -> typeXlsx
return . TypedContent cType $ toContent resp
timestampCsv :: ( MonadHandler m
, HandlerSite m ~ UniWorX
)
=> m (FilePath -> FilePath)
timestampCsv = do
csvOpts <- fmap (maybe def $ userCsvOptions . entityVal) maybeAuth
timestampCsv' csvOpts . review _Wrapped =<< languages
timestampCsv' :: MonadIO m
=> CsvOptions -> Languages -> m (FilePath -> FilePath)
timestampCsv' csvOpts (Languages langs) = liftIO $ if
| csvOpts ^. _csvTimestamp -> do
ts <- getCurrentTime <&> Time.formatTime (getTimeLocale' langs) (iso8601DateFormat $ Just "%H%M")
return $ (<>) (ts <> "-")
| otherwise -> return id
expectedCsvFormat :: ( MonadHandler m
, HandlerSite m ~ UniWorX
)
=> m CsvFormat
expectedCsvFormat = view (_csvFormat . _CsvFormat) . maybe def (userCsvOptions . entityVal) <$> maybeAuth
expectedCsvContentType :: ( MonadHandler m
, HandlerSite m ~ UniWorX
)
=> m ContentType
expectedCsvContentType = expectedCsvFormat <&> \case
FormatCsv -> typeCsv'
FormatXlsx -> typeXlsx
partIsAttachmentCsv :: (RenderMessage UniWorX msg, MonadMail m, HandlerSite m ~ UniWorX)
=> msg
-> StateT Part m ()
partIsAttachmentCsv fName' = do
csvOpts <- lift askMailCsvOptions
langs <- lift askMailLanguages
fName <- ($ fName') <$> lift getMailMessageRender
ts <- timestampCsv' csvOpts langs
let ext = case csvOpts ^. _csvFormat . _CsvFormat of
FormatCsv -> extensionCsv
FormatXlsx -> extensionXlsx
partIsAttachment . ts $ unpack fName `addExtension` unpack ext
setContentDispositionCsv :: (RenderMessage UniWorX msg, MonadHandler m, HandlerSite m ~ UniWorX)
=> msg
-> m ()
setContentDispositionCsv fName' = do
fName <- unpack . ($ fName') <$> getMessageRender
ts <- timestampCsv
fmt <- expectedCsvFormat
let ext = case fmt of
FormatCsv -> extensionCsv
FormatXlsx -> extensionXlsx
setContentDisposition' . Just $ ensureExtension (unpack ext) (ts fName)
encodeDefaultOrderedCsv :: forall csv m msg.
( ToNamedRecord csv
, DefaultOrdered csv
, MonadHandler m
, HandlerSite m ~ UniWorX
, RenderMessage UniWorX msg
)
=> msg -- ^ Sheet name for .xlsx
-> ConduitT csv ByteString m CsvFormat
encodeDefaultOrderedCsv sheetName = encodeCsv sheetName $ headerOrder (error "headerOrder" :: csv)
respondCsv :: ( ToNamedRecord csv
, RenderMessage UniWorX msg
)
=> msg -- ^ Sheet name for .xlsx
-> Header
-> ConduitT () csv Handler ()
-> Handler TypedContent
respondCsv sheetName hdr src = do
cType <- expectedCsvContentType
respondSource cType $ src .| void (encodeCsv sheetName hdr) .| awaitForever sendChunk
respondDefaultOrderedCsv :: forall csv msg.
( ToNamedRecord csv
, DefaultOrdered csv
, RenderMessage UniWorX msg
)
=> msg -- ^ Sheet name for .xlsx
-> ConduitT () csv Handler ()
-> Handler TypedContent
respondDefaultOrderedCsv sheetName = respondCsv sheetName $ headerOrder (error "headerOrder" :: csv)
respondCsvDB :: ( ToNamedRecord csv
, RenderMessage UniWorX msg
)
=> msg -- ^ Sheet name for .xlsx
-> Header
-> ConduitT () csv DB ()
-> Handler TypedContent
respondCsvDB sheetName hdr src = do
cType <- expectedCsvContentType
respondSourceDB cType $ src .| void (encodeCsv sheetName hdr) .| awaitForever sendChunk
respondDefaultOrderedCsvDB :: forall csv msg.
( ToNamedRecord csv
, DefaultOrdered csv
, RenderMessage UniWorX msg
)
=> msg -- ^ Sheet name for .xlsx
-> ConduitT () csv DB ()
-> Handler TypedContent
respondDefaultOrderedCsvDB sheetName = respondCsvDB sheetName $ headerOrder (error "headerOrder" :: csv)
fileSourceCsv :: ( FromNamedRecord csv
, MonadThrow m
, MonadHandler m
, HandlerSite m ~ UniWorX
)
=> ConduitT FileReference csv m ()
fileSourceCsv = uploadContents .| decodeCsv
fileSourceCsvPositional :: ( MonadHandler m
, HandlerSite m ~ UniWorX
, MonadThrow m
, FromRecord csv
)
=> HasHeader
-> ConduitT FileReference csv m ()
fileSourceCsvPositional hdr = uploadContents .| decodeCsvPositional hdr
instance ToWidget UniWorX CsvRendered where
toWidget CsvRendered{..} = liftWidget $(widgetFile "widgets/csvRendered")
where
csvData = [ [ decodeUtf8 <$> HashMap.lookup columnKey row
| columnKey <- Vector.toList csvRenderedHeader
]
| row <- csvRenderedData
]
headers = decodeUtf8 <$> Vector.toList csvRenderedHeader
csvOptionsForFormat :: ( MonadHandler m, HandlerSite m ~ UniWorX )
=> CsvFormat
-> m CsvOptions
csvOptionsForFormat fmt = do
csvOpts <- fmap (maybe def $ userCsvOptions . entityVal) maybeAuth
return $ if
| fmt == csvOpts ^. _csvFormat . _CsvFormat
-> csvOpts
| otherwise
-> csvOpts & _csvFormat .~ (csvPreset . _CsvFormatPreset # fmt)