feat(csv-import): automagically determine csv delimiters

This commit is contained in:
Gregor Kleen 2019-08-05 15:40:23 +02:00
parent 94a120808d
commit 3555322f2a
2 changed files with 70 additions and 3 deletions

View File

@ -75,6 +75,7 @@ dependencies:
- blaze-html - blaze-html
- conduit-resumablesink >=0.2 - conduit-resumablesink >=0.2
- parsec - parsec
- attoparsec
- uuid - uuid
- exceptions - exceptions
- stm - stm

View File

@ -19,6 +19,9 @@ import Import hiding (Header)
import Data.Csv import Data.Csv
import Data.Csv.Conduit import Data.Csv.Conduit
-- import qualified Data.Csv.Util as Csv
import qualified Data.Csv.Parser as Csv
import qualified Data.Conduit.List as C import qualified Data.Conduit.List as C
import qualified Data.Conduit.Combinators as C (sourceLazy) import qualified Data.Conduit.Combinators as C (sourceLazy)
@ -26,6 +29,11 @@ import qualified Data.Map as Map
import qualified Data.Vector as Vector import qualified Data.Vector as Vector
import qualified Data.HashMap.Strict as HashMap import qualified Data.HashMap.Strict as HashMap
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import qualified Data.Attoparsec.ByteString.Lazy as A
deriving instance Typeable CsvParseError deriving instance Typeable CsvParseError
instance Exception CsvParseError instance Exception CsvParseError
@ -33,14 +41,71 @@ instance Exception CsvParseError
typeCsv, typeCsv' :: ContentType typeCsv, typeCsv' :: ContentType
typeCsv = "text/csv" typeCsv = "text/csv"
typeCsv' = "text/csv; charset=UTF-8; header=present" typeCsv' = BS.intercalate "; " [typeCsv, "charset=UTF-8", "header=present"]
extensionCsv :: Extension extensionCsv :: Extension
extensionCsv = fromMaybe "csv" $ listToMaybe [ ext | (ext, mime) <- Map.toList mimeMap, mime == typeCsv ] extensionCsv = fromMaybe "csv" $ listToMaybe [ ext | (ext, mime) <- Map.toList mimeMap, mime == typeCsv ]
decodeCsv :: (MonadThrow m, FromNamedRecord csv) => Conduit ByteString m csv decodeCsv :: (MonadThrow m, FromNamedRecord csv, MonadLogger m) => Conduit ByteString m csv
decodeCsv = transPipe throwExceptT $ fromNamedCsv defaultDecodeOptions decodeCsv = transPipe throwExceptT $ do
testBuffer <- accumTestBuffer LBS.empty
mapM_ leftover $ LBS.toChunks testBuffer
let decodeOptions = guessDecodeOptions testBuffer
$logInfoS "decodeCsv" [st|Guessed Csv.DecodeOptions from buffer of size #{tshow (LBS.length testBuffer)}/#{tshow testBufferSize}: #{tshow decodeOptions}|]
fromNamedCsv decodeOptions
where
testBufferSize = 4096
accumTestBuffer acc
| LBS.length acc >= testBufferSize = return acc
| otherwise = do
frag <- await
case frag of
Nothing -> return acc
Just frag' -> accumTestBuffer (acc <> LBS.fromStrict frag')
guessDecodeOptions testBuffer
| Just firstDQuote <- doubleQuote `LBS.elemIndex` testBuffer
= if
| firstDQuote /= 0
-> Csv.DecodeOptions $ testBuffer `LBS.index` pred firstDQuote
| A.Done unused _ <- A.parse quotedField testBuffer
-> case A.parse endOfLine unused of
A.Fail _ _ _
| Just (nextChar, _) <- LBS.uncons unused
-> defaultDecodeOptions { Csv.decDelimiter = nextChar }
_other -> guessDecodeOptions $ LBS.take firstDQuote testBuffer <> unused
| otherwise
-> defaultDecodeOptions -- Parsing of something, which should be a quoted field, failed; bail now
| A.Done _ ls <- A.parse (A.many1 $ A.manyTill A.anyWord8 endOfLine) testBuffer
, (h:hs) <- filter (not . Map.null) $ map (fmap getSum . Map.unionsWith mappend . map (flip Map.singleton $ Sum 1)) ls
, Just equals <- fromNullable $ Map.filterWithKey (\c n -> all ((== Just n) . Map.lookup c) hs) h
, let maxH = maximum equals
, [d] <- filter ((== Just maxH) . flip Map.lookup (toNullable equals)) . Map.keys $ toNullable equals
= defaultDecodeOptions { Csv.decDelimiter = d }
| otherwise
= defaultDecodeOptions
quotedField :: A.Parser () -- We don't care about the return value
quotedField = void . Csv.field $ Csv.decDelimiter defaultDecodeOptions -- We can use comma as a separator, because we know that the field we're trying to parse is quoted and so does not rely on the delimiter
endOfLine :: A.Parser ()
endOfLine = asum
[ void $ A.word8 newline
, mapM_ (void . A.word8) [cr, newline]
, void $ A.word8 cr
]
doubleQuote, newline, cr :: Word8
doubleQuote = 34
newline = 10
cr = 13
encodeCsv :: ( ToNamedRecord csv encodeCsv :: ( ToNamedRecord csv
, DefaultOrdered csv , DefaultOrdered csv
@ -70,6 +135,7 @@ respondCsvDB src = respondSourceDB typeCsv' $ src .| encodeCsv .| awaitForever s
fileSourceCsv :: ( FromNamedRecord csv fileSourceCsv :: ( FromNamedRecord csv
, MonadResource m , MonadResource m
, MonadLogger m
) )
=> FileInfo => FileInfo
-> Source m csv -> Source m csv