feat(csv-import): automagically determine csv delimiters

2019-08-05 15:40:23 +02:00 · 2019-08-05 15:40:23 +02:00 · 3555322f2a
commit 3555322f2a
parent 94a120808d
2 changed files with 70 additions and 3 deletions
--- a/package.yaml
+++ b/package.yaml
@ -75,6 +75,7 @@ dependencies:
  - blaze-html
  - conduit-resumablesink >=0.2
  - parsec
+  - attoparsec
  - uuid
  - exceptions
  - stm
--- a/src/Handler/Utils/Csv.hs
+++ b/src/Handler/Utils/Csv.hs
@ -19,6 +19,9 @@ import Import hiding (Header)
 import Data.Csv
 import Data.Csv.Conduit

+-- import qualified Data.Csv.Util as Csv
+import qualified Data.Csv.Parser as Csv
+
 import qualified Data.Conduit.List as C
 import qualified Data.Conduit.Combinators as C (sourceLazy)

@ -26,6 +29,11 @@ import qualified Data.Map as Map
 import qualified Data.Vector as Vector
 import qualified Data.HashMap.Strict as HashMap

+import qualified Data.ByteString as BS
+import qualified Data.ByteString.Lazy as LBS
+
+import qualified Data.Attoparsec.ByteString.Lazy as A
+

 deriving instance Typeable CsvParseError
 instance Exception CsvParseError
@ -33,14 +41,71 @@ instance Exception CsvParseError

 typeCsv, typeCsv' :: ContentType
 typeCsv = "text/csv"
-typeCsv' = "text/csv; charset=UTF-8; header=present"
+typeCsv' = BS.intercalate "; " [typeCsv, "charset=UTF-8", "header=present"]

 extensionCsv :: Extension
 extensionCsv = fromMaybe "csv" $ listToMaybe [ ext | (ext, mime) <- Map.toList mimeMap, mime == typeCsv ]


-decodeCsv :: (MonadThrow m, FromNamedRecord csv) => Conduit ByteString m csv
-decodeCsv = transPipe throwExceptT $ fromNamedCsv defaultDecodeOptions
+decodeCsv :: (MonadThrow m, FromNamedRecord csv, MonadLogger m) => Conduit ByteString m csv
+decodeCsv = transPipe throwExceptT $ do
+  testBuffer <- accumTestBuffer LBS.empty
+  mapM_ leftover $ LBS.toChunks testBuffer
+
+  let decodeOptions = guessDecodeOptions testBuffer
+  $logInfoS "decodeCsv" [st|Guessed Csv.DecodeOptions from buffer of size #{tshow (LBS.length testBuffer)}/#{tshow testBufferSize}: #{tshow decodeOptions}|]
+  
+  fromNamedCsv decodeOptions
+  where
+    testBufferSize = 4096
+    accumTestBuffer acc
+      | LBS.length acc >= testBufferSize = return acc
+      | otherwise = do
+        frag <- await
+        case frag of
+          Nothing -> return acc
+          Just frag' -> accumTestBuffer (acc <> LBS.fromStrict frag')
+
+    guessDecodeOptions testBuffer
+      | Just firstDQuote <- doubleQuote `LBS.elemIndex` testBuffer
+      = if
+          | firstDQuote /= 0
+            -> Csv.DecodeOptions $ testBuffer `LBS.index` pred firstDQuote
+          | A.Done unused _ <- A.parse quotedField testBuffer
+            -> case A.parse endOfLine unused of
+              A.Fail _ _ _
+                | Just (nextChar, _) <- LBS.uncons unused
+                  -> defaultDecodeOptions { Csv.decDelimiter = nextChar }
+              _other -> guessDecodeOptions $ LBS.take firstDQuote testBuffer <> unused
+          | otherwise
+            -> defaultDecodeOptions -- Parsing of something, which should be a quoted field, failed; bail now
+      | A.Done _ ls <- A.parse (A.many1 $ A.manyTill A.anyWord8 endOfLine) testBuffer
+      , (h:hs) <- filter (not . Map.null) $ map (fmap getSum . Map.unionsWith mappend . map (flip Map.singleton $ Sum 1)) ls
+      , Just equals <- fromNullable $ Map.filterWithKey (\c n -> all ((== Just n) . Map.lookup c) hs) h
+      , let maxH = maximum equals
+      , [d] <- filter ((== Just maxH) . flip Map.lookup (toNullable equals)) . Map.keys $ toNullable equals
+      = defaultDecodeOptions { Csv.decDelimiter = d }
+      | otherwise
+      = defaultDecodeOptions
+
+
+    quotedField :: A.Parser () -- We don't care about the return value
+    quotedField = void . Csv.field $ Csv.decDelimiter defaultDecodeOptions -- We can use comma as a separator, because we know that the field we're trying to parse is quoted and so does not rely on the delimiter
+
+
+    endOfLine :: A.Parser ()
+    endOfLine = asum
+      [ void $ A.word8 newline
+      , mapM_ (void . A.word8) [cr, newline]
+      , void $ A.word8 cr
+      ]
+
+    doubleQuote, newline, cr :: Word8
+    doubleQuote = 34
+    newline = 10
+    cr = 13
+
+      

 encodeCsv :: ( ToNamedRecord csv
             , DefaultOrdered csv
@ -70,6 +135,7 @@ respondCsvDB src = respondSourceDB typeCsv' $ src .| encodeCsv .| awaitForever s

 fileSourceCsv :: ( FromNamedRecord csv
                 , MonadResource m
+                 , MonadLogger m
                 )
              => FileInfo
              -> Source m csv