copied some of cassavas parsing stuff. they omptimize pretty well.

2016-06-26 13:48:53 -04:00 · 2016-06-26 13:48:53 -04:00 · cb11a8bd9a
commit cb11a8bd9a
parent db725eba69
4 changed files with 234 additions and 1 deletions
--- a/siphon/siphon.cabal
+++ b/siphon/siphon.cabal
@ -20,6 +20,7 @@ library
    Siphon
    Siphon.Types
    Siphon.Encoding
+    Siphon.Internal
  build-depends:
      base >= 4.7 && < 5
    , colonnade
@ -28,6 +29,7 @@ library
    , contravariant
    , vector
    , pipes
+    , attoparsec
  default-language:    Haskell2010

 source-repository head
--- a/siphon/src/Siphon/Decoding.hs
+++ b/siphon/src/Siphon/Decoding.hs
@ -1,6 +1,7 @@
 module Siphon.Decoding where

 import Siphon.Types
+import qualified Data.Attoparsec.Types as Atto

 -- unrow :: c1 -> (Vector c2,c1)
 -- 
@ -8,3 +9,33 @@ import Siphon.Types
 --     -> Decoding (Indexed f) c a
 --     -> Vector c
 --     -> Either DecodingErrors a
+
+pipe :: SiphonDecoding c1 c2 
+     -> Atto.Parser c1 (WithEnd c2) 
+     -> Pipe c1 (Vector c2) m String
+pipe (SiphonDecoding parse isNull) p = do
+  c1 <- awaitSkip isNull
+  case parse p c1 of
+
+awaitSkip :: (a -> Bool)
+          -> Consumer' a m a
+awaitSkip f = go where
+  go = do
+    a <- await
+    if f a then go else return a 
+
+nextSkipEmpty
+  :: (Monad m, Eq a, Monoid a)
+  => Producer a m r
+  -> m (Either r (a, Producer a m r))
+nextSkipEmpty = go where
+    go p0 = do
+      x <- next p0
+      case x of
+         Left  _        -> return x
+         Right (a,p1)
+          | a == mempty -> go p1
+          | otherwise   -> return x
+{-# INLINABLE nextSkipEmpty #-}
+
+
--- a/siphon/src/Siphon/Internal.hs
+++ b/siphon/src/Siphon/Internal.hs
@ -0,0 +1,188 @@
+{-# LANGUAGE BangPatterns #-}
+
+-- | A CSV parser. The parser defined here is RFC 4180 compliant, with
+-- the following extensions:
+--
+--  * Empty lines are ignored.
+--
+--  * Non-escaped fields may contain any characters except
+--    double-quotes, commas, carriage returns, and newlines.
+--
+--  * Escaped fields may contain any characters (but double-quotes
+--    need to be escaped).
+--
+-- The functions in this module can be used to implement e.g. a
+-- resumable parser that is fed input incrementally.
+module Siphon.Internal where
+
+import Siphon.Types
+
+import Data.ByteString.Builder (toLazyByteString,byteString)
+import qualified Data.ByteString.Char8 as BC8
+import Control.Applicative (optional)
+import Data.Attoparsec.ByteString.Char8 (char, endOfInput, string)
+import qualified Data.Attoparsec.ByteString as A
+import qualified Data.Attoparsec.Lazy as AL
+import qualified Data.Attoparsec.Zepto as Z
+import qualified Data.ByteString as S
+import qualified Data.ByteString.Unsafe as S
+import qualified Data.Vector as V
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Lazy as LByteString
+import Data.Word (Word8)
+import Data.Vector (Vector)
+import Data.ByteString (ByteString)
+
+import Control.Applicative
+import Data.Monoid
+
+-- parse :: Monad m
+--       => SiphonDecoding c1 c2
+--       -> Atto.Parser a b -- ^ Attoparsec parser
+--       -> Pipes.Parser a m (Maybe (Either ParsingError b)) -- ^ Pipes parser
+-- parse parser = S.StateT $ \p0 -> do
+--     x <- nextSkipEmpty p0
+--     case x of
+--       Left r       -> return (Nothing, return r)
+--       Right (a,p1) -> step (yield a >>) (_parse parser a) p1
+--   where
+--     step diffP res p0 = case res of
+--       Fail _ c m -> return (Just (Left (ParsingError c m)), diffP p0)
+--       Done a b   -> return (Just (Right b), yield a >> p0)
+--       Partial k  -> do
+--         x <- nextSkipEmpty p0
+--         case x of
+--           Left e -> step diffP (k mempty) (return e)
+--           Right (a,p1) -> step (diffP . (yield a >>)) (k a) p1
+
+-- | Specialized version of 'sepBy1'' which is faster due to not
+-- accepting an arbitrary separator.
+sepByDelim1' :: AL.Parser a
+             -> Word8  -- ^ Field delimiter
+             -> AL.Parser [a]
+sepByDelim1' p !delim = liftM2' (:) p loop
+  where
+    loop = do
+        mb <- A.peekWord8
+        case mb of
+            Just b | b == delim -> liftM2' (:) (A.anyWord8 *> p) loop
+            _                   -> pure []
+{-# INLINE sepByDelim1' #-}
+
+-- | Specialized version of 'sepBy1'' which is faster due to not
+-- accepting an arbitrary separator.
+sepByEndOfLine1' :: AL.Parser a
+                 -> AL.Parser [a]
+sepByEndOfLine1' p = liftM2' (:) p loop
+  where
+    loop = do
+        mb <- A.peekWord8
+        case mb of
+            Just b | b == cr ->
+                liftM2' (:) (A.anyWord8 *> A.word8 newline *> p) loop
+                   | b == newline ->
+                liftM2' (:) (A.anyWord8 *> p) loop
+            _ -> pure []
+{-# INLINE sepByEndOfLine1' #-}
+
+-- | Parse a record, not including the terminating line separator. The
+-- terminating line separate is not included as the last record in a
+-- CSV file is allowed to not have a terminating line separator. You
+-- most likely want to use the 'endOfLine' parser in combination with
+-- this parser.
+row :: Word8  -- ^ Field delimiter
+    -> AL.Parser (Vector ByteString)
+row !delim = V.fromList <$!> field delim `sepByDelim1'` delim <* endOfLine
+{-# INLINE row #-}
+
+removeBlankLines :: [Vector ByteString] -> [Vector ByteString]
+removeBlankLines = filter (not . blankLine)
+
+-- | Parse a field. The field may be in either the escaped or
+-- non-escaped format. The return value is unescaped.
+field :: Word8 -> AL.Parser ByteString
+field !delim = do
+    mb <- A.peekWord8
+    -- We purposely don't use <|> as we want to commit to the first
+    -- choice if we see a double quote.
+    case mb of
+        Just b | b == doubleQuote -> escapedField
+        _                         -> unescapedField delim
+{-# INLINE field #-}
+
+escapedField :: AL.Parser S.ByteString
+escapedField = do
+    _ <- dquote
+    -- The scan state is 'True' if the previous character was a double
+    -- quote.  We need to drop a trailing double quote left by scan.
+    s <- S.init <$> (A.scan False $ \s c -> if c == doubleQuote
+                                            then Just (not s)
+                                            else if s then Nothing
+                                                 else Just False)
+    if doubleQuote `S.elem` s
+        then case Z.parse unescape s of
+            Right r  -> return r
+            Left err -> fail err
+        else return s
+
+unescapedField :: Word8 -> AL.Parser S.ByteString
+unescapedField !delim = A.takeWhile (\ c -> c /= doubleQuote &&
+                                            c /= newline &&
+                                            c /= delim &&
+                                            c /= cr)
+
+dquote :: AL.Parser Char
+dquote = char '"'
+
+-- | This could be improved. We could avoid the builder and just
+-- write to a buffer directly.
+unescape :: Z.Parser S.ByteString
+unescape = (LByteString.toStrict . toLazyByteString) <$!> go mempty where
+  go acc = do
+    h <- Z.takeWhile (/= doubleQuote)
+    let rest = do
+          start <- Z.take 2
+          if (S.unsafeHead start == doubleQuote &&
+              S.unsafeIndex start 1 == doubleQuote)
+              then go (acc `mappend` byteString h `mappend` byteString (BC8.singleton '"'))
+              else fail "invalid CSV escape sequence"
+    done <- Z.atEnd
+    if done
+      then return (acc `mappend` byteString h)
+      else rest
+
+-- | A strict version of 'Data.Functor.<$>' for monads.
+(<$!>) :: Monad m => (a -> b) -> m a -> m b
+f <$!> m = do
+    a <- m
+    return $! f a
+{-# INLINE (<$!>) #-}
+
+infixl 4 <$!>
+
+-- | Is this an empty record (i.e. a blank line)?
+blankLine :: V.Vector B.ByteString -> Bool
+blankLine v = V.length v == 1 && (B.null (V.head v))
+
+-- | A version of 'liftM2' that is strict in the result of its first
+-- action.
+liftM2' :: (Monad m) => (a -> b -> c) -> m a -> m b -> m c
+liftM2' f a b = do
+    !x <- a
+    y <- b
+    return (f x y)
+{-# INLINE liftM2' #-}
+
+
+-- | Match either a single newline character @\'\\n\'@, or a carriage
+-- return followed by a newline character @\"\\r\\n\"@, or a single
+-- carriage return @\'\\r\'@.
+endOfLine :: A.Parser ()
+endOfLine = (A.word8 newline *> return ()) <|> (string (BC8.pack "\r\n") *> return ()) <|> (A.word8 cr *> return ())
+{-# INLINE endOfLine #-}
+
+doubleQuote, newline, cr :: Word8
+doubleQuote = 34
+newline = 10
+cr = 13
+
--- a/siphon/src/Siphon/Types.hs
+++ b/siphon/src/Siphon/Types.hs
@ -1,6 +1,7 @@
 module Siphon.Types where

 import Data.Vector (Vector)
+import qualified Data.Attoparsec.Types as Atto

 newtype Escaped c = Escaped { getEscaped :: c }

@ -12,6 +13,17 @@ data Siphon c1 c2 = Siphon
  , siphonIntercalate :: !(Vector (Escaped c2) -> c2)
  }

-- data Clarify = Clarify
+data SiphonDecoding c1 c2 = SiphonDecoding
+  { siphonDecodingParse :: Atto.Parser c1 c2 -> c1 -> Atto.IResult c1 c2
+  , siphonDecodingNull  :: c1 -> Bool
+  }
+
+data WithEnd c = WithEnd
+  { withEndEnded :: Bool
+  , withEndContent :: c
+  }
+
+-- data SiphonDecodingError
 --   { clarify 
 --   }
+