copied some of cassavas parsing stuff. they omptimize pretty well.

This commit is contained in:
Andrew Martin 2016-06-26 13:48:53 -04:00
parent db725eba69
commit cb11a8bd9a
4 changed files with 234 additions and 1 deletions

View File

@ -20,6 +20,7 @@ library
Siphon
Siphon.Types
Siphon.Encoding
Siphon.Internal
build-depends:
base >= 4.7 && < 5
, colonnade
@ -28,6 +29,7 @@ library
, contravariant
, vector
, pipes
, attoparsec
default-language: Haskell2010
source-repository head

View File

@ -1,6 +1,7 @@
module Siphon.Decoding where
import Siphon.Types
import qualified Data.Attoparsec.Types as Atto
-- unrow :: c1 -> (Vector c2,c1)
--
@ -8,3 +9,33 @@ import Siphon.Types
-- -> Decoding (Indexed f) c a
-- -> Vector c
-- -> Either DecodingErrors a
pipe :: SiphonDecoding c1 c2
-> Atto.Parser c1 (WithEnd c2)
-> Pipe c1 (Vector c2) m String
pipe (SiphonDecoding parse isNull) p = do
c1 <- awaitSkip isNull
case parse p c1 of
awaitSkip :: (a -> Bool)
-> Consumer' a m a
awaitSkip f = go where
go = do
a <- await
if f a then go else return a
nextSkipEmpty
:: (Monad m, Eq a, Monoid a)
=> Producer a m r
-> m (Either r (a, Producer a m r))
nextSkipEmpty = go where
go p0 = do
x <- next p0
case x of
Left _ -> return x
Right (a,p1)
| a == mempty -> go p1
| otherwise -> return x
{-# INLINABLE nextSkipEmpty #-}

View File

@ -0,0 +1,188 @@
{-# LANGUAGE BangPatterns #-}
-- | A CSV parser. The parser defined here is RFC 4180 compliant, with
-- the following extensions:
--
-- * Empty lines are ignored.
--
-- * Non-escaped fields may contain any characters except
-- double-quotes, commas, carriage returns, and newlines.
--
-- * Escaped fields may contain any characters (but double-quotes
-- need to be escaped).
--
-- The functions in this module can be used to implement e.g. a
-- resumable parser that is fed input incrementally.
module Siphon.Internal where
import Siphon.Types
import Data.ByteString.Builder (toLazyByteString,byteString)
import qualified Data.ByteString.Char8 as BC8
import Control.Applicative (optional)
import Data.Attoparsec.ByteString.Char8 (char, endOfInput, string)
import qualified Data.Attoparsec.ByteString as A
import qualified Data.Attoparsec.Lazy as AL
import qualified Data.Attoparsec.Zepto as Z
import qualified Data.ByteString as S
import qualified Data.ByteString.Unsafe as S
import qualified Data.Vector as V
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as LByteString
import Data.Word (Word8)
import Data.Vector (Vector)
import Data.ByteString (ByteString)
import Control.Applicative
import Data.Monoid
-- parse :: Monad m
-- => SiphonDecoding c1 c2
-- -> Atto.Parser a b -- ^ Attoparsec parser
-- -> Pipes.Parser a m (Maybe (Either ParsingError b)) -- ^ Pipes parser
-- parse parser = S.StateT $ \p0 -> do
-- x <- nextSkipEmpty p0
-- case x of
-- Left r -> return (Nothing, return r)
-- Right (a,p1) -> step (yield a >>) (_parse parser a) p1
-- where
-- step diffP res p0 = case res of
-- Fail _ c m -> return (Just (Left (ParsingError c m)), diffP p0)
-- Done a b -> return (Just (Right b), yield a >> p0)
-- Partial k -> do
-- x <- nextSkipEmpty p0
-- case x of
-- Left e -> step diffP (k mempty) (return e)
-- Right (a,p1) -> step (diffP . (yield a >>)) (k a) p1
-- | Specialized version of 'sepBy1'' which is faster due to not
-- accepting an arbitrary separator.
sepByDelim1' :: AL.Parser a
-> Word8 -- ^ Field delimiter
-> AL.Parser [a]
sepByDelim1' p !delim = liftM2' (:) p loop
where
loop = do
mb <- A.peekWord8
case mb of
Just b | b == delim -> liftM2' (:) (A.anyWord8 *> p) loop
_ -> pure []
{-# INLINE sepByDelim1' #-}
-- | Specialized version of 'sepBy1'' which is faster due to not
-- accepting an arbitrary separator.
sepByEndOfLine1' :: AL.Parser a
-> AL.Parser [a]
sepByEndOfLine1' p = liftM2' (:) p loop
where
loop = do
mb <- A.peekWord8
case mb of
Just b | b == cr ->
liftM2' (:) (A.anyWord8 *> A.word8 newline *> p) loop
| b == newline ->
liftM2' (:) (A.anyWord8 *> p) loop
_ -> pure []
{-# INLINE sepByEndOfLine1' #-}
-- | Parse a record, not including the terminating line separator. The
-- terminating line separate is not included as the last record in a
-- CSV file is allowed to not have a terminating line separator. You
-- most likely want to use the 'endOfLine' parser in combination with
-- this parser.
row :: Word8 -- ^ Field delimiter
-> AL.Parser (Vector ByteString)
row !delim = V.fromList <$!> field delim `sepByDelim1'` delim <* endOfLine
{-# INLINE row #-}
removeBlankLines :: [Vector ByteString] -> [Vector ByteString]
removeBlankLines = filter (not . blankLine)
-- | Parse a field. The field may be in either the escaped or
-- non-escaped format. The return value is unescaped.
field :: Word8 -> AL.Parser ByteString
field !delim = do
mb <- A.peekWord8
-- We purposely don't use <|> as we want to commit to the first
-- choice if we see a double quote.
case mb of
Just b | b == doubleQuote -> escapedField
_ -> unescapedField delim
{-# INLINE field #-}
escapedField :: AL.Parser S.ByteString
escapedField = do
_ <- dquote
-- The scan state is 'True' if the previous character was a double
-- quote. We need to drop a trailing double quote left by scan.
s <- S.init <$> (A.scan False $ \s c -> if c == doubleQuote
then Just (not s)
else if s then Nothing
else Just False)
if doubleQuote `S.elem` s
then case Z.parse unescape s of
Right r -> return r
Left err -> fail err
else return s
unescapedField :: Word8 -> AL.Parser S.ByteString
unescapedField !delim = A.takeWhile (\ c -> c /= doubleQuote &&
c /= newline &&
c /= delim &&
c /= cr)
dquote :: AL.Parser Char
dquote = char '"'
-- | This could be improved. We could avoid the builder and just
-- write to a buffer directly.
unescape :: Z.Parser S.ByteString
unescape = (LByteString.toStrict . toLazyByteString) <$!> go mempty where
go acc = do
h <- Z.takeWhile (/= doubleQuote)
let rest = do
start <- Z.take 2
if (S.unsafeHead start == doubleQuote &&
S.unsafeIndex start 1 == doubleQuote)
then go (acc `mappend` byteString h `mappend` byteString (BC8.singleton '"'))
else fail "invalid CSV escape sequence"
done <- Z.atEnd
if done
then return (acc `mappend` byteString h)
else rest
-- | A strict version of 'Data.Functor.<$>' for monads.
(<$!>) :: Monad m => (a -> b) -> m a -> m b
f <$!> m = do
a <- m
return $! f a
{-# INLINE (<$!>) #-}
infixl 4 <$!>
-- | Is this an empty record (i.e. a blank line)?
blankLine :: V.Vector B.ByteString -> Bool
blankLine v = V.length v == 1 && (B.null (V.head v))
-- | A version of 'liftM2' that is strict in the result of its first
-- action.
liftM2' :: (Monad m) => (a -> b -> c) -> m a -> m b -> m c
liftM2' f a b = do
!x <- a
y <- b
return (f x y)
{-# INLINE liftM2' #-}
-- | Match either a single newline character @\'\\n\'@, or a carriage
-- return followed by a newline character @\"\\r\\n\"@, or a single
-- carriage return @\'\\r\'@.
endOfLine :: A.Parser ()
endOfLine = (A.word8 newline *> return ()) <|> (string (BC8.pack "\r\n") *> return ()) <|> (A.word8 cr *> return ())
{-# INLINE endOfLine #-}
doubleQuote, newline, cr :: Word8
doubleQuote = 34
newline = 10
cr = 13

View File

@ -1,6 +1,7 @@
module Siphon.Types where
import Data.Vector (Vector)
import qualified Data.Attoparsec.Types as Atto
newtype Escaped c = Escaped { getEscaped :: c }
@ -12,6 +13,17 @@ data Siphon c1 c2 = Siphon
, siphonIntercalate :: !(Vector (Escaped c2) -> c2)
}
-- data Clarify = Clarify
data SiphonDecoding c1 c2 = SiphonDecoding
{ siphonDecodingParse :: Atto.Parser c1 c2 -> c1 -> Atto.IResult c1 c2
, siphonDecodingNull :: c1 -> Bool
}
data WithEnd c = WithEnd
{ withEndEnded :: Bool
, withEndContent :: c
}
-- data SiphonDecodingError
-- { clarify
-- }