Minor reorganization in prep for zipping

This commit is contained in:
Dylan Simon 2017-05-11 21:38:16 -04:00
parent 2c87f62e2b
commit 49f413a492
5 changed files with 73 additions and 49 deletions

View File

@ -0,0 +1,25 @@
module Codec.Archive.Zip.Conduit.Internal
( zipError
, sizeCRC
) where
import Control.Monad.Catch (MonadThrow, throwM)
import qualified Data.ByteString as BS
import qualified Data.Conduit as C
import Data.Digest.CRC32 (crc32Update)
import Data.Word (Word32)
import Codec.Archive.Zip.Conduit.Types
zipError :: MonadThrow m => String -> m a
zipError = throwM . ZipError
passthroughFold :: Monad m => (a -> b -> a) -> a -> C.ConduitM b b m a
passthroughFold f z = C.await >>= maybe
(return z)
(\x -> do
C.yield x
passthroughFold f (f z x))
sizeCRC :: (Monad m, Integral n) => C.ConduitM BS.ByteString BS.ByteString m (n, Word32)
sizeCRC = passthroughFold (\(l, c) b -> (l + fromIntegral (BS.length b), crc32Update c b)) (0, 0)

View File

@ -0,0 +1,31 @@
module Codec.Archive.Zip.Conduit.Types where
import Control.Exception (Exception(..))
import Data.ByteString (ByteString)
import Data.String (IsString(..))
import Data.Time.LocalTime (LocalTime)
import Data.Typeable (Typeable)
import Data.Word (Word64)
-- |Errors thrown during zip file processing
newtype ZipError = ZipError String
deriving (Show, Typeable)
instance IsString ZipError where
fromString = ZipError
instance Exception ZipError where
displayException (ZipError e) = "ZipError: " ++ e
-- |(The beginning of) a single entry in a zip stream, which may be any file or directory.
-- As per zip file conventions, directory names should end with a slash and have no data, but this library does not ensure that.
data ZipEntry = ZipEntry
{ zipEntryName :: ByteString -- ^File name, usually utf-8 encoded, with a trailing slash for directories
, zipEntryTime :: LocalTime -- ^Modification time
, zipEntrySize :: Maybe Word64 -- ^Size of file data (if known)
}
-- |Summary information at the end of a zip stream.
data ZipInfo = ZipInfo
{ zipComment :: ByteString
}

View File

@ -9,10 +9,9 @@ module Codec.Archive.Zip.Conduit.UnZip
) where ) where
import Control.Applicative ((<|>), empty) import Control.Applicative ((<|>), empty)
import Control.Exception (Exception(..))
import Control.Monad (when, unless, guard) import Control.Monad (when, unless, guard)
import Control.Monad.Base (MonadBase) import Control.Monad.Base (MonadBase)
import Control.Monad.Catch (MonadThrow, throwM) import Control.Monad.Catch (MonadThrow)
import Control.Monad.Primitive (PrimMonad) import Control.Monad.Primitive (PrimMonad)
import qualified Data.Binary.Get as G import qualified Data.Binary.Get as G
import Data.Bits ((.&.), complement, testBit, shiftL, shiftR) import Data.Bits ((.&.), complement, testBit, shiftL, shiftR)
@ -22,30 +21,11 @@ import qualified Data.Conduit as C
import qualified Data.Conduit.List as CL import qualified Data.Conduit.List as CL
import Data.Conduit.Serialization.Binary (sinkGet) import Data.Conduit.Serialization.Binary (sinkGet)
import Data.Conduit.Zlib (WindowBits(..), decompress) import Data.Conduit.Zlib (WindowBits(..), decompress)
import Data.Digest.CRC32 (crc32Update)
import Data.Time (LocalTime(..), fromGregorian, TimeOfDay(..)) import Data.Time (LocalTime(..), fromGregorian, TimeOfDay(..))
import Data.Typeable (Typeable)
import Data.Word (Word32, Word64) import Data.Word (Word32, Word64)
-- |(The beginning of) a single entry in a zip stream, which may be any file or directory. import Codec.Archive.Zip.Conduit.Types
-- As per zip file conventions, directory names should end with a slash and have no data, but this library does not ensure that. import Codec.Archive.Zip.Conduit.Internal
data ZipEntry = ZipEntry
{ zipEntryName :: BS.ByteString -- ^File name, usually utf-8 encoded, with a trailing slash for directories
, zipEntryTime :: LocalTime -- ^Modification time
, zipEntrySize :: !Word64 -- ^Size of file data
}
-- |Summary information at the end of a zip stream.
data ZipInfo = ZipInfo
{ zipComment :: BS.ByteString
}
-- |Errors thrown during zip file processing
newtype ZipError = ZipError String
deriving (Show, Typeable)
instance Exception ZipError where
displayException (ZipError e) = "ZipError: " ++ e
data Header m data Header m
= FileHeader = FileHeader
@ -54,7 +34,6 @@ data Header m
, fileCRC :: !Word32 , fileCRC :: !Word32
, fileCSize :: !Word64 , fileCSize :: !Word64
, fileZip64 :: !Bool , fileZip64 :: !Bool
, fileStream :: !Bool
} }
| EndOfCentralDirectory | EndOfCentralDirectory
{ endInfo :: ZipInfo { endInfo :: ZipInfo
@ -75,9 +54,6 @@ data ExtField = ExtField
} }
-} -}
zipError :: MonadThrow m => String -> m a
zipError = throwM . ZipError
pass :: (MonadThrow m, Integral n) => n -> C.Conduit BS.ByteString m BS.ByteString pass :: (MonadThrow m, Integral n) => n -> C.Conduit BS.ByteString m BS.ByteString
pass 0 = return () pass 0 = return ()
pass n = C.await >>= maybe pass n = C.await >>= maybe
@ -94,16 +70,6 @@ pass n = C.await >>= maybe
pass n') pass n')
where ni = toInteger n where ni = toInteger n
passthroughFold :: Monad m => (a -> b -> a) -> a -> C.ConduitM b b m a
passthroughFold f z = C.await >>= maybe
(return z)
(\x -> do
C.yield x
passthroughFold f (f z x))
sizeCRC :: (Monad m, Integral n) => C.ConduitM BS.ByteString BS.ByteString m (n, Word32)
sizeCRC = passthroughFold (\(l, c) b -> (l + fromIntegral (BS.length b), crc32Update c b)) (0, 0)
foldGet :: (a -> G.Get a) -> a -> G.Get a foldGet :: (a -> G.Get a) -> a -> G.Get a
foldGet g z = do foldGet g z = do
e <- G.isEmpty e <- G.isEmpty
@ -126,8 +92,8 @@ unZip = next where
FileHeader{..} -> do FileHeader{..} -> do
C.yield $ Left fileEntry C.yield $ Left fileEntry
r <- C.mapOutput Right $ r <- C.mapOutput Right $
if fileStream case zipEntrySize fileEntry of
then do -- unknown size Nothing -> do -- unknown size
((csize, _), (size, crc)) <- C.fuseBoth sizeCRC ((csize, _), (size, crc)) <- C.fuseBoth sizeCRC
$ fileDecompress $ fileDecompress
C..| sizeCRC C..| sizeCRC
@ -136,17 +102,17 @@ unZip = next where
{ fileCSize = csize { fileCSize = csize
, fileCRC = crc , fileCRC = crc
, fileEntry = fileEntry , fileEntry = fileEntry
{ zipEntrySize = size { zipEntrySize = Just size
} }
} }
else do -- known size Just usize -> do -- known size
(size, crc) <- pass fileCSize (size, crc) <- pass fileCSize
C..| (fileDecompress >> CL.sinkNull) C..| (fileDecompress >> CL.sinkNull)
C..| sizeCRC C..| sizeCRC
-- traceM $ "size=" ++ show size ++ "," ++ show (zipEntrySize fileEntry) ++ " crc=" ++ show crc ++ "," ++ show fileCRC -- traceM $ "size=" ++ show size ++ "," ++ show (zipEntrySize fileEntry) ++ " crc=" ++ show crc ++ "," ++ show fileCRC
-- optional data description (possibly ambiguous!) -- optional data description (possibly ambiguous!)
sinkGet $ (guard =<< dataDesc h) <|> return () sinkGet $ (guard =<< dataDesc h) <|> return ()
return (size == zipEntrySize fileEntry && crc == fileCRC) return (size == usize && crc == fileCRC)
unless r $ zipError $ BSC.unpack (zipEntryName fileEntry) ++ ": data integrity check failed" unless r $ zipError $ BSC.unpack (zipEntryName fileEntry) ++ ": data integrity check failed"
next next
EndOfCentralDirectory{..} -> do EndOfCentralDirectory{..} -> do
@ -168,7 +134,7 @@ unZip = next where
csiz <- getSize csiz <- getSize
usiz <- getSize usiz <- getSize
-- traceM $ "crc=" ++ show crc ++ "," ++ show fileCRC ++ " csiz=" ++ show csiz ++ "," ++ show fileCSize ++ " usiz=" ++ show usiz ++ "," ++ show (zipEntrySize fileEntry) -- traceM $ "crc=" ++ show crc ++ "," ++ show fileCRC ++ " csiz=" ++ show csiz ++ "," ++ show fileCSize ++ " usiz=" ++ show usiz ++ "," ++ show (zipEntrySize fileEntry)
return $ crc == fileCRC && csiz == fileCSize && usiz == zipEntrySize fileEntry return $ crc == fileCRC && csiz == fileCSize && (usiz ==) `all` zipEntrySize fileEntry
dataDescBody _ = empty dataDescBody _ = empty
central = G.getWord32le >>= centralBody central = G.getWord32le >>= centralBody
centralBody 0x02014b50 = centralHeader >> central centralBody 0x02014b50 = centralHeader >> central
@ -242,13 +208,12 @@ unZip = next where
{ fileEntry = ZipEntry { fileEntry = ZipEntry
{ zipEntryName = name { zipEntryName = name
, zipEntryTime = mtime , zipEntryTime = mtime
, zipEntrySize = extZip64USize , zipEntrySize = if testBit gpf 3 then Nothing else Just extZip64USize
} }
, fileDecompress = dcomp , fileDecompress = dcomp
, fileCSize = extZip64CSize , fileCSize = extZip64CSize
, fileCRC = crc , fileCRC = crc
, fileZip64 = extZip64 , fileZip64 = extZip64
, fileStream = testBit gpf 3
} }
centralHeader = do centralHeader = do
-- ignore everything -- ignore everything

View File

@ -20,10 +20,10 @@ extract = C.awaitForever start where
liftIO $ BSC.putStrLn zipEntryName liftIO $ BSC.putStrLn zipEntryName
liftIO $ createDirectoryIfMissing True (takeDirectory name) liftIO $ createDirectoryIfMissing True (takeDirectory name)
if BSC.last zipEntryName == '/' if BSC.last zipEntryName == '/'
then when (zipEntrySize /= 0) $ fail $ name ++ ": non-empty directory" then when ((0 /=) `any` zipEntrySize) $ fail $ name ++ ": non-empty directory"
else do -- C.bracketP else do -- C.bracketP
h <- liftIO $ openFile name WriteMode h <- liftIO $ openFile name WriteMode
liftIO $ hSetFileSize h $ toInteger zipEntrySize mapM_ (liftIO . hSetFileSize h . toInteger) zipEntrySize
write C..| CB.sinkHandle h write C..| CB.sinkHandle h
liftIO $ hClose h liftIO $ hClose h
liftIO $ setModificationTime name $ localTimeToUTC utc zipEntryTime -- FIXME: timezone liftIO $ setModificationTime name $ localTimeToUTC utc zipEntryTime -- FIXME: timezone

View File

@ -1,7 +1,7 @@
name: zip-stream name: zip-stream
version: 0 version: 0
synopsis: ZIP file stream processing using conduits synopsis: ZIP archive streaming using conduits
description: Process (extract and create) zip files as streams, accessing individual files without having to write a zip file to disk, unlike zip-conduit. Unfortunately, processing zip files in this way introduces some limitations on what ZIP features can be supported, but the goal is to support most cases. description: Process (extract and create) zip files as streams (e.g., over the network), accessing individual files without having to write a zip file to disk (unlike zip-conduit).
license: BSD3 license: BSD3
license-file: LICENSE license-file: LICENSE
author: Dylan Simon author: Dylan Simon
@ -17,7 +17,10 @@ source-repository head
library library
exposed-modules: exposed-modules:
Codec.Archive.Zip.Conduit.Types
Codec.Archive.Zip.Conduit.UnZip Codec.Archive.Zip.Conduit.UnZip
other-modules:
Codec.Archive.Zip.Conduit.Internal
default-language: Haskell2010 default-language: Haskell2010
ghc-options: -Wall ghc-options: -Wall
build-depends: build-depends: