Start zipStream implementation

Only local headers and data so far; no central directory yet
This commit is contained in:
Dylan Simon 2017-05-12 17:17:35 -04:00
parent 49f413a492
commit c460089d1a
5 changed files with 175 additions and 26 deletions

View File

@ -1,19 +1,26 @@
module Codec.Archive.Zip.Conduit.Internal module Codec.Archive.Zip.Conduit.Internal
( zipError ( zipError
, idConduit
, sizeCRC , sizeCRC
, zip64Size
, deflateWindowBits
) where ) where
import Codec.Compression.Zlib.Raw (WindowBits(..))
import Control.Monad.Catch (MonadThrow, throwM) import Control.Monad.Catch (MonadThrow, throwM)
import qualified Data.ByteString as BS import qualified Data.ByteString as BS
import qualified Data.Conduit as C import qualified Data.Conduit as C
import Data.Digest.CRC32 (crc32Update) import Data.Digest.CRC32 (crc32Update)
import Data.Word (Word32) import Data.Word (Word32, Word64)
import Codec.Archive.Zip.Conduit.Types import Codec.Archive.Zip.Conduit.Types
zipError :: MonadThrow m => String -> m a zipError :: MonadThrow m => String -> m a
zipError = throwM . ZipError zipError = throwM . ZipError
idConduit :: Monad m => C.Conduit a m a
idConduit = C.awaitForever C.yield
passthroughFold :: Monad m => (a -> b -> a) -> a -> C.ConduitM b b m a passthroughFold :: Monad m => (a -> b -> a) -> a -> C.ConduitM b b m a
passthroughFold f z = C.await >>= maybe passthroughFold f z = C.await >>= maybe
(return z) (return z)
@ -21,5 +28,11 @@ passthroughFold f z = C.await >>= maybe
C.yield x C.yield x
passthroughFold f (f z x)) passthroughFold f (f z x))
sizeCRC :: (Monad m, Integral n) => C.ConduitM BS.ByteString BS.ByteString m (n, Word32) sizeCRC :: Monad m => C.ConduitM BS.ByteString BS.ByteString m (Word64, Word32)
sizeCRC = passthroughFold (\(l, c) b -> (l + fromIntegral (BS.length b), crc32Update c b)) (0, 0) sizeCRC = passthroughFold (\(l, c) b -> (l + fromIntegral (BS.length b), crc32Update c b)) (0, 0)
zip64Size :: Integral n => n
zip64Size = 0xffffffff
deflateWindowBits :: WindowBits
deflateWindowBits = WindowBits (-15)

View File

@ -5,7 +5,7 @@ module Codec.Archive.Zip.Conduit.UnZip
( ZipEntry(..) ( ZipEntry(..)
, ZipInfo(..) , ZipInfo(..)
, ZipError , ZipError
, unZip , unZipStream
) where ) where
import Control.Applicative ((<|>), empty) import Control.Applicative ((<|>), empty)
@ -20,9 +20,9 @@ import qualified Data.ByteString.Char8 as BSC
import qualified Data.Conduit as C import qualified Data.Conduit as C
import qualified Data.Conduit.List as CL import qualified Data.Conduit.List as CL
import Data.Conduit.Serialization.Binary (sinkGet) import Data.Conduit.Serialization.Binary (sinkGet)
import Data.Conduit.Zlib (WindowBits(..), decompress) import Data.Conduit.Zlib (decompress)
import Data.Time (LocalTime(..), fromGregorian, TimeOfDay(..)) import Data.Time (LocalTime(..), TimeOfDay(..), fromGregorian)
import Data.Word (Word32, Word64) import Data.Word (Word16, Word32, Word64)
import Codec.Archive.Zip.Conduit.Types import Codec.Archive.Zip.Conduit.Types
import Codec.Archive.Zip.Conduit.Internal import Codec.Archive.Zip.Conduit.Internal
@ -75,6 +75,17 @@ foldGet g z = do
e <- G.isEmpty e <- G.isEmpty
if e then return z else g z >>= foldGet g if e then return z else g z >>= foldGet g
fromDOSTime :: Word16 -> Word16 -> LocalTime
fromDOSTime time date = LocalTime
(fromGregorian
(fromIntegral $ date `shiftR` 9 + 1980)
(fromIntegral $ date `shiftR` 5 .&. 0x0f)
(fromIntegral $ date .&. 0x1f))
(TimeOfDay
(fromIntegral $ time `shiftR` 11)
(fromIntegral $ time `shiftR` 5 .&. 0x3f)
(fromIntegral $ time `shiftL` 1 .&. 0x3f))
-- |Stream a zip file, producing a sequence of entry headers and data blocks. -- |Stream a zip file, producing a sequence of entry headers and data blocks.
-- For example, this might produce: @Left (ZipEntry "directory\/" ...), Left (ZipEntry "directory\/file.txt" ...), Right "hello w", Right "orld!\\n", Left ...@ -- For example, this might produce: @Left (ZipEntry "directory\/" ...), Left (ZipEntry "directory\/file.txt" ...), Right "hello w", Right "orld!\\n", Left ...@
-- The final result is summary information taken from the end of the zip file. -- The final result is summary information taken from the end of the zip file.
@ -84,8 +95,8 @@ foldGet g z = do
-- It does not (ironically) support uncompressed zip files that have been created as streams, where file sizes are not known beforehand. -- It does not (ironically) support uncompressed zip files that have been created as streams, where file sizes are not known beforehand.
-- Since it does not use the offset information at the end of the file, it assumes all entries are packed sequentially, which is usually the case. -- Since it does not use the offset information at the end of the file, it assumes all entries are packed sequentially, which is usually the case.
-- Any errors are thrown in the underlying monad. -- Any errors are thrown in the underlying monad.
unZip :: (MonadBase b m, PrimMonad b, MonadThrow m) => C.ConduitM BS.ByteString (Either ZipEntry BS.ByteString) m ZipInfo unZipStream :: (MonadBase b m, PrimMonad b, MonadThrow m) => C.ConduitM BS.ByteString (Either ZipEntry BS.ByteString) m ZipInfo
unZip = next where unZipStream = next where
next = do next = do
h <- sinkGet header h <- sinkGet header
case h of case h of
@ -150,20 +161,10 @@ unZip = next where
comp <- G.getWord16le comp <- G.getWord16le
dcomp <- case comp of dcomp <- case comp of
0 | testBit gpf 3 -> fail "Unsupported uncompressed streaming file data" 0 | testBit gpf 3 -> fail "Unsupported uncompressed streaming file data"
| otherwise -> return $ C.awaitForever C.yield -- idConduit | otherwise -> return idConduit
8 -> return $ decompress (WindowBits (-15)) 8 -> return $ decompress deflateWindowBits
_ -> fail $ "Unsupported compression method: " ++ show comp _ -> fail $ "Unsupported compression method: " ++ show comp
time <- G.getWord16le time <- fromDOSTime <$> G.getWord16le <*> G.getWord16le
date <- G.getWord16le
let mtime = LocalTime
(fromGregorian
(fromIntegral $ date `shiftR` 9 + 1980)
(fromIntegral $ date `shiftR` 5 .&. 0x0f)
(fromIntegral $ date .&. 0x1f))
(TimeOfDay
(fromIntegral $ time `shiftR` 11)
(fromIntegral $ time `shiftR` 5 .&. 0x3f)
(fromIntegral $ time `shiftL` 1 .&. 0x3f))
crc <- G.getWord32le crc <- G.getWord32le
csiz <- G.getWord32le csiz <- G.getWord32le
usiz <- G.getWord32le usiz <- G.getWord32le
@ -176,8 +177,8 @@ unZip = next where
G.isolate z $ case t of G.isolate z $ case t of
0x0001 -> do 0x0001 -> do
-- the zip specs claim "the Local header MUST include BOTH" but "only if the corresponding field is set to 0xFFFFFFFF" -- the zip specs claim "the Local header MUST include BOTH" but "only if the corresponding field is set to 0xFFFFFFFF"
usiz' <- if usiz == maxBound then G.getWord64le else return $ extZip64USize ext usiz' <- if usiz == zip64Size then G.getWord64le else return $ extZip64USize ext
csiz' <- if csiz == maxBound then G.getWord64le else return $ extZip64CSize ext csiz' <- if csiz == zip64Size then G.getWord64le else return $ extZip64CSize ext
return ext return ext
{ extZip64 = True { extZip64 = True
, extZip64USize = usiz' , extZip64USize = usiz'
@ -207,7 +208,7 @@ unZip = next where
return FileHeader return FileHeader
{ fileEntry = ZipEntry { fileEntry = ZipEntry
{ zipEntryName = name { zipEntryName = name
, zipEntryTime = mtime , zipEntryTime = time
, zipEntrySize = if testBit gpf 3 then Nothing else Just extZip64USize , zipEntrySize = if testBit gpf 3 then Nothing else Just extZip64USize
} }
, fileDecompress = dcomp , fileDecompress = dcomp

View File

@ -0,0 +1,132 @@
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ViewPatterns #-}
module Codec.Archive.Zip.Conduit.Zip
( ZipOptions(..)
, defaultZipOptions
, ZipEntry(..)
, ZipData(..)
, zipFileData
, zipStream
) where
import qualified Codec.Compression.Zlib.Raw as Z
import Control.Arrow ((&&&), (+++), left)
import Control.Monad (when)
import Control.Monad.Base (MonadBase)
import Control.Monad.Catch (MonadThrow)
import Control.Monad.Primitive (PrimMonad)
import Control.Monad.Trans.Resource (MonadResource)
import qualified Data.Binary.Put as P
import Data.Bits (bit, shiftL, shiftR, (.|.))
import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as BSC
import qualified Data.ByteString.Lazy as BSL
import qualified Data.Conduit as C
import qualified Data.Conduit.Binary as CB
import Data.Conduit.Serialization.Binary (sourcePut)
import Data.Conduit.Zlib (compress)
import Data.Digest.CRC32 (crc32)
import Data.Either (isLeft)
import Data.Maybe (fromMaybe)
import Data.Time (LocalTime(..), TimeOfDay(..), toGregorian)
import Data.Word (Word16, Word64)
import Codec.Archive.Zip.Conduit.Types
import Codec.Archive.Zip.Conduit.Internal
data ZipOptions = ZipOptions
{ zipOpt64 :: Bool -- ^Allow zip file sizes over 4GB (reduces compatibility, but is otherwise safe for any file sizes)
, zipOptCompressLevel :: Int -- ^Compress (0 = store only, 9 = best) zipped files (improves compatibility, since some unzip programs don't supported stored, streamed files, including the one in this package)
, zipOptInfo :: ZipInfo -- ^Other parameters to store in the zip file
}
defaultZipOptions :: ZipOptions
defaultZipOptions = ZipOptions
{ zipOpt64 = False
, zipOptCompressLevel = -1
, zipOptInfo = ZipInfo
{ zipComment = BS.empty
}
}
data ZipData m
= ZipDataByteString BSL.ByteString
| ZipDataSource (C.Source m BS.ByteString)
instance Monad m => Monoid (ZipData m) where
mempty = ZipDataByteString BSL.empty
mappend (ZipDataByteString a) (ZipDataByteString b) = ZipDataByteString $ mappend a b
mappend a b = ZipDataSource $ mappend (zipDataSource a) (zipDataSource b)
zipFileData :: MonadResource m => FilePath -> ZipData m
zipFileData = ZipDataSource . CB.sourceFile
zipDataSource :: Monad m => ZipData m -> C.Source m BS.ByteString
zipDataSource (ZipDataByteString b) = CB.sourceLbs b
zipDataSource (ZipDataSource s) = s
zipData :: Monad m => ZipData m -> Either (C.Source m BS.ByteString) BSL.ByteString
zipData (ZipDataByteString b) = Right b
zipData (ZipDataSource s) = Left s
dataSize :: Either a BSL.ByteString -> Maybe Word64
dataSize (Left _) = Nothing
dataSize (Right b) = Just $ fromIntegral $ BSL.length b
toDOSTime :: LocalTime -> (Word16, Word16)
toDOSTime (LocalTime (toGregorian -> (year, month, day)) (TimeOfDay hour mins secs)) =
( fromIntegral hour `shiftL` 11 .|. fromIntegral mins `shiftL` 5 .|. truncate secs `shiftR` 1
, fromIntegral (year - 1980) `shiftL` 9 .|. fromIntegral month `shiftL` 5 .|. fromIntegral day
)
zipStream :: (MonadBase b m, PrimMonad b, MonadThrow m) => ZipOptions -> C.ConduitM (ZipEntry, ZipData m) BS.ByteString m Word64
zipStream ZipOptions{..} = do
C.awaitForever $ C.toProducer . entry
return 0 -- TODO: size
where
entry (ZipEntry{..}, zipData -> dat) = do
let usiz = dataSize dat
sdat = left (C..| sizeCRC) dat
comp = zipOptCompressLevel /= 0 && all (0 /=) usiz
(cdat, csiz)
| comp =
( ((`C.fuseBoth` (compress zipOptCompressLevel deflateWindowBits C..| (fst <$> sizeCRC)))
+++ Z.compress) sdat -- level for Z.compress?
, dataSize cdat)
| otherwise = (left (fmap (id &&& fst)) sdat, usiz)
z64 = maybe zipOpt64 (zip64Size <) (max <$> usiz <*> csiz)
namelen = BS.length zipEntryName
when (namelen > fromIntegral (maxBound :: Word16)) $ zipError $ BSC.unpack zipEntryName ++ ": entry name too long"
sourcePut $ do
P.putWord32le 0x04034b50
P.putWord16le $ if z64 then 45 else 20
P.putWord16le $ if isLeft dat then bit 3 else 0
P.putWord16le $ if comp then 8 else 0
let (time, date) = toDOSTime zipEntryTime
P.putWord16le $ time
P.putWord16le $ date
P.putWord32le $ either (const 0) crc32 cdat
P.putWord32le $ if z64 then zip64Size else maybe 0 fromIntegral csiz
P.putWord32le $ if z64 then zip64Size else maybe 0 fromIntegral usiz
P.putWord16le $ fromIntegral namelen
P.putWord16le $ if z64 then 20 else 0
P.putByteString zipEntryName
when z64 $ do
P.putWord16le 0x0001
P.putWord16le 16
P.putWord64le $ fromMaybe 0 usiz
P.putWord64le $ fromMaybe 0 csiz
either
(\cd -> do
((usz, crc), csz) <- cd -- write compressed data
when (not z64 && (usz > zip64Size || csz > zip64Size)) $ zipError $ BSC.unpack zipEntryName ++ ": file too large and zipOpt64 disabled"
sourcePut $ do
P.putWord32le 0x08074b50
P.putWord32le crc
let putsz
| z64 = P.putWord64le
| otherwise = P.putWord32le . fromIntegral
putsz csz
putsz usz)
CB.sourceLbs
cdat

View File

@ -43,5 +43,5 @@ main = do
exitFailure exitFailure
ZipInfo{..} <- C.runConduit ZipInfo{..} <- C.runConduit
$ CB.sourceHandle stdin $ CB.sourceHandle stdin
C..| C.fuseUpstream unZip extract C..| C.fuseUpstream unZipStream extract
BSC.putStrLn zipComment BSC.putStrLn zipComment

View File

@ -19,6 +19,7 @@ library
exposed-modules: exposed-modules:
Codec.Archive.Zip.Conduit.Types Codec.Archive.Zip.Conduit.Types
Codec.Archive.Zip.Conduit.UnZip Codec.Archive.Zip.Conduit.UnZip
Codec.Archive.Zip.Conduit.Zip
other-modules: other-modules:
Codec.Archive.Zip.Conduit.Internal Codec.Archive.Zip.Conduit.Internal
default-language: Haskell2010 default-language: Haskell2010
@ -33,8 +34,10 @@ library
digest, digest,
exceptions, exceptions,
primitive, primitive,
resourcet,
time, time,
transformers-base transformers-base,
zlib
executable unzip-stream executable unzip-stream
main-is: unzip.hs main-is: unzip.hs