From c460089d1ae727aef1161376af6f7d375125aaf7 Mon Sep 17 00:00:00 2001 From: Dylan Simon Date: Fri, 12 May 2017 17:17:35 -0400 Subject: [PATCH] Start zipStream implementation Only local headers and data so far; no central directory yet --- Codec/Archive/Zip/Conduit/Internal.hs | 17 +++- Codec/Archive/Zip/Conduit/UnZip.hs | 45 ++++----- Codec/Archive/Zip/Conduit/Zip.hs | 132 ++++++++++++++++++++++++++ cmd/unzip.hs | 2 +- zip-stream.cabal | 5 +- 5 files changed, 175 insertions(+), 26 deletions(-) create mode 100644 Codec/Archive/Zip/Conduit/Zip.hs diff --git a/Codec/Archive/Zip/Conduit/Internal.hs b/Codec/Archive/Zip/Conduit/Internal.hs index 97bf220..1e27be3 100644 --- a/Codec/Archive/Zip/Conduit/Internal.hs +++ b/Codec/Archive/Zip/Conduit/Internal.hs @@ -1,19 +1,26 @@ module Codec.Archive.Zip.Conduit.Internal ( zipError + , idConduit , sizeCRC + , zip64Size + , deflateWindowBits ) where +import Codec.Compression.Zlib.Raw (WindowBits(..)) import Control.Monad.Catch (MonadThrow, throwM) import qualified Data.ByteString as BS import qualified Data.Conduit as C import Data.Digest.CRC32 (crc32Update) -import Data.Word (Word32) +import Data.Word (Word32, Word64) import Codec.Archive.Zip.Conduit.Types zipError :: MonadThrow m => String -> m a zipError = throwM . ZipError +idConduit :: Monad m => C.Conduit a m a +idConduit = C.awaitForever C.yield + passthroughFold :: Monad m => (a -> b -> a) -> a -> C.ConduitM b b m a passthroughFold f z = C.await >>= maybe (return z) @@ -21,5 +28,11 @@ passthroughFold f z = C.await >>= maybe C.yield x passthroughFold f (f z x)) -sizeCRC :: (Monad m, Integral n) => C.ConduitM BS.ByteString BS.ByteString m (n, Word32) +sizeCRC :: Monad m => C.ConduitM BS.ByteString BS.ByteString m (Word64, Word32) sizeCRC = passthroughFold (\(l, c) b -> (l + fromIntegral (BS.length b), crc32Update c b)) (0, 0) + +zip64Size :: Integral n => n +zip64Size = 0xffffffff + +deflateWindowBits :: WindowBits +deflateWindowBits = WindowBits (-15) diff --git a/Codec/Archive/Zip/Conduit/UnZip.hs b/Codec/Archive/Zip/Conduit/UnZip.hs index 1c973d2..27a74ba 100644 --- a/Codec/Archive/Zip/Conduit/UnZip.hs +++ b/Codec/Archive/Zip/Conduit/UnZip.hs @@ -5,7 +5,7 @@ module Codec.Archive.Zip.Conduit.UnZip ( ZipEntry(..) , ZipInfo(..) , ZipError - , unZip + , unZipStream ) where import Control.Applicative ((<|>), empty) @@ -20,9 +20,9 @@ import qualified Data.ByteString.Char8 as BSC import qualified Data.Conduit as C import qualified Data.Conduit.List as CL import Data.Conduit.Serialization.Binary (sinkGet) -import Data.Conduit.Zlib (WindowBits(..), decompress) -import Data.Time (LocalTime(..), fromGregorian, TimeOfDay(..)) -import Data.Word (Word32, Word64) +import Data.Conduit.Zlib (decompress) +import Data.Time (LocalTime(..), TimeOfDay(..), fromGregorian) +import Data.Word (Word16, Word32, Word64) import Codec.Archive.Zip.Conduit.Types import Codec.Archive.Zip.Conduit.Internal @@ -75,6 +75,17 @@ foldGet g z = do e <- G.isEmpty if e then return z else g z >>= foldGet g +fromDOSTime :: Word16 -> Word16 -> LocalTime +fromDOSTime time date = LocalTime + (fromGregorian + (fromIntegral $ date `shiftR` 9 + 1980) + (fromIntegral $ date `shiftR` 5 .&. 0x0f) + (fromIntegral $ date .&. 0x1f)) + (TimeOfDay + (fromIntegral $ time `shiftR` 11) + (fromIntegral $ time `shiftR` 5 .&. 0x3f) + (fromIntegral $ time `shiftL` 1 .&. 0x3f)) + -- |Stream a zip file, producing a sequence of entry headers and data blocks. -- For example, this might produce: @Left (ZipEntry "directory\/" ...), Left (ZipEntry "directory\/file.txt" ...), Right "hello w", Right "orld!\\n", Left ...@ -- The final result is summary information taken from the end of the zip file. @@ -84,8 +95,8 @@ foldGet g z = do -- It does not (ironically) support uncompressed zip files that have been created as streams, where file sizes are not known beforehand. -- Since it does not use the offset information at the end of the file, it assumes all entries are packed sequentially, which is usually the case. -- Any errors are thrown in the underlying monad. -unZip :: (MonadBase b m, PrimMonad b, MonadThrow m) => C.ConduitM BS.ByteString (Either ZipEntry BS.ByteString) m ZipInfo -unZip = next where +unZipStream :: (MonadBase b m, PrimMonad b, MonadThrow m) => C.ConduitM BS.ByteString (Either ZipEntry BS.ByteString) m ZipInfo +unZipStream = next where next = do h <- sinkGet header case h of @@ -150,20 +161,10 @@ unZip = next where comp <- G.getWord16le dcomp <- case comp of 0 | testBit gpf 3 -> fail "Unsupported uncompressed streaming file data" - | otherwise -> return $ C.awaitForever C.yield -- idConduit - 8 -> return $ decompress (WindowBits (-15)) + | otherwise -> return idConduit + 8 -> return $ decompress deflateWindowBits _ -> fail $ "Unsupported compression method: " ++ show comp - time <- G.getWord16le - date <- G.getWord16le - let mtime = LocalTime - (fromGregorian - (fromIntegral $ date `shiftR` 9 + 1980) - (fromIntegral $ date `shiftR` 5 .&. 0x0f) - (fromIntegral $ date .&. 0x1f)) - (TimeOfDay - (fromIntegral $ time `shiftR` 11) - (fromIntegral $ time `shiftR` 5 .&. 0x3f) - (fromIntegral $ time `shiftL` 1 .&. 0x3f)) + time <- fromDOSTime <$> G.getWord16le <*> G.getWord16le crc <- G.getWord32le csiz <- G.getWord32le usiz <- G.getWord32le @@ -176,8 +177,8 @@ unZip = next where G.isolate z $ case t of 0x0001 -> do -- the zip specs claim "the Local header MUST include BOTH" but "only if the corresponding field is set to 0xFFFFFFFF" - usiz' <- if usiz == maxBound then G.getWord64le else return $ extZip64USize ext - csiz' <- if csiz == maxBound then G.getWord64le else return $ extZip64CSize ext + usiz' <- if usiz == zip64Size then G.getWord64le else return $ extZip64USize ext + csiz' <- if csiz == zip64Size then G.getWord64le else return $ extZip64CSize ext return ext { extZip64 = True , extZip64USize = usiz' @@ -207,7 +208,7 @@ unZip = next where return FileHeader { fileEntry = ZipEntry { zipEntryName = name - , zipEntryTime = mtime + , zipEntryTime = time , zipEntrySize = if testBit gpf 3 then Nothing else Just extZip64USize } , fileDecompress = dcomp diff --git a/Codec/Archive/Zip/Conduit/Zip.hs b/Codec/Archive/Zip/Conduit/Zip.hs new file mode 100644 index 0000000..99a6f5d --- /dev/null +++ b/Codec/Archive/Zip/Conduit/Zip.hs @@ -0,0 +1,132 @@ +{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE ViewPatterns #-} +module Codec.Archive.Zip.Conduit.Zip + ( ZipOptions(..) + , defaultZipOptions + , ZipEntry(..) + , ZipData(..) + , zipFileData + , zipStream + ) where + +import qualified Codec.Compression.Zlib.Raw as Z +import Control.Arrow ((&&&), (+++), left) +import Control.Monad (when) +import Control.Monad.Base (MonadBase) +import Control.Monad.Catch (MonadThrow) +import Control.Monad.Primitive (PrimMonad) +import Control.Monad.Trans.Resource (MonadResource) +import qualified Data.Binary.Put as P +import Data.Bits (bit, shiftL, shiftR, (.|.)) +import qualified Data.ByteString as BS +import qualified Data.ByteString.Char8 as BSC +import qualified Data.ByteString.Lazy as BSL +import qualified Data.Conduit as C +import qualified Data.Conduit.Binary as CB +import Data.Conduit.Serialization.Binary (sourcePut) +import Data.Conduit.Zlib (compress) +import Data.Digest.CRC32 (crc32) +import Data.Either (isLeft) +import Data.Maybe (fromMaybe) +import Data.Time (LocalTime(..), TimeOfDay(..), toGregorian) +import Data.Word (Word16, Word64) + +import Codec.Archive.Zip.Conduit.Types +import Codec.Archive.Zip.Conduit.Internal + +data ZipOptions = ZipOptions + { zipOpt64 :: Bool -- ^Allow zip file sizes over 4GB (reduces compatibility, but is otherwise safe for any file sizes) + , zipOptCompressLevel :: Int -- ^Compress (0 = store only, 9 = best) zipped files (improves compatibility, since some unzip programs don't supported stored, streamed files, including the one in this package) + , zipOptInfo :: ZipInfo -- ^Other parameters to store in the zip file + } + +defaultZipOptions :: ZipOptions +defaultZipOptions = ZipOptions + { zipOpt64 = False + , zipOptCompressLevel = -1 + , zipOptInfo = ZipInfo + { zipComment = BS.empty + } + } + +data ZipData m + = ZipDataByteString BSL.ByteString + | ZipDataSource (C.Source m BS.ByteString) + +instance Monad m => Monoid (ZipData m) where + mempty = ZipDataByteString BSL.empty + mappend (ZipDataByteString a) (ZipDataByteString b) = ZipDataByteString $ mappend a b + mappend a b = ZipDataSource $ mappend (zipDataSource a) (zipDataSource b) + +zipFileData :: MonadResource m => FilePath -> ZipData m +zipFileData = ZipDataSource . CB.sourceFile + +zipDataSource :: Monad m => ZipData m -> C.Source m BS.ByteString +zipDataSource (ZipDataByteString b) = CB.sourceLbs b +zipDataSource (ZipDataSource s) = s + +zipData :: Monad m => ZipData m -> Either (C.Source m BS.ByteString) BSL.ByteString +zipData (ZipDataByteString b) = Right b +zipData (ZipDataSource s) = Left s + +dataSize :: Either a BSL.ByteString -> Maybe Word64 +dataSize (Left _) = Nothing +dataSize (Right b) = Just $ fromIntegral $ BSL.length b + +toDOSTime :: LocalTime -> (Word16, Word16) +toDOSTime (LocalTime (toGregorian -> (year, month, day)) (TimeOfDay hour mins secs)) = + ( fromIntegral hour `shiftL` 11 .|. fromIntegral mins `shiftL` 5 .|. truncate secs `shiftR` 1 + , fromIntegral (year - 1980) `shiftL` 9 .|. fromIntegral month `shiftL` 5 .|. fromIntegral day + ) + +zipStream :: (MonadBase b m, PrimMonad b, MonadThrow m) => ZipOptions -> C.ConduitM (ZipEntry, ZipData m) BS.ByteString m Word64 +zipStream ZipOptions{..} = do + C.awaitForever $ C.toProducer . entry + return 0 -- TODO: size + where + entry (ZipEntry{..}, zipData -> dat) = do + let usiz = dataSize dat + sdat = left (C..| sizeCRC) dat + comp = zipOptCompressLevel /= 0 && all (0 /=) usiz + (cdat, csiz) + | comp = + ( ((`C.fuseBoth` (compress zipOptCompressLevel deflateWindowBits C..| (fst <$> sizeCRC))) + +++ Z.compress) sdat -- level for Z.compress? + , dataSize cdat) + | otherwise = (left (fmap (id &&& fst)) sdat, usiz) + z64 = maybe zipOpt64 (zip64Size <) (max <$> usiz <*> csiz) + namelen = BS.length zipEntryName + when (namelen > fromIntegral (maxBound :: Word16)) $ zipError $ BSC.unpack zipEntryName ++ ": entry name too long" + sourcePut $ do + P.putWord32le 0x04034b50 + P.putWord16le $ if z64 then 45 else 20 + P.putWord16le $ if isLeft dat then bit 3 else 0 + P.putWord16le $ if comp then 8 else 0 + let (time, date) = toDOSTime zipEntryTime + P.putWord16le $ time + P.putWord16le $ date + P.putWord32le $ either (const 0) crc32 cdat + P.putWord32le $ if z64 then zip64Size else maybe 0 fromIntegral csiz + P.putWord32le $ if z64 then zip64Size else maybe 0 fromIntegral usiz + P.putWord16le $ fromIntegral namelen + P.putWord16le $ if z64 then 20 else 0 + P.putByteString zipEntryName + when z64 $ do + P.putWord16le 0x0001 + P.putWord16le 16 + P.putWord64le $ fromMaybe 0 usiz + P.putWord64le $ fromMaybe 0 csiz + either + (\cd -> do + ((usz, crc), csz) <- cd -- write compressed data + when (not z64 && (usz > zip64Size || csz > zip64Size)) $ zipError $ BSC.unpack zipEntryName ++ ": file too large and zipOpt64 disabled" + sourcePut $ do + P.putWord32le 0x08074b50 + P.putWord32le crc + let putsz + | z64 = P.putWord64le + | otherwise = P.putWord32le . fromIntegral + putsz csz + putsz usz) + CB.sourceLbs + cdat diff --git a/cmd/unzip.hs b/cmd/unzip.hs index 045886b..1daa5fc 100644 --- a/cmd/unzip.hs +++ b/cmd/unzip.hs @@ -43,5 +43,5 @@ main = do exitFailure ZipInfo{..} <- C.runConduit $ CB.sourceHandle stdin - C..| C.fuseUpstream unZip extract + C..| C.fuseUpstream unZipStream extract BSC.putStrLn zipComment diff --git a/zip-stream.cabal b/zip-stream.cabal index ad49ef7..545ed73 100644 --- a/zip-stream.cabal +++ b/zip-stream.cabal @@ -19,6 +19,7 @@ library exposed-modules: Codec.Archive.Zip.Conduit.Types Codec.Archive.Zip.Conduit.UnZip + Codec.Archive.Zip.Conduit.Zip other-modules: Codec.Archive.Zip.Conduit.Internal default-language: Haskell2010 @@ -33,8 +34,10 @@ library digest, exceptions, primitive, + resourcet, time, - transformers-base + transformers-base, + zlib executable unzip-stream main-is: unzip.hs