diff --git a/Codec/Archive/Zip/Conduit/UnZip.hs b/Codec/Archive/Zip/Conduit/UnZip.hs index 1d63b5b..5c03607 100644 --- a/Codec/Archive/Zip/Conduit/UnZip.hs +++ b/Codec/Archive/Zip/Conduit/UnZip.hs @@ -6,9 +6,10 @@ module Codec.Archive.Zip.Conduit.UnZip , unZip ) where -import Control.Monad (when, unless) +import Control.Applicative ((<|>), empty) +import Control.Monad (when, unless, guard) import qualified Data.Binary.Get as G -import Data.Bits ((.&.), complement, shiftL, shiftR) +import Data.Bits ((.&.), complement, testBit, shiftL, shiftR) import qualified Data.ByteString as BS import qualified Data.Conduit as C import qualified Data.Conduit.List as CL @@ -16,12 +17,12 @@ import Data.Conduit.Serialization.Binary (sinkGet) import Data.Conduit.Zlib (WindowBits(..), decompress) import Data.Digest.CRC32 (crc32Update) import Data.Time (UTCTime(..), fromGregorian, timeOfDayToTime, TimeOfDay(..)) -import Data.Word (Word, Word32) +import Data.Word (Word32, Word64) data ZipEntry = ZipEntry { zipEntryName :: BS.ByteString , zipEntryTime :: UTCTime - , zipEntrySize :: Word + , zipEntrySize :: !Word64 } data ZipInfo = ZipInfo @@ -33,12 +34,44 @@ data Header m { fileDecompress :: C.Conduit BS.ByteString m BS.ByteString , fileEntry :: !ZipEntry , fileCRC :: !Word32 - , fileCSize :: !Word32 + , fileCSize :: !Word64 + , fileZip64 :: !Bool } | EndOfCentralDirectory { endInfo :: ZipInfo } +data ExtField = ExtField + { extZip64 :: Bool + , extZip64USize + , extZip64CSize :: Word64 + } + +{- ExtUnix + { extUnixATime + , extUnixMTime :: UTCTime + , extUnixUID + , extUnixGID :: Word16 + , extUnixData :: BS.ByteString + } +-} + +pass :: (Monad m, Integral n) => n -> C.Conduit BS.ByteString m BS.ByteString +pass 0 = return () +pass n = C.await >>= maybe + (fail $ "EOF in file data, expecting " ++ show ni ++ " more bytes") + (\b -> + let n' = ni - toInteger (BS.length b) in + if n' < 0 + then do + let (b', r) = BS.splitAt (fromIntegral n) b + C.yield b' + C.leftover r + else do + C.yield b + pass n') + where ni = toInteger n + crc32 :: Monad m => C.Consumer BS.ByteString m Word32 crc32 = CL.fold crc32Update 0 @@ -52,9 +85,10 @@ unZip = next where case h of FileHeader{..} -> do C.yield $ Left fileEntry - C.mapOutput Right $ pass (fromIntegral fileCSize) + C.mapOutput Right $ pass fileCSize C..| (fileDecompress >> CL.sinkNull) C..| checkCRC fileCRC + sinkGet $ dataDesc h next EndOfCentralDirectory{..} -> do return endInfo @@ -62,20 +96,36 @@ unZip = next where sig <- G.getWord32le case sig of 0x04034b50 -> fileHeader - 0x08074b50 -> -- data descriptor - G.skip 12 >> header - _ -> centralDirectory sig - centralDirectory 0x02014b50 = centralHeader >> G.getWord32le >>= centralDirectory - centralDirectory 0x06054b50 = EndOfCentralDirectory <$> endDirectory - centralDirectory sig = fail $ "Unknown header signature: " ++ show sig + _ -> centralBody sig + dataDesc h = -- this takes a bit of flexibility to account for the various cases + (do -- with signature + sig <- G.getWord32le + guard (sig == 0x06054b50) + dataDescBody h) + <|> dataDescBody h -- without signature + <|> return () -- none + dataDescBody FileHeader{..} = do + crc <- G.getWord32le + let getSize = if fileZip64 then G.getWord64le else fromIntegral <$> G.getWord32le + csiz <- getSize + usiz <- getSize + guard $ crc == fileCRC && csiz == fileCSize && usiz == zipEntrySize fileEntry + dataDescBody _ = empty + central = G.getWord32le >>= centralBody + centralBody 0x02014b50 = centralHeader >> central + centralBody 0x06064b50 = zip64EndDirectory >> central + centralBody 0x07064b50 = G.skip 16 >> central + centralBody 0x06054b50 = EndOfCentralDirectory <$> endDirectory + centralBody sig = fail $ "Unknown header signature: " ++ show sig fileHeader = do ver <- G.getWord16le - when (ver > 20) $ fail $ "Unsupported version: " ++ show ver + when (ver > 45) $ fail $ "Unsupported version: " ++ show ver gpf <- G.getWord16le - when (gpf .&. complement 6 /= 0) $ fail $ "Unsupported flags: " ++ show gpf + when (gpf .&. complement 0o06 /= 0) $ fail $ "Unsupported flags: " ++ show gpf comp <- G.getWord16le dcomp <- case comp of - 0 -> return $ C.awaitForever C.yield + 0 | testBit gpf 3 -> fail "Unsupported uncompressed streaming file data" + | otherwise -> return $ C.awaitForever C.yield -- idConduit 8 -> return $ decompress (WindowBits (-15)) _ -> fail $ "Unsupported compression method: " ++ show comp time <- G.getWord16le @@ -93,40 +143,69 @@ unZip = next where crc <- G.getWord32le csiz <- G.getWord32le usiz <- G.getWord32le - nlen <- G.getWord16le - elen <- G.getWord16le - name <- G.getByteString $ fromIntegral nlen - G.skip $ fromIntegral elen + nlen <- fromIntegral <$> G.getWord16le + elen <- fromIntegral <$> G.getWord16le + name <- G.getByteString nlen + let getExt ext = do + t <- G.getWord16le + z <- fromIntegral <$> G.getWord16le + ext' <- G.isolate z $ case t of + 0x0001 -> do + -- the zip specs claim "the Local header MUST include BOTH" but "only if the corresponding field is set to 0xFFFFFFFF" + usiz' <- if usiz == maxBound then G.getWord64le else return $ extZip64USize ext + csiz' <- if csiz == maxBound then G.getWord64le else return $ extZip64CSize ext + return ext + { extZip64 = True + , extZip64USize = usiz' + , extZip64CSize = csiz' + } + {- + 0x000d -> do + atim <- G.getWord32le + mtim <- G.getWord32le + uid <- G.getWord16le + gid <- G.getWord16le + dat <- G.getByteString $ z - 12 + return ExtUnix + { extUnixATime = posixSecondsToUTCTime atim + , extUnixMTime = posixSecondsToUTCTime mtim + , extUnixUID = uid + , extUnixGID = gid + , extUnixData = dat + } + -} + _ -> ext <$ G.skip z + getExt ext' + ExtField{..} <- G.isolate elen $ getExt ExtField + { extZip64 = False + , extZip64USize = fromIntegral usiz + , extZip64CSize = fromIntegral csiz + } return FileHeader { fileEntry = ZipEntry { zipEntryName = name , zipEntryTime = mtime - , zipEntrySize = fromIntegral usiz + , zipEntrySize = extZip64USize } , fileDecompress = dcomp - , fileCSize = csiz + , fileCSize = extZip64CSize , fileCRC = crc + , fileZip64 = extZip64 } centralHeader = do -- ignore everything G.skip 24 - nlen <- G.getWord16le - elen <- G.getWord16le - clen <- G.getWord16le - G.skip $ 12 + fromIntegral nlen + fromIntegral elen + fromIntegral clen + nlen <- fromIntegral <$> G.getWord16le + elen <- fromIntegral <$> G.getWord16le + clen <- fromIntegral <$> G.getWord16le + G.skip $ 12 + nlen + elen + clen + zip64EndDirectory = do + len <- G.getWord64le + G.skip $ fromIntegral len -- would not expect to overflow... endDirectory = do G.skip 16 - clen <- G.getWord16le - comm <- G.getByteString $ fromIntegral clen + clen <- fromIntegral <$> G.getWord16le + comm <- G.getByteString clen return ZipInfo { zipComment = comm } - pass 0 = return () - pass n = C.await >>= maybe - (fail $ "EOF in file data, expecting " ++ show n ++ " more bytes") - (\b -> do - let (b', r) = BS.splitAt n b - C.yield b' - if BS.null r - then pass $ n - BS.length b' - else C.leftover r) diff --git a/zip-stream.cabal b/zip-stream.cabal index be24e22..573569d 100644 --- a/zip-stream.cabal +++ b/zip-stream.cabal @@ -22,7 +22,7 @@ library ghc-options: -Wall build-depends: base >= 4.7 && < 5, - binary, + binary >= 0.7.2, binary-conduit, bytestring, conduit,