Add zip64 support to unzip

Untested, but hey, the last untested stuff worked perfectly
This commit is contained in:
Dylan Simon 2017-05-11 15:20:54 -04:00
parent 081c3b786a
commit 66d0ad3720
2 changed files with 116 additions and 37 deletions

View File

@ -6,9 +6,10 @@ module Codec.Archive.Zip.Conduit.UnZip
, unZip , unZip
) where ) where
import Control.Monad (when, unless) import Control.Applicative ((<|>), empty)
import Control.Monad (when, unless, guard)
import qualified Data.Binary.Get as G import qualified Data.Binary.Get as G
import Data.Bits ((.&.), complement, shiftL, shiftR) import Data.Bits ((.&.), complement, testBit, shiftL, shiftR)
import qualified Data.ByteString as BS import qualified Data.ByteString as BS
import qualified Data.Conduit as C import qualified Data.Conduit as C
import qualified Data.Conduit.List as CL import qualified Data.Conduit.List as CL
@ -16,12 +17,12 @@ import Data.Conduit.Serialization.Binary (sinkGet)
import Data.Conduit.Zlib (WindowBits(..), decompress) import Data.Conduit.Zlib (WindowBits(..), decompress)
import Data.Digest.CRC32 (crc32Update) import Data.Digest.CRC32 (crc32Update)
import Data.Time (UTCTime(..), fromGregorian, timeOfDayToTime, TimeOfDay(..)) import Data.Time (UTCTime(..), fromGregorian, timeOfDayToTime, TimeOfDay(..))
import Data.Word (Word, Word32) import Data.Word (Word32, Word64)
data ZipEntry = ZipEntry data ZipEntry = ZipEntry
{ zipEntryName :: BS.ByteString { zipEntryName :: BS.ByteString
, zipEntryTime :: UTCTime , zipEntryTime :: UTCTime
, zipEntrySize :: Word , zipEntrySize :: !Word64
} }
data ZipInfo = ZipInfo data ZipInfo = ZipInfo
@ -33,12 +34,44 @@ data Header m
{ fileDecompress :: C.Conduit BS.ByteString m BS.ByteString { fileDecompress :: C.Conduit BS.ByteString m BS.ByteString
, fileEntry :: !ZipEntry , fileEntry :: !ZipEntry
, fileCRC :: !Word32 , fileCRC :: !Word32
, fileCSize :: !Word32 , fileCSize :: !Word64
, fileZip64 :: !Bool
} }
| EndOfCentralDirectory | EndOfCentralDirectory
{ endInfo :: ZipInfo { endInfo :: ZipInfo
} }
data ExtField = ExtField
{ extZip64 :: Bool
, extZip64USize
, extZip64CSize :: Word64
}
{- ExtUnix
{ extUnixATime
, extUnixMTime :: UTCTime
, extUnixUID
, extUnixGID :: Word16
, extUnixData :: BS.ByteString
}
-}
pass :: (Monad m, Integral n) => n -> C.Conduit BS.ByteString m BS.ByteString
pass 0 = return ()
pass n = C.await >>= maybe
(fail $ "EOF in file data, expecting " ++ show ni ++ " more bytes")
(\b ->
let n' = ni - toInteger (BS.length b) in
if n' < 0
then do
let (b', r) = BS.splitAt (fromIntegral n) b
C.yield b'
C.leftover r
else do
C.yield b
pass n')
where ni = toInteger n
crc32 :: Monad m => C.Consumer BS.ByteString m Word32 crc32 :: Monad m => C.Consumer BS.ByteString m Word32
crc32 = CL.fold crc32Update 0 crc32 = CL.fold crc32Update 0
@ -52,9 +85,10 @@ unZip = next where
case h of case h of
FileHeader{..} -> do FileHeader{..} -> do
C.yield $ Left fileEntry C.yield $ Left fileEntry
C.mapOutput Right $ pass (fromIntegral fileCSize) C.mapOutput Right $ pass fileCSize
C..| (fileDecompress >> CL.sinkNull) C..| (fileDecompress >> CL.sinkNull)
C..| checkCRC fileCRC C..| checkCRC fileCRC
sinkGet $ dataDesc h
next next
EndOfCentralDirectory{..} -> do EndOfCentralDirectory{..} -> do
return endInfo return endInfo
@ -62,20 +96,36 @@ unZip = next where
sig <- G.getWord32le sig <- G.getWord32le
case sig of case sig of
0x04034b50 -> fileHeader 0x04034b50 -> fileHeader
0x08074b50 -> -- data descriptor _ -> centralBody sig
G.skip 12 >> header dataDesc h = -- this takes a bit of flexibility to account for the various cases
_ -> centralDirectory sig (do -- with signature
centralDirectory 0x02014b50 = centralHeader >> G.getWord32le >>= centralDirectory sig <- G.getWord32le
centralDirectory 0x06054b50 = EndOfCentralDirectory <$> endDirectory guard (sig == 0x06054b50)
centralDirectory sig = fail $ "Unknown header signature: " ++ show sig dataDescBody h)
<|> dataDescBody h -- without signature
<|> return () -- none
dataDescBody FileHeader{..} = do
crc <- G.getWord32le
let getSize = if fileZip64 then G.getWord64le else fromIntegral <$> G.getWord32le
csiz <- getSize
usiz <- getSize
guard $ crc == fileCRC && csiz == fileCSize && usiz == zipEntrySize fileEntry
dataDescBody _ = empty
central = G.getWord32le >>= centralBody
centralBody 0x02014b50 = centralHeader >> central
centralBody 0x06064b50 = zip64EndDirectory >> central
centralBody 0x07064b50 = G.skip 16 >> central
centralBody 0x06054b50 = EndOfCentralDirectory <$> endDirectory
centralBody sig = fail $ "Unknown header signature: " ++ show sig
fileHeader = do fileHeader = do
ver <- G.getWord16le ver <- G.getWord16le
when (ver > 20) $ fail $ "Unsupported version: " ++ show ver when (ver > 45) $ fail $ "Unsupported version: " ++ show ver
gpf <- G.getWord16le gpf <- G.getWord16le
when (gpf .&. complement 6 /= 0) $ fail $ "Unsupported flags: " ++ show gpf when (gpf .&. complement 0o06 /= 0) $ fail $ "Unsupported flags: " ++ show gpf
comp <- G.getWord16le comp <- G.getWord16le
dcomp <- case comp of dcomp <- case comp of
0 -> return $ C.awaitForever C.yield 0 | testBit gpf 3 -> fail "Unsupported uncompressed streaming file data"
| otherwise -> return $ C.awaitForever C.yield -- idConduit
8 -> return $ decompress (WindowBits (-15)) 8 -> return $ decompress (WindowBits (-15))
_ -> fail $ "Unsupported compression method: " ++ show comp _ -> fail $ "Unsupported compression method: " ++ show comp
time <- G.getWord16le time <- G.getWord16le
@ -93,40 +143,69 @@ unZip = next where
crc <- G.getWord32le crc <- G.getWord32le
csiz <- G.getWord32le csiz <- G.getWord32le
usiz <- G.getWord32le usiz <- G.getWord32le
nlen <- G.getWord16le nlen <- fromIntegral <$> G.getWord16le
elen <- G.getWord16le elen <- fromIntegral <$> G.getWord16le
name <- G.getByteString $ fromIntegral nlen name <- G.getByteString nlen
G.skip $ fromIntegral elen let getExt ext = do
t <- G.getWord16le
z <- fromIntegral <$> G.getWord16le
ext' <- G.isolate z $ case t of
0x0001 -> do
-- the zip specs claim "the Local header MUST include BOTH" but "only if the corresponding field is set to 0xFFFFFFFF"
usiz' <- if usiz == maxBound then G.getWord64le else return $ extZip64USize ext
csiz' <- if csiz == maxBound then G.getWord64le else return $ extZip64CSize ext
return ext
{ extZip64 = True
, extZip64USize = usiz'
, extZip64CSize = csiz'
}
{-
0x000d -> do
atim <- G.getWord32le
mtim <- G.getWord32le
uid <- G.getWord16le
gid <- G.getWord16le
dat <- G.getByteString $ z - 12
return ExtUnix
{ extUnixATime = posixSecondsToUTCTime atim
, extUnixMTime = posixSecondsToUTCTime mtim
, extUnixUID = uid
, extUnixGID = gid
, extUnixData = dat
}
-}
_ -> ext <$ G.skip z
getExt ext'
ExtField{..} <- G.isolate elen $ getExt ExtField
{ extZip64 = False
, extZip64USize = fromIntegral usiz
, extZip64CSize = fromIntegral csiz
}
return FileHeader return FileHeader
{ fileEntry = ZipEntry { fileEntry = ZipEntry
{ zipEntryName = name { zipEntryName = name
, zipEntryTime = mtime , zipEntryTime = mtime
, zipEntrySize = fromIntegral usiz , zipEntrySize = extZip64USize
} }
, fileDecompress = dcomp , fileDecompress = dcomp
, fileCSize = csiz , fileCSize = extZip64CSize
, fileCRC = crc , fileCRC = crc
, fileZip64 = extZip64
} }
centralHeader = do centralHeader = do
-- ignore everything -- ignore everything
G.skip 24 G.skip 24
nlen <- G.getWord16le nlen <- fromIntegral <$> G.getWord16le
elen <- G.getWord16le elen <- fromIntegral <$> G.getWord16le
clen <- G.getWord16le clen <- fromIntegral <$> G.getWord16le
G.skip $ 12 + fromIntegral nlen + fromIntegral elen + fromIntegral clen G.skip $ 12 + nlen + elen + clen
zip64EndDirectory = do
len <- G.getWord64le
G.skip $ fromIntegral len -- would not expect to overflow...
endDirectory = do endDirectory = do
G.skip 16 G.skip 16
clen <- G.getWord16le clen <- fromIntegral <$> G.getWord16le
comm <- G.getByteString $ fromIntegral clen comm <- G.getByteString clen
return ZipInfo return ZipInfo
{ zipComment = comm { zipComment = comm
} }
pass 0 = return ()
pass n = C.await >>= maybe
(fail $ "EOF in file data, expecting " ++ show n ++ " more bytes")
(\b -> do
let (b', r) = BS.splitAt n b
C.yield b'
if BS.null r
then pass $ n - BS.length b'
else C.leftover r)

View File

@ -22,7 +22,7 @@ library
ghc-options: -Wall ghc-options: -Wall
build-depends: build-depends:
base >= 4.7 && < 5, base >= 4.7 && < 5,
binary, binary >= 0.7.2,
binary-conduit, binary-conduit,
bytestring, bytestring,
conduit, conduit,