zip-stream/Codec/Archive/Zip/Conduit/UnZip.hs
Dylan Simon e006ecd336 Streaming unzip conduit, initial version
Untested, still many limitations on zip format
2017-05-09 20:47:49 -04:00

133 lines
4.1 KiB
Haskell

{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE RankNTypes #-}
module Codec.Archive.Zip.Conduit.UnZip
( ZipEntry(..)
, ZipInfo(..)
, unZip
) where
import Control.Monad (when, unless)
import qualified Data.Binary.Get as G
import Data.Bits ((.&.), complement, shiftL, shiftR)
import qualified Data.ByteString as BS
import qualified Data.Conduit as C
import qualified Data.Conduit.List as CL
import Data.Conduit.Serialization.Binary (sinkGet)
import Data.Conduit.Zlib (WindowBits(..), decompress)
import Data.Digest.CRC32 (crc32Update)
import Data.Time (UTCTime(..), fromGregorian, timeOfDayToTime, TimeOfDay(..))
import Data.Word (Word, Word32)
data ZipEntry = ZipEntry
{ zipEntryName :: BS.ByteString
, zipEntryTime :: UTCTime
, zipEntrySize :: Word
}
data ZipInfo = ZipInfo
{ zipComment :: BS.ByteString
}
data Header m
= FileHeader
{ fileDecompress :: C.Conduit BS.ByteString m BS.ByteString
, fileEntry :: !ZipEntry
, fileCRC :: !Word32
, fileCSize :: !Word32
}
| EndOfCentralDirectory
{ endInfo :: ZipInfo
}
crc32 :: Monad m => C.Consumer BS.ByteString m Word32
crc32 = CL.fold crc32Update 0
checkCRC :: Monad m => Word32 -> C.Conduit BS.ByteString m BS.ByteString
checkCRC t = C.passthroughSink crc32 $ \r -> unless (r == t) $ fail "CRC32 mismatch"
unZip :: C.ConduitM BS.ByteString (Either ZipEntry BS.ByteString) IO ZipInfo
unZip = next where
next = do
h <- sinkGet header
case h of
FileHeader{..} -> do
C.yield $ Left fileEntry
C.mapOutput Right $ pass (fromIntegral fileCSize)
C..| fileDecompress
C..| checkCRC fileCRC
next
EndOfCentralDirectory{..} -> do
return endInfo
header = do
sig <- G.getWord32le
case sig of
0x04034b50 -> fileHeader
0x08074b50 -> -- data descriptor
G.skip 12 >> header
_ -> centralDirectory sig
centralDirectory 0x02014b50 = centralHeader >> G.getWord32le >>= centralDirectory
centralDirectory 0x06054b50 = EndOfCentralDirectory <$> endDirectory
centralDirectory sig = fail $ "Unknown header signature: " ++ show sig
fileHeader = do
ver <- G.getWord16le
when (ver > 20) $ fail $ "Unsupported version: " ++ show ver
gpf <- G.getWord16le
when (gpf .&. complement 6 /= 0) $ fail $ "Unsupported flags: " ++ show gpf
comp <- G.getWord16le
dcomp <- case comp of
0 -> return $ C.awaitForever C.yield
8 -> return $ decompress (WindowBits (-15))
_ -> fail $ "Unsupported compression method: " ++ show comp
time <- G.getWord16le
date <- G.getWord16le
let mtime = UTCTime (fromGregorian
(fromIntegral $ date `shiftR` 9 + 1980)
(fromIntegral $ date `shiftR` 5 .&. 0x0f)
(fromIntegral $ date .&. 0x1f)
)
(timeOfDayToTime $ TimeOfDay
(fromIntegral $ time `shiftR` 11)
(fromIntegral $ time `shiftR` 5 .&. 0x3f)
(fromIntegral $ time `shiftL` 1 .&. 0x3f)
)
crc <- G.getWord32le
csiz <- G.getWord32le
usiz <- G.getWord32le
nlen <- G.getWord16le
elen <- G.getWord16le
name <- G.getByteString $ fromIntegral nlen
G.skip $ fromIntegral elen
return FileHeader
{ fileEntry = ZipEntry
{ zipEntryName = name
, zipEntryTime = mtime
, zipEntrySize = fromIntegral usiz
}
, fileDecompress = dcomp
, fileCSize = csiz
, fileCRC = crc
}
centralHeader = do
-- ignore everything
G.skip 24
nlen <- G.getWord16le
elen <- G.getWord16le
clen <- G.getWord16le
G.skip $ 12 + fromIntegral nlen + fromIntegral elen + fromIntegral clen
endDirectory = do
G.skip 16
clen <- G.getWord16le
comm <- G.getByteString $ fromIntegral clen
return ZipInfo
{ zipComment = comm
}
pass 0 = return ()
pass n = C.await >>= maybe
(fail $ "EOF in file data, expecting " ++ show n ++ " more bytes")
(\b -> do
let (b', r) = BS.splitAt n b
C.yield b'
if BS.null r
then pass $ n - BS.length b'
else C.leftover r)