Rewrite to support more sources and changing the encoding dynamically

Now it's possible to change the character encoding while de-/encoding.
Also, it's possible to use any data structure as a source or target of the de-/encoding process.

darcs-hash:20090221203100-a4fee-6da31f2e37c30a3f5cd5f10af71984209488bb0b
This commit is contained in:
Henning Guenther 2009-02-21 12:31:00 -08:00
parent 8b16078e5e
commit eeee054f1e
44 changed files with 1414 additions and 1769 deletions

28
Control/Throws.hs Normal file
View File

@ -0,0 +1,28 @@
{-# LANGUAGE FlexibleInstances,MultiParamTypeClasses #-}
module Control.Throws where
import Control.Exception.Extensible
import Control.Monad.Identity
import Control.Monad.Reader
import Control.Monad.State
class Throws e m where
throwException :: e -> m a
instance Exception e => Throws e Identity where
throwException = throw
{-instance MonadError e m => Throws e m where
throwException = throwError-}
instance Throws e (Either e) where
throwException = Left
instance Exception e => Throws e IO where
throwException = throw
instance Throws e m => Throws e (StateT s m) where
throwException x = StateT (\s -> throwException x)
instance Throws e m => Throws e (ReaderT s m) where
throwException x = ReaderT (\s -> throwException x)

View File

@ -1,24 +1,45 @@
{-# LANGUAGE ExistentialQuantification,CPP #-}
{-# LANGUAGE FlexibleContexts,ExistentialQuantification #-}
module Data.Encoding
(Encoding(..)
,EncodingException(..)
,DecodingException(..)
,recode
,recodeLazy
,DynEncoding()
#ifndef USE_HPC
,encodingFromString
,encodingFromStringMaybe
#endif
)
where
(module Data.Encoding.Exception
,module Data.Encoding.ByteSource
,module Data.Encoding.ByteSink
,Encoding(..)
,DynEncoding
,recode
,encodeString
,encodeStringExplicit
,decodeString
,decodeStringExplicit
,encodeLazyByteString
,encodeLazyByteStringExplicit
,decodeLazyByteString
,decodeLazyByteStringExplicit
,encodeStrictByteString
,encodeStrictByteStringExplicit
,decodeStrictByteString
,decodeStrictByteStringExplicit
,encodingFromString
)
where
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as Lazy (ByteString)
import Data.Typeable
import Data.Encoding.Base
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Encoding.Exception
import Data.Sequence
import Data.Foldable(toList)
import Data.Char
import Control.Monad.State
import Control.Monad.Identity
import Control.Monad.Error.Class
import Data.Binary.Put
import Data.Binary.Get
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
#ifndef USE_HPC
import Data.Encoding.ASCII
import Data.Encoding.UTF8
import Data.Encoding.UTF16
@ -52,36 +73,61 @@ import Data.Encoding.KOI8U
import Data.Encoding.GB18030
import Data.Char
import Text.Regex
#endif
-- | An untyped encoding. Used in 'System.IO.Encoding.getSystemEncoding'.
data DynEncoding = forall t. (Encoding t,Show t,Typeable t,Eq t)
=> DynEncoding t
data DynEncoding = forall enc. Encoding enc => DynEncoding enc
instance Encoding DynEncoding where
encode (DynEncoding enc) = encode enc
encodeLazy (DynEncoding enc) = encodeLazy enc
encodable (DynEncoding enc) = encodable enc
decode (DynEncoding enc) = decode enc
decodeLazy (DynEncoding enc) = decodeLazy enc
decodable (DynEncoding enc) = decodable enc
decodeChar (DynEncoding e) = decodeChar e
encodeChar (DynEncoding e) = encodeChar e
decode (DynEncoding e) = decode e
encode (DynEncoding e) = encode e
instance Show DynEncoding where
show (DynEncoding enc) = "DynEncoding "++show enc
recode :: (Encoding enc1,Encoding enc2,ByteSource m,ByteSink m) => enc1 -> enc2 -> m ()
recode e1 e2 = untilM_ sourceEmpty (decodeChar e1 >>= encodeChar e2)
instance Eq DynEncoding where
(DynEncoding enc1) == (DynEncoding enc2) = case cast enc2 of
Nothing -> False
Just renc2 -> enc1 == renc2
encodeString :: Encoding enc => enc -> String -> String
encodeString e str = toList $ viewl $ execState (encode e str) empty
-- | This decodes a string from one encoding and encodes it into another.
recode :: (Encoding from,Encoding to) => from -> to -> ByteString -> ByteString
recode enc_f enc_t bs = encode enc_t (decode enc_f bs)
encodeStringExplicit :: Encoding enc => enc -> String -> Either EncodingException String
encodeStringExplicit e str = execStateT (encode e str) empty >>= return.toList.viewl
recodeLazy :: (Encoding from,Encoding to) => from -> to -> Lazy.ByteString -> Lazy.ByteString
recodeLazy enc_f enc_t bs = encodeLazy enc_t (decodeLazy enc_f bs)
decodeString :: Encoding enc => enc -> String -> String
decodeString e str = evalState (decode e) str
decodeStringExplicit :: Encoding enc => enc -> String -> Either DecodingException String
decodeStringExplicit e str = evalStateT (decode e) str
encodeLazyByteString :: Encoding enc => enc -> String -> LBS.ByteString
encodeLazyByteString e str = runPut $ encode e str
encodeLazyByteStringExplicit :: Encoding enc => enc -> String -> Either EncodingException LBS.ByteString
encodeLazyByteStringExplicit e str = let PutME g = encode e str
in case g of
Left err -> Left err
Right (p,()) -> Right $ runPut p
decodeLazyByteString :: Encoding enc => enc -> LBS.ByteString -> String
decodeLazyByteString e str = runGet (decode e) str
decodeLazyByteStringExplicit :: Encoding enc => enc -> LBS.ByteString -> Either DecodingException String
decodeLazyByteStringExplicit e str = evalStateT (decode e) str
encodeStrictByteString :: Encoding enc => enc -> String -> BS.ByteString
encodeStrictByteString e str = snd $ createStrict $ encode e str
encodeStrictByteStringExplicit :: Encoding enc => enc -> String -> Either EncodingException BS.ByteString
encodeStrictByteStringExplicit e str = let StrictSinkE g = encode e str
(r,bstr) = createStrict g
in case r of
Left err -> Left err
Right _ -> Right bstr
decodeStrictByteString :: Encoding enc => enc -> BS.ByteString -> String
decodeStrictByteString e str = evalState (decode e) str
decodeStrictByteStringExplicit :: Encoding enc => enc -> BS.ByteString -> Either DecodingException String
decodeStrictByteStringExplicit e str = evalStateT (decode e) str
#ifndef USE_HPC
-- | Like 'encodingFromString' but returns 'Nothing' instead of throwing an error
encodingFromStringMaybe :: String -> Maybe DynEncoding
encodingFromStringMaybe codeName = case (normalizeEncoding codeName) of
@ -266,12 +312,9 @@ encodingFromStringMaybe codeName = case (normalizeEncoding codeName) of
normalizeEncoding s = map toLower $ subRegex sep s "_"
sep = mkRegex "[^0-9A-Za-z]+"
-- | Takes the name of an encoding and creates a dynamic encoding from it.
encodingFromString :: String -> DynEncoding
encodingFromString str = maybe
(error $ "Data.Encoding.encodingFromString: Unknown encoding: "++show str)
id
(encodingFromStringMaybe str)
#endif
(encodingFromStringMaybe str)

View File

@ -1,31 +1,17 @@
{-# LANGUAGE DeriveDataTypeable #-}
-- | ASCII (American Standard Code for Information Interchange) is the
-- \"normal\" computer encoding using the byte values 0-127 to represent
-- characters. Refer to <http://en.wikipedia.org/wiki/ASCII> for
-- more information.
module Data.Encoding.ASCII
(ASCII(..)) where
module Data.Encoding.ASCII where
import Control.Exception (throwDyn)
import Data.ByteString (pack)
import qualified Data.ByteString.Lazy as Lazy (pack)
import Data.ByteString.Char8 (unpack)
import Data.Char (ord)
import qualified Data.ByteString.Lazy as Lazy
import Data.Char
import Data.Encoding.Base
import Data.Word
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Typeable
data ASCII = ASCII deriving (Show,Eq,Typeable)
charToASCII :: Char -> Word8
charToASCII ch = if ch < '\128'
then fromIntegral $ ord ch
else throwDyn (HasNoRepresentation ch)
instance Encoding ASCII where
encode _ str = pack (map charToASCII str)
encodeLazy _ str = Lazy.pack (map charToASCII str)
encodable _ ch = ch < '\128'
decode _ = unpack
decodable _ = const True
decodeChar _ = do
w <- fetchWord8
return $ chr $ fromIntegral w
encodeChar _ c = do
pushWord8 $ fromIntegral $ ord c

View File

@ -1,157 +1,45 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
module Data.Encoding.Base where
module Data.Encoding.Base
(Encoding(..)
,EncodeState(..)
,encodeMultibyte
,encodeMultibyteLazy
,decodeMultibyte
,decodeMultibyteLazy
,encodeSinglebyte
,encodeSinglebyteLazy
,decodeSinglebyte
,EncodingException(..)
,DecodingException(..)
,decodingArray
,encodingMap)
where
import Data.Encoding.Exception
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Array(array)
import Data.ByteString (ByteString,unfoldrN,unfoldr,length,index,unpack)
import qualified Data.ByteString.Lazy as LBS
import Data.Encoding.Helper.Template
#if __GLASGOW_HASKELL__>=608
import Data.ByteString.Unsafe(unsafeIndex)
#else
import Data.ByteString.Base(unsafeIndex)
#endif
import Data.Map (Map,fromList,lookup)
import Data.Char(chr)
import Data.Maybe(mapMaybe)
import Data.Typeable
import Control.Throws
import Data.Array as Array
import Data.Map as Map hiding ((!))
import Data.Word
import Prelude hiding (lookup,length)
import qualified Prelude
import Control.Exception
import Data.Dynamic(toDyn)
import Data.Char
import Language.Haskell.TH
{- | Represents an encoding, supporting various methods of de- and encoding.
Minimal complete definition: encode, decode
-}
class Encoding enc where
-- | Encode a 'String' into a strict 'ByteString'. Throws the
-- 'HasNoRepresentation'-Exception if it encounters an unrepresentable
-- character.
encode :: enc -> String -> ByteString
-- | Encode a 'String' into a lazy 'Data.ByteString.Lazy.ByteString'.
encodeLazy :: enc -> String -> LBS.ByteString
encodeLazy e str = LBS.fromChunks [encode e str]
-- | Whether or not the given 'Char' is representable in this encoding. Default: 'True'.
encodable :: enc -> Char -> Bool
encodable _ _ = True
-- | Decode a strict 'ByteString' into a 'String'. If the string is not
-- decodable, a 'DecodingException' is thrown.
decode :: enc -> ByteString -> String
decodeLazy :: enc -> LBS.ByteString -> String
decodeLazy e str = concatMap (decode e) (LBS.toChunks str)
-- | Whether or no a given 'ByteString' is decodable. Default: 'True'.
decodable :: enc -> ByteString -> Bool
decodable _ _ = True
decodeChar :: ByteSource m => enc -> m Char
encodeChar :: ByteSink m => enc -> Char -> m ()
decode :: ByteSource m => enc -> m String
decode e = untilM sourceEmpty (decodeChar e)
encode :: ByteSink m => enc -> String -> m ()
encode e = mapM_ (encodeChar e)
encodeMultibyte :: (Char -> (Word8,EncodeState)) -> String -> ByteString
encodeMultibyte f str = unfoldr (\st -> case st of
(Done,[]) -> Nothing
(Done,x:xs) -> let (w,st) = f x in Just (w,(st,xs))
(Put1 w1,xs) -> Just (w1,(Done,xs))
(Put2 w1 w2,xs) -> Just (w1,(Put1 w2,xs))
(Put3 w1 w2 w3,xs) -> Just (w1,(Put2 w2 w3,xs))) (Done,str)
untilM :: Monad m => m Bool -> m a -> m [a]
untilM check act = do
end <- check
if end
then return []
else (do
x <- act
xs <- untilM check act
return (x:xs)
)
encodeMultibyteLazy :: (Char -> (Word8,EncodeState)) -> String -> LBS.ByteString
encodeMultibyteLazy f str = LBS.unfoldr (\ ~(st,rest) -> case st of
Done -> case rest of
[] -> Nothing
x:xs -> let ~(w,st) = f x in Just (w,(st,xs))
Put1 w1 -> Just (w1,(Done,rest))
Put2 w1 w2 -> Just (w1,(Put1 w2,rest))
Put3 w1 w2 w3 -> Just (w1,(Put2 w2 w3,rest))) (Done,str)
untilM_ :: Monad m => m Bool -> m a -> m ()
untilM_ check act = untilM check act >> return ()
decodeMultibyte :: ([Word8] -> (Char,[Word8])) -> ByteString -> String
decodeMultibyte f str = decode (unpack str)
where
decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst)
decodeMultibyteLazy :: ([Word8] -> (Char,[Word8])) -> LBS.ByteString -> String
decodeMultibyteLazy f str = decode (LBS.unpack str)
where
decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst)
encodeSinglebyte :: (Char -> Word8) -> String -> ByteString
encodeSinglebyte f str = fst $ unfoldrN (Prelude.length str) (\st -> case st of
[] -> Nothing
(x:xs) -> Just (f x,xs)) str
encodeSinglebyteLazy :: (Char -> Word8) -> String -> LBS.ByteString
encodeSinglebyteLazy f str = LBS.unfoldr (\st -> case st of
[] -> Nothing
(x:xs) -> Just (f x,xs)) str
decodeSinglebyte :: (Word8 -> Char) -> ByteString -> String
decodeSinglebyte f str = map f (unpack str)
data EncodeState
= Done
| Put1 !Word8
| Put2 !Word8 !Word8
| Put3 !Word8 !Word8 !Word8
deriving Show
-- | This exception type is thrown whenever something went wrong during the
-- encoding-process.
data EncodingException
= HasNoRepresentation Char -- ^ Thrown if a specific character
-- is not representable in an encoding.
deriving (Eq,Show,Typeable)
-- | This exception type is thrown whenever something went wrong during the
-- decoding-process.
data DecodingException
= IllegalCharacter Word8 -- ^ The sequence contained an illegal
-- byte that couldn't be decoded.
| UnexpectedEnd -- ^ more bytes were needed to allow a
-- successfull decoding.
| OutOfRange -- ^ the decoded value was out of the unicode range
| IllegalRepresentation [Word8] -- ^ The character sequence encodes a
-- character, but is illegal.
deriving (Eq,Show,Typeable)
decodingArray :: FilePath -> Q Exp
decodingArray file = do
trans <- runIO (readTranslation file)
createCharArray trans 0 255
encodingMap :: FilePath -> Q Exp
#ifndef __HADDOCK__
encodingMap file = do
trans <- runIO (readTranslation file)
return $ AppE
(VarE 'fromList)
(ListE [ TupE [LitE $ CharL to,LitE $ IntegerL from]
| (from,to) <- trans])
#endif
readTranslation :: FilePath -> IO [(Integer,Char)]
readTranslation file = do
cont <- readFile file
return $ mapMaybe (\ln -> case ln of
[] -> Nothing
('#':xs) -> Nothing
_ -> case words ln of
(src:"#UNDEFINED":_) -> Just (read src,'\xFFFD') -- XXX: Find a better way to handle this
(src:trg:_) -> Just (read src,chr $ read trg)
_ -> Nothing
) (lines cont)
encodeWithMap :: ByteSink m => Map Char Word8 -> Char -> m ()
encodeWithMap mp c = case Map.lookup c mp of
Nothing -> throwException $ HasNoRepresentation c
Just v -> pushWord8 v
decodeWithArray :: ByteSource m => Array Word8 (Maybe Char) -> m Char
decodeWithArray arr = do
w <- fetchWord8
case arr!w of
Nothing -> throwException $ IllegalCharacter w
Just c -> return c

View File

@ -6,11 +6,15 @@ module Data.Encoding.BootString
,punycode) where
import Data.Encoding.Base
import Data.ByteString.Char8 (pack,unpack)
import Data.List (unfoldr,partition)
import Data.Encoding.Exception
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Control.Throws
import Data.Word
import Data.List (unfoldr,partition,find)
import Data.Char (ord,chr)
import Data.Typeable
import Control.Exception (throwDyn)
import Control.Monad (when)
data BootString = BootString
{base :: Int
@ -34,27 +38,27 @@ punycode = BootString
,init_n = 0x80
}
punyValue :: Char -> Int
punyValue :: ByteSource m => Word8 -> m Int
punyValue c
| n < 0x30 = norep
| n <= 0x39 = n-0x30+26
| n <= 0x39 = return $ n-0x30+26
| n < 0x41 = norep
| n <= 0x5A = n-0x41
| n <= 0x5A = return $ n-0x41
| n < 0x61 = norep
| n <= 0x7A = n-0x61
| n <= 0x7A = return $ n-0x61
| otherwise = norep
where
n = ord c
norep = throwDyn (HasNoRepresentation c)
n = fromIntegral c
norep = throwException (IllegalCharacter c)
punyChar :: Int -> Char
punyChar :: ByteSink m => Int -> m Word8
punyChar c
| c < 0 = norep
| c < 26 = chr $ 0x61+c
| c < 36 = chr $ 0x30+c-26
| c < 26 = return $ fromIntegral $ 0x61+c
| c < 36 = return $ fromIntegral $ 0x30+c-26
| otherwise = norep
where
norep = throwDyn OutOfRange
norep = throwException (HasNoRepresentation (chr c))
getT :: BootString -> Int -> Int -> Int
getT bs k bias
@ -73,40 +77,46 @@ adapt bs delta numpoints firsttime = let
$ iterate (\(d,k) -> (d `div` (base bs - tmin bs),k+(base bs))) (delta2,0)
in rk + (((base bs - tmin bs +1) * rd) `div` (rd + skew bs))
decodeValue :: BootString -> Int -> Int -> Int -> Int -> [Int] -> (Int,[Int])
decodeValue :: ByteSource m => BootString -> Int -> Int -> Int -> Int -> [Int] -> m (Int,[Int])
decodeValue bs bias i k w (x:xs)
| x >= base bs = throwDyn OutOfRange
| x > (maxBound - i) `div` w = throwDyn OutOfRange
| x < t = (ni,xs)
| w > maxBound `div` (base bs - t) = throwDyn OutOfRange
| x >= base bs = throwException OutOfRange
| x > (maxBound - i) `div` w = throwException OutOfRange
| x < t = return (ni,xs)
| w > maxBound `div` (base bs - t) = throwException OutOfRange
| null xs = throwException OutOfRange
| otherwise = decodeValue bs bias ni (k+base bs) (w*(base bs - t)) xs
where
ni = i + x*w
t = getT bs k bias
decodeValues :: BootString -> Int -> [Int] -> [(Char,Int)]
decodeValues :: ByteSource m => BootString -> Int -> [Int] -> m [(Char,Int)]
decodeValues bs len xs = decodeValues' bs (init_n bs) 0 (init_bias bs) len xs
decodeValues' :: BootString -> Int -> Int -> Int -> Int -> [Int] -> [(Char,Int)]
decodeValues' bs n i bias len [] = []
decodeValues' bs n i bias len xs
| dn > maxBound - n = throwDyn OutOfRange
| otherwise = (chr $ nn,nni):decodeValues' bs nn (nni+1)
(adapt bs (ni-i) (len+1) (i==0)) (len+1) rst
where
(ni,rst) = decodeValue bs bias i (base bs) 1 xs
(dn,nni) = ni `divMod` (len+1)
nn = n + dn
insertDeltas :: [(Char,Int)] -> String -> String
decodeValues' :: ByteSource m => BootString -> Int -> Int -> Int -> Int -> [Int] -> m [(Char,Int)]
decodeValues' bs n i bias len [] = return []
decodeValues' bs n i bias len xs = do
(ni,rst) <- decodeValue bs bias i (base bs) 1 xs
let (dn,nni) = ni `divMod` (len+1)
let nn = n+dn
if dn > maxBound - n
then throwException OutOfRange
else (do
rest <- decodeValues' bs nn (nni+1) (adapt bs (ni-i) (len+1) (i==0)) (len+1) rst
return $ (chr $ nn,nni):rest
)
insertDeltas :: [(a,Int)] -> [a] -> [a]
insertDeltas [] str = str
insertDeltas ((c,p):xs) str = let
(l,r) = splitAt p str
in insertDeltas xs (l++[c]++r)
punyDecode :: String -> String -> String
punyDecode base ext = insertDeltas (decodeValues punycode (length base) (map punyValue ext)) base
punyDecode :: ByteSource m => [Word8] -> [Word8] -> m String
punyDecode base ext = do
pvals <- mapM punyValue ext
vals <- decodeValues punycode (length base) pvals
return $ insertDeltas vals (map (chr.fromIntegral) base)
encodeValue :: BootString -> Int -> Int -> Int -> Int -> [Int]
encodeValue bs bias delta n c = unfoldr (\(q,k,out) -> let
t = getT bs k bias
@ -136,15 +146,39 @@ encodeValues bs b l h bias delta n cps
m = minimum (filter (>=n) cps)
(ndelta,nh,nbias,outp) = encodeValues' bs b h bias (delta + (m - n)*(h + 1)) m cps
breakLast :: (a -> Bool) -> [a] -> Maybe ([a],[a])
breakLast p xs = do
(bf,af,ind) <- breakLast' 0 Nothing p xs
return (bf,af)
where
breakLast' n r p [] = do
v <- r
return ([],[],v)
breakLast' n r p (x:xs) = let res = if p x
then breakLast' (n+1) (Just n) p xs
else breakLast' (n+1) r p xs
in do
(bf,af,v) <- res
return $ if n<v then (x:bf,af,v) else (bf,x:af,v)
instance Encoding BootString where
encode bs str = let
(base,nbase) = partition (\c -> ord c < init_n bs) str
b = length base
res = map punyChar $
encodeValues bs b (length str) b (init_bias bs) 0 (init_n bs) (map ord str)
in pack $ if null base
then res
else base++"-"++res
decode bs str = case break (=='-') (unpack str) of
(base,'-':nbase) -> punyDecode base nbase
(nbase,"") -> punyDecode "" nbase
encodeChar _ c = error "Data.Encoding.BootString.encodeChar: Please use 'encode' for encoding BootStrings"
decodeChar _ = error "Data.Encoding.BootString.decodeChar: Please use 'decode' for decoding BootStrings"
encode bs str = let (base,nbase) = partition (\c -> ord c < init_n bs) str
b = length base
in do
res <- mapM punyChar $ encodeValues bs b (length str) b (init_bias bs) 0 (init_n bs) (map ord str)
when (not $ null base) $ do
mapM_ (pushWord8.fromIntegral.ord) base
pushWord8 (fromIntegral $ ord '-')
mapM_ pushWord8 res
decode bs = do
wrds <- untilM sourceEmpty fetchWord8
let m = fromIntegral $ ord '-'
case breakLast (==m) wrds of
Just ([],_) -> throwException (IllegalCharacter m)
Just (base,_:nbase) -> case find (\w -> fromIntegral w > init_n bs) base of
Nothing -> punyDecode base nbase
Just ww -> throwException (IllegalCharacter ww)
Nothing -> punyDecode [] wrds

195
Data/Encoding/ByteSink.hs Normal file
View File

@ -0,0 +1,195 @@
{-# LANGUAGE FlexibleInstances,FlexibleContexts,MultiParamTypeClasses #-}
module Data.Encoding.ByteSink where
import Data.Encoding.Exception
import Data.Binary.Put
import Data.Bits
import Data.Char
import Data.Sequence
import Data.Word
import Data.Foldable (toList)
import Control.Throws
import Control.Exception.Extensible
import Control.Monad.State
import Control.Monad.Identity
import Control.Monad.Reader
import Foreign.Ptr (Ptr,plusPtr,minusPtr)
import Foreign.Marshal.Alloc (mallocBytes,reallocBytes,free)
import Foreign.Storable (poke)
import System.IO
import System.IO.Unsafe (unsafePerformIO)
import qualified Data.ByteString as BS
import Data.ByteString.Unsafe (unsafePackCStringFinalizer)
class (Monad m,Throws EncodingException m) => ByteSink m where
pushWord8 :: Word8 -> m ()
pushWord16be :: Word16 -> m ()
pushWord16be w = do
pushWord8 (fromIntegral $ w `shiftR` 8)
pushWord8 (fromIntegral $ w)
pushWord16le :: Word16 -> m ()
pushWord16le w = do
pushWord8 (fromIntegral $ w)
pushWord8 (fromIntegral $ w `shiftR` 8)
pushWord32be :: Word32 -> m ()
pushWord32be w = do
pushWord8 (fromIntegral $ w `shiftR` 24)
pushWord8 (fromIntegral $ w `shiftR` 16)
pushWord8 (fromIntegral $ w `shiftR` 8)
pushWord8 (fromIntegral $ w)
pushWord32le :: Word32 -> m ()
pushWord32le w = do
pushWord8 (fromIntegral $ w)
pushWord8 (fromIntegral $ w `shiftR` 8)
pushWord8 (fromIntegral $ w `shiftR` 16)
pushWord8 (fromIntegral $ w `shiftR` 24)
pushWord64be :: Word64 -> m ()
pushWord64be w = do
pushWord8 (fromIntegral $ w `shiftR` 56)
pushWord8 (fromIntegral $ w `shiftR` 48)
pushWord8 (fromIntegral $ w `shiftR` 40)
pushWord8 (fromIntegral $ w `shiftR` 32)
pushWord8 (fromIntegral $ w `shiftR` 24)
pushWord8 (fromIntegral $ w `shiftR` 16)
pushWord8 (fromIntegral $ w `shiftR` 8)
pushWord8 (fromIntegral $ w)
pushWord64le :: Word64 -> m ()
pushWord64le w = do
pushWord8 (fromIntegral $ w)
pushWord8 (fromIntegral $ w `shiftR` 8)
pushWord8 (fromIntegral $ w `shiftR` 16)
pushWord8 (fromIntegral $ w `shiftR` 24)
pushWord8 (fromIntegral $ w `shiftR` 32)
pushWord8 (fromIntegral $ w `shiftR` 40)
pushWord8 (fromIntegral $ w `shiftR` 48)
pushWord8 (fromIntegral $ w `shiftR` 56)
instance Throws EncodingException PutM where
throwException = throw
instance ByteSink PutM where
pushWord8 = putWord8
pushWord16be = putWord16be
pushWord16le = putWord16le
pushWord32be = putWord32be
pushWord32le = putWord32le
pushWord64be = putWord64be
pushWord64le = putWord64le
newtype PutME a = PutME (Either EncodingException (PutM (),a))
instance Monad PutME where
return x = PutME $ Right (return (),x)
(PutME x) >>= g = PutME $ do
(m,r) <- x
let (PutME ng) = g r
case ng of
Left err -> Left err
Right (m',nr) -> Right (m>>m',nr)
instance Throws EncodingException PutME where
throwException = PutME . Left
instance ByteSink PutME where
pushWord8 w = PutME $ Right (putWord8 w,())
pushWord16be w = PutME $ Right (putWord16be w,())
pushWord16le w = PutME $ Right (putWord16le w,())
pushWord32be w = PutME $ Right (putWord32be w,())
pushWord32le w = PutME $ Right (putWord32le w,())
pushWord64be w = PutME $ Right (putWord64be w,())
pushWord64le w = PutME $ Right (putWord64le w,())
instance Monad (Either EncodingException) where
return x = Right x
Left err >>= g = Left err
Right x >>= g = g x
instance Throws EncodingException (State (Seq Char)) where
throwException = throw
instance ByteSink (State (Seq Char)) where
pushWord8 x = modify (|> (chr $ fromIntegral x))
instance ByteSink (StateT (Seq Char) (Either EncodingException)) where
pushWord8 x = modify (|> (chr $ fromIntegral x))
newtype StrictSink a = StrictS (Ptr Word8 -> Int -> Int -> IO (a,Ptr Word8,Int,Int))
instance Monad StrictSink where
return x = StrictS $ \cstr pos max -> return (x,cstr,pos,max)
(StrictS f) >>= g = StrictS (\cstr pos max -> do
(res,ncstr,npos,nmax) <- f cstr pos max
let StrictS g' = g res
g' ncstr npos nmax
)
instance Throws EncodingException StrictSink where
throwException = throw
instance ByteSink StrictSink where
pushWord8 x = StrictS (\cstr pos max -> do
(ncstr,nmax) <- if pos < max
then return (cstr,max)
else (do
let nmax = max + 32
nptr <- reallocBytes cstr nmax
return (nptr,nmax)
)
poke (ncstr `plusPtr` pos) x
return ((),ncstr,pos+1,nmax)
)
newtype StrictSinkE a = StrictSinkE (StrictSink (Either EncodingException a))
instance Monad StrictSinkE where
return = StrictSinkE . return . Right
(StrictSinkE s) >>= g = StrictSinkE $ do
res <- s
case res of
Left err -> return $ Left err
Right res' -> let StrictSinkE g' = g res'
in g'
instance Throws EncodingException StrictSinkE where
throwException = StrictSinkE . return . Left
instance ByteSink StrictSinkE where
pushWord8 x = StrictSinkE $ pushWord8 x >>= return . Right
createStrictWithLen :: StrictSink a -> Int -> (a,BS.ByteString)
createStrictWithLen (StrictS f) max = unsafePerformIO $ do
ptr <- mallocBytes max
(r,nptr,len,_) <- f ptr 0 max
str <- unsafePackCStringFinalizer nptr len (free nptr)
return (r,str)
createStrict :: StrictSink a -> (a,BS.ByteString)
createStrict sink = createStrictWithLen sink 32
newtype StrictSinkExplicit a = StrictSinkExplicit (StrictSink (Either EncodingException a))
instance Monad StrictSinkExplicit where
return = (StrictSinkExplicit).return.Right
(StrictSinkExplicit sink) >>= f
= StrictSinkExplicit (do
res <- sink
case res of
Left err -> return $ Left err
Right x -> let StrictSinkExplicit sink2 = f x
in sink2)
instance Throws EncodingException StrictSinkExplicit where
throwException = StrictSinkExplicit . return . Left
instance ByteSink StrictSinkExplicit where
pushWord8 x = StrictSinkExplicit $ do
pushWord8 x
return $ Right ()
instance ByteSink (ReaderT Handle IO) where
pushWord8 x = do
h <- ask
liftIO $ do
hPutChar h (chr $ fromIntegral x)

161
Data/Encoding/ByteSource.hs Normal file
View File

@ -0,0 +1,161 @@
{-# LANGUAGE FlexibleInstances,FlexibleContexts,MultiParamTypeClasses #-}
module Data.Encoding.ByteSource where
import Data.Encoding.Exception
import Data.Bits
import Data.Binary.Get
import Data.Char
import Data.Word
import Control.Monad.State
import Control.Monad.Identity
import Control.Monad.Reader
import Control.Exception.Extensible
import Control.Throws
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import System.IO
class (Monad m,Throws DecodingException m) => ByteSource m where
sourceEmpty :: m Bool
fetchWord8 :: m Word8
fetchWord16be :: m Word16
fetchWord16be = do
w1 <- fetchWord8
w2 <- fetchWord8
return $ ((fromIntegral w1) `shiftL` 8)
.|. (fromIntegral w2)
fetchWord16le :: m Word16
fetchWord16le = do
w1 <- fetchWord8
w2 <- fetchWord8
return $ ((fromIntegral w2) `shiftL` 8)
.|. (fromIntegral w1)
fetchWord32be :: m Word32
fetchWord32be = do
w1 <- fetchWord8
w2 <- fetchWord8
w3 <- fetchWord8
w4 <- fetchWord8
return $ ((fromIntegral w1) `shiftL` 24)
.|. ((fromIntegral w2) `shiftL` 16)
.|. ((fromIntegral w3) `shiftL` 8)
.|. (fromIntegral w4)
fetchWord32le :: m Word32
fetchWord32le = do
w1 <- fetchWord8
w2 <- fetchWord8
w3 <- fetchWord8
w4 <- fetchWord8
return $ ((fromIntegral w4) `shiftL` 24)
.|. ((fromIntegral w3) `shiftL` 16)
.|. ((fromIntegral w2) `shiftL` 8)
.|. (fromIntegral w1)
fetchWord64be :: m Word64
fetchWord64be = do
w1 <- fetchWord8
w2 <- fetchWord8
w3 <- fetchWord8
w4 <- fetchWord8
w5 <- fetchWord8
w6 <- fetchWord8
w7 <- fetchWord8
w8 <- fetchWord8
return $ ((fromIntegral w1) `shiftL` 56)
.|. ((fromIntegral w2) `shiftL` 48)
.|. ((fromIntegral w3) `shiftL` 40)
.|. ((fromIntegral w4) `shiftL` 32)
.|. ((fromIntegral w5) `shiftL` 24)
.|. ((fromIntegral w6) `shiftL` 16)
.|. ((fromIntegral w7) `shiftL` 8)
.|. (fromIntegral w8)
fetchWord64le :: m Word64
fetchWord64le = do
w1 <- fetchWord8
w2 <- fetchWord8
w3 <- fetchWord8
w4 <- fetchWord8
w5 <- fetchWord8
w6 <- fetchWord8
w7 <- fetchWord8
w8 <- fetchWord8
return $ ((fromIntegral w8) `shiftL` 56)
.|. ((fromIntegral w7) `shiftL` 48)
.|. ((fromIntegral w6) `shiftL` 40)
.|. ((fromIntegral w5) `shiftL` 32)
.|. ((fromIntegral w4) `shiftL` 24)
.|. ((fromIntegral w3) `shiftL` 16)
.|. ((fromIntegral w2) `shiftL` 8)
.|. (fromIntegral w1)
instance Throws DecodingException Get where
throwException = throw
instance ByteSource Get where
sourceEmpty = isEmpty
fetchWord8 = getWord8
fetchWord16be = getWord16be
fetchWord16le = getWord16le
fetchWord32be = getWord32be
fetchWord32le = getWord32le
fetchWord64be = getWord64be
fetchWord64le = getWord64le
instance Throws DecodingException (State [Char]) where
throwException = throw
instance ByteSource (State [Char]) where
sourceEmpty = gets null
fetchWord8 = do
chs <- get
case chs of
[] -> throw UnexpectedEnd
c:cs -> do
put cs
return (fromIntegral $ ord c)
instance Monad (Either DecodingException) where
return = Right
(Left err) >>= g = Left err
(Right x) >>= g = g x
instance ByteSource (StateT [Char] (Either DecodingException)) where
sourceEmpty = gets null
fetchWord8 = do
chs <- get
case chs of
[] -> throwException UnexpectedEnd --handleDecodingError UnexpectedEnd
c:cs -> do
put cs
return (fromIntegral $ ord c)
instance Throws DecodingException (State BS.ByteString) where
throwException = throw
instance ByteSource (State BS.ByteString) where
sourceEmpty = gets BS.null
fetchWord8 = State (\str -> case BS.uncons str of
Nothing -> throw UnexpectedEnd
Just (c,cs) -> (c,cs))
instance ByteSource (StateT BS.ByteString (Either DecodingException)) where
sourceEmpty = gets BS.null
fetchWord8 = StateT (\str -> case BS.uncons str of
Nothing -> Left UnexpectedEnd
Just ns -> Right ns)
instance ByteSource (StateT LBS.ByteString (Either DecodingException)) where
sourceEmpty = gets LBS.null
fetchWord8 = StateT (\str -> case LBS.uncons str of
Nothing -> Left UnexpectedEnd
Just ns -> Right ns)
instance ByteSource (ReaderT Handle IO) where
sourceEmpty = do
h <- ask
liftIO (hIsEOF h)
fetchWord8 = do
h <- ask
liftIO $ do
ch <- hGetChar h
return (fromIntegral $ ord ch)

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
{- | This module implements Windows Codepage number 1250 which encodes languages that use latin script.
See <http://en.wikipedia.org/wiki/CP1250> for more information.
-}
module Data.Encoding.CP1250
(CP1250(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.CP1250 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data CP1250 = CP1250 deriving (Eq,Show,Typeable)
instance Encoding CP1250 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1250.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1250.TXT")
#endif
$( makeISOInstance "CP1250" "CP1250.TXT" )

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
{- | This module implements Windows Codepage number 1251 which encodes languages that use the cyrillic alphabet.
See <http://en.wikipedia.org/wiki/CP1251> for more information.
-}
module Data.Encoding.CP1251
(CP1251(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.CP1251 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data CP1251 = CP1251 deriving (Eq,Show,Typeable)
instance Encoding CP1251 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1251.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1251.TXT")
#endif
$( makeISOInstance "CP1251" "CP1251.TXT" )

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
{- | This module implements Windows Codepage number 1252 which is a superset of ISO 8859-1.
See <http://en.wikipedia.org/wiki/CP1252> for more information.
-}
module Data.Encoding.CP1252
(CP1252(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.CP1252 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data CP1252 = CP1252 deriving (Eq,Show,Typeable)
instance Encoding CP1252 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1252.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1252.TXT")
#endif
$( makeISOInstance "CP1252" "CP1252.TXT" )

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
{- | This module implements Windows Codepage number 1253 which encodes modern greek.
See <http://en.wikipedia.org/wiki/CP1253> for more information.
-}
module Data.Encoding.CP1253
(CP1253(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.CP1253 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data CP1253 = CP1253 deriving (Eq,Show,Typeable)
instance Encoding CP1253 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1253.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1253.TXT")
#endif
$( makeISOInstance "CP1253" "CP1253.TXT" )

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
{- | This module implements Windows Codepage number 1254 which encodes the turkish language.
See <http://en.wikipedia.org/wiki/CP1254> for more information.
-}
module Data.Encoding.CP1254
(CP1254(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.CP1254 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data CP1254 = CP1254 deriving (Eq,Show,Typeable)
instance Encoding CP1254 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1254.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1254.TXT")
#endif
$( makeISOInstance "CP1254" "CP1254.TXT" )

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
{- | This module implements Windows Codepage number 1255 which encodes the hebrew language.
See <http://en.wikipedia.org/wiki/CP1255> for more information.
-}
module Data.Encoding.CP1255
(CP1255(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.CP1255 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data CP1255 = CP1255 deriving (Eq,Show,Typeable)
instance Encoding CP1255 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1255.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1255.TXT")
#endif
$( makeISOInstance "CP1255" "CP1255.TXT" )

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
{- | This module implements Windows Codepage number 1256 which encodes languages which use the arabic script.
See <http://en.wikipedia.org/wiki/CP1256> for more information.
-}
module Data.Encoding.CP1256
(CP1256(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.CP1256 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data CP1256 = CP1256 deriving (Eq,Show,Typeable)
instance Encoding CP1256 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1256.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1256.TXT")
#endif
$( makeISOInstance "CP1256" "CP1256.TXT" )

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
{- | This module implements Windows Codepage number 1257 which encodes the estonian, latvian and lithuanian language.
See <http://en.wikipedia.org/wiki/CP1257> for more information.
-}
module Data.Encoding.CP1257
(CP1257(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.CP1257 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data CP1257 = CP1257 deriving (Eq,Show,Typeable)
instance Encoding CP1257 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1257.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1257.TXT")
#endif
$( makeISOInstance "CP1257" "CP1257.TXT" )

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
{- | This module implements Windows Codepage number 1258 which encodes the vietnamese language.
See <http://en.wikipedia.org/wiki/CP1258> for more information.
-}
module Data.Encoding.CP1258
(CP1258(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.CP1258 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data CP1258 = CP1258 deriving (Eq,Show,Typeable)
instance Encoding CP1258 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1258.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1258.TXT")
#endif
$( makeISOInstance "CP1258" "CP1258.TXT" )

View File

@ -0,0 +1,30 @@
{-# LANGUAGE DeriveDataTypeable #-}
module Data.Encoding.Exception where
import Control.Exception.Extensible
import Data.Word
import Data.Typeable
import Control.Monad.Identity
-- | This exception type is thrown whenever something went wrong during the
-- encoding-process.
data EncodingException
= HasNoRepresentation Char -- ^ Thrown if a specific character
-- is not representable in an encoding.
deriving (Eq,Show,Typeable)
instance Exception EncodingException
-- | This exception type is thrown whenever something went wrong during the
-- decoding-process.
data DecodingException
= IllegalCharacter Word8 -- ^ The sequence contained an illegal
-- byte that couldn't be decoded.
| UnexpectedEnd -- ^ more bytes were needed to allow a
-- successfull decoding.
| OutOfRange -- ^ the decoded value was out of the unicode range
| IllegalRepresentation [Word8] -- ^ The character sequence encodes a
-- character, but is illegal.
deriving (Eq,Show,Typeable)
instance Exception DecodingException

View File

@ -7,11 +7,14 @@ module Data.Encoding.GB18030
(GB18030(..))
where
import Control.Exception
import Control.Throws
import Data.Char (chr,ord)
import Data.Word
import Data.Bits
import Data.Encoding.Base
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Encoding.Exception
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import Data.Typeable
@ -27,111 +30,103 @@ import Data.Encoding.GB18030Data
data GB18030 = GB18030 deriving (Eq,Show,Typeable)
instance Encoding GB18030 where
encode _ = encodeMultibyte encodeGB
encodeLazy _ = encodeMultibyteLazy encodeGB
decode _ = decodeMultibyte decodeGB
decodeLazy _ = decodeMultibyteLazy decodeGB
encodable _ ch = ch <= '\x10FFFF'
decodable _ = checkValidity
data DecodingState
= Valid
| Invalid
| Second
| Third
| Fourth
deriving Eq
checkValidity :: ByteString -> Bool
checkValidity bs = BS.foldl' (\st w -> case st of
Invalid -> Invalid
Valid | w<=0x80 -> Valid
| w<=0xFE -> Second
| otherwise -> Invalid
Second | w< 0x30 -> Invalid
| w<=0x39 -> Third
| w<=0x7E -> Valid
| w==0x7F -> Invalid
| w<=0xFE -> Valid
| otherwise -> Invalid
Third | w< 0x81 -> Invalid
| w<=0xFE -> Fourth
| otherwise -> Invalid
Fourth | w< 0x30 -> Invalid
| w<=0x39 -> Valid
| otherwise -> Invalid
) Valid bs == Valid
{- How this works: The nested if-structures form an binary tree over the
- encoding range.
-}
encodeGB :: Char -> (Word8,EncodeState)
encodeGB ch = if ch<='\x4946' -- 1
then (if ch<='\x4055' -- 2
then (if ch<='\x2E80' -- 3
then (if ch<='\x200F' -- 4
then (if ch<'\x0452'
then arr 0x0000 arr1
else range range1)
else (if ch<'\x2643'
then arr 0x2010 arr2
decodeChar _ = do
w1 <- fetchWord8
case () of
_
| w1 <= 0x80 -> return (chr $ fromIntegral w1) -- it's ascii
| w1 <= 0xFE -> do
w2 <- fetchWord8
case () of
_
| w2 < 0x30 -> throwException (IllegalCharacter w2)
| w2 <= 0x39 -> do
w3 <- fetchWord8
case () of
_
| w3 < 0x81 -> throwException (IllegalCharacter w3)
| w3 <= 0xFE -> do
w4 <- fetchWord8
case () of
_
| w4 < 0x30 -> throwException (IllegalCharacter w4)
| w4 <= 0x39 -> decodeGBFour $ linear w1 w2 w3 w4
| otherwise -> throwException (IllegalCharacter w4)
| otherwise -> throwException (IllegalCharacter w3)
| w2 <= 0x7E -> return $ decodeGBTwo $ linear2 w1 w2
| w2 == 0x7F -> throwException (IllegalCharacter w2)
| w2 <= 0xFE -> return $ decodeGBTwo $ linear2 w1 w2
| otherwise -> throwException (IllegalCharacter w2)
| otherwise -> throwException (IllegalCharacter w1)
{- How this works: The nested if-structures form an binary tree over the
- encoding range.
-}
encodeChar _ ch = if ch<='\x4946' -- 1
then (if ch<='\x4055' -- 2
then (if ch<='\x2E80' -- 3
then (if ch<='\x200F' -- 4
then (if ch<'\x0452'
then arr 0x0000 arr1
else range range1)
else (if ch<'\x2643'
then arr 0x2010 arr2
else range range2))
else (if ch<='\x3917' -- 4
then (if ch<'\x361B'
then arr 0x2E81 arr3
else range range3)
else (if ch<'\x3CE1'
then arr 0x3918 arr4
else range range4)))
then (if ch<'\x361B'
then arr 0x2E81 arr3
else range range3)
else (if ch<'\x3CE1'
then arr 0x3918 arr4
else range range4)))
else (if ch<='\x464B' -- 3
then (if ch<='\x4336' -- 4
then (if ch<'\x4160'
then arr 0x4056 arr5
else range range5)
else (if ch<'\x44D7'
then arr 0x4337 arr6
else range range6))
then (if ch<='\x4336' -- 4
then (if ch<'\x4160'
then arr 0x4056 arr5
else range range5)
else (if ch<'\x44D7'
then arr 0x4337 arr6
else range range6))
else (if ch<'\x478E'
then arr 0x464C arr7
else range range7)))
then arr 0x464C arr7
else range range7)))
else (if ch<='\xF92B' -- 2
then (if ch<='\xD7FF' -- 3
then (if ch<='\x4C76' -- 4
then (if ch<'\x49B8'
then arr 0x4947 arr8
else range range8)
else (if ch<'\x9FA6'
then arr 0x4C77 arr9
else range range9))
then (if ch<='\xD7FF' -- 3
then (if ch<='\x4C76' -- 4
then (if ch<'\x49B8'
then arr 0x4947 arr8
else range range8)
else (if ch<'\x9FA6'
then arr 0x4C77 arr9
else range range9))
else (if ch<'\xE865'
then arr 0xD800 arr10
else range range10))
then arr 0xD800 arr10
else range range10))
else (if ch<='\xFFFF' -- 3
then (if ch<='\xFE2F' -- 4
then (if ch<'\xFA2A'
then arr 0xF92C arr11
else range range11)
else (if ch<'\xFFE6'
then arr 0xFE30 arr12
else range range12))
then (if ch<='\xFE2F' -- 4
then (if ch<'\xFA2A'
then arr 0xF92C arr11
else range range11)
else (if ch<'\xFFE6'
then arr 0xFE30 arr12
else range range12))
else (if ch<='\x10FFFF' -- 4
then range range13
else throwDyn (HasNoRepresentation ch))))
where
range r = let
(w1,w2,w3,w4) = delinear (ord ch + r)
in (w1,Put3 w2 w3 w4)
arr off a = let
ind = (ord ch - off)*5
w1 = unsafeIndex a (ind+1)
w2 = unsafeIndex a (ind+2)
w3 = unsafeIndex a (ind+3)
w4 = unsafeIndex a (ind+4)
in (w1,case unsafeIndex a ind of
1 -> Done
2 -> Put1 w2
3 -> Put2 w2 w3
4 -> Put3 w2 w3 w4)
then range range13
else throwException (HasNoRepresentation ch))))
where
range r = let (w1,w2,w3,w4) = delinear (ord ch + r)
in pushWord8 w1 >> pushWord8 w2 >> pushWord8 w3 >> pushWord8 w4
arr off a = let ind = (ord ch - off)*5
w1 = unsafeIndex a (ind+1)
w2 = unsafeIndex a (ind+2)
w3 = unsafeIndex a (ind+3)
w4 = unsafeIndex a (ind+4)
in do
pushWord8 w1
case unsafeIndex a ind of
1 -> return ()
2 -> pushWord8 w2
3 -> pushWord8 w2 >> pushWord8 w3
4 -> pushWord8 w2 >> pushWord8 w3 >> pushWord8 w4
linear :: Word8 -> Word8 -> Word8 -> Word8 -> Int
linear w1 w2 w3 w4
@ -157,32 +152,6 @@ delinear n = let
,fromIntegral w3+0x81
,fromIntegral w4+0x30)
decodeGB :: [Word8] -> (Char,[Word8])
decodeGB (w1:rst)
| w1 <=0x80 = (chr $ fromIntegral w1,rst) -- it's ascii
| w1 <=0xFE = case rst of
w2:rst2
| w2 < 0x30 -> throwDyn (IllegalCharacter w2)
| w2 <=0x39 -> case rst2 of
w3:rst3
| w3 < 0x81 -> throwDyn (IllegalCharacter w3)
| w3 <=0xFE -> case rst3 of
w4:rst4
| w4 < 0x30 -> throwDyn (IllegalCharacter w4)
| w4 <=0x39 -> let
v = linear w1 w2 w3 w4
in (decodeGBFour v,rst4)
| otherwise -> throwDyn (IllegalCharacter w4)
[] -> throwDyn UnexpectedEnd
| otherwise -> throwDyn (IllegalCharacter w3)
[] -> throwDyn UnexpectedEnd
| w2 <=0x7E -> (decodeGBTwo (linear2 w1 w2),rst2)
| w2 ==0x7F -> throwDyn (IllegalCharacter w2)
| w2 <=0xFE -> (decodeGBTwo (linear2 w1 w2),rst2)
| otherwise -> throwDyn (IllegalCharacter w2)
[] -> throwDyn UnexpectedEnd
| otherwise = throwDyn (IllegalCharacter w1)
decodeGBTwo :: Int -> Char
decodeGBTwo n = let
rn = n*2
@ -190,7 +159,7 @@ decodeGBTwo n = let
w2 = unsafeIndex rrarr (rn+1)
in chr $ ((fromIntegral w1) `shiftL` 8) .|. (fromIntegral w2)
decodeGBFour :: Int -> Char
decodeGBFour :: ByteSource m => Int -> m Char
decodeGBFour v = if v<=17858 -- 1
then (if v<=15582 -- 2
then (if v<=11328 -- 3
@ -241,15 +210,15 @@ decodeGBFour v = if v<=17858 -- 1
else range range12))
else (if v<=1237575 && v>=189000
then range range13
else throwDyn OutOfRange)))
else throwException OutOfRange)))
where
arr off a = let
v' = (v-off)*2
w1 = unsafeIndex a v'
w2 = unsafeIndex a (v'+1)
in chr $ ((fromIntegral w1) `shiftL` 8)
in return $ chr $ ((fromIntegral w1) `shiftL` 8)
.|. (fromIntegral w2)
range r = chr (v-r)
range r = return $ chr (v-r)
range1,range2,range3,range4,range5,range6,range7,range8,range9,range10,range11,range12,range13 :: Int
range1 = -286

View File

@ -1,4 +1,4 @@
{-# LANGUAGE CPP, MagicHash #-}
{-# LANGUAGE CPP,MagicHash #-}
module Data.Encoding.GB18030Data where
import Data.ByteString(ByteString)

View File

@ -1,34 +1,75 @@
{-# LANGUAGE CPP,TemplateHaskell #-}
{- This module is used to create arrays from lists in template haskell -}
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.Helper.Template where
import Data.Encoding.Base
import Data.Char
import Data.Word
import Data.Array.IArray (Array,array)
import Data.Maybe (mapMaybe)
import Data.Map as Map (fromList,lookup)
import Data.Array
import Language.Haskell.TH
createCharArray :: [(Integer,Char)] -> Integer -> Integer -> Q Exp
#ifndef __HADDOCK__
createCharArray lst = createArray (map (\(x,y) -> (x,LitE $ CharL y)) lst)
#endif
makeISOInstance :: String -> FilePath -> Q [Dec]
makeISOInstance name file = do
let rname = mkName name
trans <- runIO (readTranslation file)
mp <- encodingMap (validTranslations trans)
arr <- decodingArray (fillTranslations trans)
return [ DataD [] rname [] [NormalC rname []] [''Show]
, InstanceD [] (AppT (ConT ''Encoding) (ConT rname))
[FunD 'encodeChar
[Clause [WildP] (NormalB $ AppE (VarE 'encodeWithMap) (VarE $ mkName "mp"))
[ValD (VarP $ mkName "mp") (NormalB mp) []]
]
,FunD 'decodeChar
[Clause [WildP] (NormalB $ AppE (VarE 'decodeWithArray) (VarE $ mkName "arr"))
[ValD (VarP $ mkName "arr") (NormalB arr) []]
]
]
]
createCharArray :: [(Integer,Maybe Char)] -> Integer -> Integer -> Q Exp
createCharArray lst = createArray (map (\(x,y) -> (x,case y of
Nothing -> ConE 'Nothing
Just c -> AppE (ConE 'Just) (LitE $ CharL c))
) lst)
createArray :: [(Integer,Exp)] -> Integer -> Integer -> Q Exp
#ifndef __HADDOCK__
createArray lst from to = return $ AppE
(AppE
(VarE 'array)
(TupE [LitE $ IntegerL from,LitE $ IntegerL to]))
(ListE [ TupE [LitE $ IntegerL x,y]
| (x,y) <- lst ])
#endif
xmlArray :: [(Char,[Word8])] -> Integer -> Integer -> Q Exp
#ifndef __HADDOCK__
xmlArray lst l u = do
let trans = map (\(ch,bin) ->
(toInteger $ ord ch
,TupE [LitE $ IntegerL (toInteger $ length bin),TupE $ map (\b -> LitE $ IntegerL (fromIntegral b)) bin ++ replicate (4-length bin) (LitE $ IntegerL 0)]
)) (filter (\(c,_) -> ord c <= fromInteger u && ord c >= fromInteger l) lst)
createArray trans l u
#endif
decodingArray :: [(Integer,Maybe Char)] -> Q Exp
decodingArray trans = createCharArray trans 0 255
encodingMap :: [(Integer,Char)] -> Q Exp
encodingMap trans = return $ AppE
(VarE 'fromList)
(ListE [ TupE [LitE $ CharL to,LitE $ IntegerL from]
| (from,to) <- trans])
readTranslation :: FilePath -> IO [(Integer,Maybe Char)]
readTranslation file = do
cont <- readFile file
return $ mapMaybe (\ln -> case ln of
[] -> Nothing
('#':xs) -> Nothing
_ -> case words ln of
(src:"#UNDEFINED":_) -> Just (read src,Nothing) -- XXX: Find a better way to handle this
(src:trg:_) -> Just (read src,Just $ chr $ read trg)
_ -> Nothing
) (lines cont)
fillTranslations :: [(Integer,Maybe Char)] -> [(Integer,Maybe Char)]
fillTranslations = fillTranslations' (-1)
where
fillTranslations' n ((n',c):cs) = (map (\i -> (i,Nothing)) [n+1..n'-1])++((n',c):fillTranslations' n' cs)
fillTranslations' n [] = map (\i -> (i,Nothing)) [n+1..255]
validTranslations :: [(Integer,Maybe Char)] -> [(Integer,Char)]
validTranslations = mapMaybe (\(n,mc) -> case mc of
Nothing -> Nothing
Just c -> Just (n,c))

View File

@ -1,26 +1,18 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
{- | Implements ISO\/IEC 8859-1 alias latin-1 encoding. See
<http://en.wikipedia.org/wiki/ISO/IEC_8859-1> for further information.
-}
module Data.Encoding.ISO88591
(ISO88591(..)
) where
module Data.Encoding.ISO88591 where
import Control.Throws
import Data.Encoding.Base
import Data.Char(ord,chr)
import Data.Word
import Control.Exception
import Data.Typeable
import Data.Encoding.Exception
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Char (ord,chr)
data ISO88591 = ISO88591 deriving (Eq,Show,Typeable)
enc :: Char -> Word8
enc c = if ord c < 256
then fromIntegral $ ord c
else throwDyn (HasNoRepresentation c)
data ISO88591 = ISO88591 deriving (Show)
instance Encoding ISO88591 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = ord c < 256
decode _ = decodeSinglebyte (chr.fromIntegral)
encodeChar _ c
| c > '\255' = throwException (HasNoRepresentation c)
| otherwise = pushWord8 (fromIntegral $ ord c)
decodeChar _ = do
w <- fetchWord8
return (chr $ fromIntegral w)

View File

@ -1,34 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
module Data.Encoding.ISO885910
(ISO885910(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO885910 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data ISO885910 = ISO885910 deriving (Eq,Show,Typeable)
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO885910 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-10.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-10.TXT")
#endif
$( makeISOInstance "ISO885910" "8859-10.TXT" )

View File

@ -1,34 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
module Data.Encoding.ISO885911
(ISO885911(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO885911 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data ISO885911 = ISO885911 deriving (Eq,Show,Typeable)
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO885911 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-11.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-11.TXT")
#endif
$( makeISOInstance "ISO885911" "8859-11.TXT" )

View File

@ -1,34 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
module Data.Encoding.ISO885913
(ISO885913(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO885913 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data ISO885913 = ISO885913 deriving (Eq,Show,Typeable)
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO885913 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-13.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-13.TXT")
#endif
$( makeISOInstance "ISO885913" "8859-13.TXT" )

View File

@ -1,34 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
module Data.Encoding.ISO885914
(ISO885914(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO885914 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data ISO885914 = ISO885914 deriving (Eq,Show,Typeable)
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO885914 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-14.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-14.TXT")
#endif
$( makeISOInstance "ISO885914" "8859-14.TXT" )

View File

@ -1,34 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
module Data.Encoding.ISO885915
(ISO885915(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO885915 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data ISO885915 = ISO885915 deriving (Eq,Show,Typeable)
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO885915 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-15.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-15.TXT")
#endif
$( makeISOInstance "ISO885915" "8859-15.TXT" )

View File

@ -1,34 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
module Data.Encoding.ISO885916
(ISO885916(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO885916 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data ISO885916 = ISO885916 deriving (Eq,Show,Typeable)
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO885916 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-16.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-16.TXT")
#endif
$( makeISOInstance "ISO885916" "8859-16.TXT" )

View File

@ -1,38 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
{- | Implements ISO\/IEC 8859-2 alias latin-2 encoding. See
<http://en.wikipedia.org/wiki/ISO/IEC_8859-2> for further informations.
-}
module Data.Encoding.ISO88592
(ISO88592(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO88592 where
import Data.Array
import Data.Map hiding ((!))
import Data.Word
import Data.Encoding.Base
import Data.ByteString hiding (length,map)
import Prelude hiding (lookup,all)
import Control.Exception
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data ISO88592 = ISO88592 deriving (Eq,Show,Typeable)
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO88592 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (\w -> decodeArr!w)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-2.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-2.TXT")
#endif
$( makeISOInstance "ISO88592" "8859-2.TXT" )

View File

@ -1,36 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
{- | Implements ISO 8859-3 encoding, alias latin-3, alias south european
-}
module Data.Encoding.ISO88593
(ISO88593(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO88593 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data ISO88593 = ISO88593 deriving (Eq,Show,Typeable)
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO88593 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-3.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-3.TXT")
#endif
$( makeISOInstance "ISO88593" "8859-3.TXT" )

View File

@ -1,34 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
module Data.Encoding.ISO88594
(ISO88594(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO88594 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data ISO88594 = ISO88594 deriving (Eq,Show,Typeable)
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO88594 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-4.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-4.TXT")
#endif
$( makeISOInstance "ISO88594" "8859-4.TXT" )

View File

@ -1,30 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
module Data.Encoding.ISO88595
(ISO88595(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO88595 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data ISO88595 = ISO88595 deriving (Eq,Show,Typeable)
instance Encoding ISO88595 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-5.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-5.TXT")
#endif
$( makeISOInstance "ISO88595" "8859-5.TXT" )

View File

@ -1,30 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
module Data.Encoding.ISO88596
(ISO88596(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO88596 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data ISO88596 = ISO88596 deriving (Eq,Show,Typeable)
instance Encoding ISO88596 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-6.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-6.TXT")
#endif
$( makeISOInstance "ISO88596" "8859-6.TXT" )

View File

@ -1,30 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
module Data.Encoding.ISO88597
(ISO88597(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO88597 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data ISO88597 = ISO88597 deriving (Eq,Show,Typeable)
instance Encoding ISO88597 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-7.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-7.TXT")
#endif
$( makeISOInstance "ISO88597" "8859-7.TXT" )

View File

@ -1,30 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
module Data.Encoding.ISO88598
(ISO88598(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO88598 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data ISO88598 = ISO88598 deriving (Eq,Show,Typeable)
instance Encoding ISO88598 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-8.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-8.TXT")
#endif
$( makeISOInstance "ISO88598" "8859-8.TXT" )

View File

@ -1,30 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
module Data.Encoding.ISO88599
(ISO88599(..)) where
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO88599 where
import Data.Array ((!),Array)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
import Data.Encoding.Helper.Template (makeISOInstance)
data ISO88599 = ISO88599 deriving (Eq,Show,Typeable)
instance Encoding ISO88599 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-9.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-9.TXT")
#endif
$( makeISOInstance "ISO88599" "8859-9.TXT" )

View File

@ -5,16 +5,18 @@
module Data.Encoding.KOI8R
(KOI8R(..)) where
import Control.Exception (throwDyn)
import Control.Throws
import Data.Array.Unboxed
import Data.Char (ord,chr)
import qualified Data.ByteString.Lazy as Lazy
import Data.Map hiding (map,(!))
import Data.Word
import Prelude hiding (lookup)
import Data.Typeable
import Data.Encoding.Base
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Encoding.Exception
data KOI8R = KOI8R deriving (Eq,Show,Typeable)
@ -44,22 +46,14 @@ koi8rList =
,'\x042c','\x042b','\x0417','\x0428','\x042d','\x0429','\x0427','\x042a'
]
koi8rDecode :: Word8 -> Char
koi8rDecode ch
| ch < 128 = chr $ fromIntegral ch
| otherwise = koi8rArr!ch
koi8rEncode :: Char -> Word8
koi8rEncode ch
| ch < '\128' = fromIntegral $ ord ch
| otherwise = case lookup ch koi8rMap of
Just w -> w
Nothing -> throwDyn (HasNoRepresentation ch)
instance Encoding KOI8R where
encode _ = encodeSinglebyte koi8rEncode
encodeLazy _ = encodeSinglebyteLazy koi8rEncode
encodable _ c = (c < '\128') || (member c koi8rMap)
decode _ = decodeSinglebyte koi8rDecode
decodeLazy _ str = concatMap (decodeSinglebyte koi8rDecode) (Lazy.toChunks str)
decodable _ = const True
decodeChar _ = do
w <- fetchWord8
if w < 128
then return $ chr $ fromIntegral w
else return $ koi8rArr!w
encodeChar _ ch
| ch < '\128' = pushWord8 $ fromIntegral $ ord ch
| otherwise = case lookup ch koi8rMap of
Just w -> pushWord8 w
Nothing -> throwException (HasNoRepresentation ch)

View File

@ -5,13 +5,16 @@
module Data.Encoding.KOI8U
(KOI8U(..)) where
import Control.Exception (throwDyn)
import Data.Encoding.Base
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Encoding.Exception
import Control.Throws
import Data.Word
import Data.Array.Unboxed
import Data.Encoding.Base
import Data.Char (chr,ord)
import Data.Map (Map,fromList,lookup,member)
import qualified Data.ByteString.Lazy as Lazy
import Prelude hiding (lookup)
import Data.Typeable
@ -43,22 +46,14 @@ koi8uList =
,'\x042c','\x042b','\x0417','\x0428','\x042d','\x0429','\x0427','\x042a'
]
koi8uDecode :: Word8 -> Char
koi8uDecode ch
| ch < 128 = chr $ fromIntegral ch
| otherwise = koi8uArr!ch
koi8uEncode :: Char -> Word8
koi8uEncode ch
| ch < '\128' = fromIntegral $ ord ch
| otherwise = case lookup ch koi8uMap of
Just w -> w
Nothing -> throwDyn (HasNoRepresentation ch)
instance Encoding KOI8U where
encode _ = encodeSinglebyte koi8uEncode
encodeLazy _ = encodeSinglebyteLazy koi8uEncode
encodable _ c = (c < '\128') || (member c koi8uMap)
decode _ = decodeSinglebyte koi8uDecode
decodeLazy _ str = concatMap (decodeSinglebyte koi8uDecode) (Lazy.toChunks str)
decodable _ = const True
decodeChar _ = do
w <- fetchWord8
if w < 128
then return $ chr $ fromIntegral w
else return $ koi8uArr!w
encodeChar _ ch
| ch < '\128' = pushWord8 $ fromIntegral $ ord ch
| otherwise = case lookup ch koi8uMap of
Just w -> pushWord8 w
Nothing -> throwException (HasNoRepresentation ch)

View File

@ -3,150 +3,79 @@
See <http://en.wikipedia.org/wiki/UTF-16> for more information.
-}
module Data.Encoding.UTF16
(UTF16(..)
) where
(UTF16(..)
) where
import Data.Encoding.Base
import Data.Char(ord,chr)
import Data.Encoding.ByteSink
import Data.Encoding.ByteSource
import Data.Encoding.Exception
import Control.Throws
import Data.Bits
import Data.Int
import Data.Word
import Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import Prelude hiding (length)
import Control.Exception
import Data.Dynamic (toDyn)
import Data.Char
import Data.Typeable
import Data.Word
data UTF16
= UTF16 -- ^ Decodes big and little endian, encodes big endian.
| UTF16BE -- ^ Big endian decoding and encoding, fails if the string isn\'t actually big endian.
| UTF16LE -- ^ Little endian decoding and encoding.
deriving (Eq,Show,Typeable)
= UTF16 -- ^ Decodes big and little endian, encodes big endian.
| UTF16BE -- ^ Big endian decoding and encoding, fails if the string isn\'t actually big endian.
| UTF16LE -- ^ Little endian decoding and encoding.
deriving (Eq,Show,Typeable)
utf16enc :: Bool -> (EncodeState,String) -> Maybe (Word8,(EncodeState,String))
utf16enc _ (Done,[]) = Nothing
utf16enc True (Done,x:xs)
| n<=0x0000FFFF = Just
(fromIntegral $ n `shiftR` 8
,(Put1 (fromIntegral $ n),xs))
| n<=0x0010FFFF = Just
(fromIntegral $ 0xD8 .|. (n' `shiftR` 18)
,(Put3 (fromIntegral $ (n' `shiftR` 10))
(fromIntegral $
0xDC .|. ((n' `shiftR` 8) .&. 0x03))
(fromIntegral n'),xs))
| otherwise = throwDyn $ HasNoRepresentation x
where
n = ord x
n' = n - 0x10000
utf16enc False (Done,x:xs)
| n<=0x0000FFFF = Just
(fromIntegral $ n
,(Put1 (fromIntegral $ n `shiftR` 8),xs))
| n<=0x0010FFFF = Just
(fromIntegral n'
,(Put3 (fromIntegral $
0xDC .|. ((n' `shiftR` 8) .&. 0x03))
(fromIntegral $ (n' `shiftR` 10))
(fromIntegral $ 0xD8 .|. (n' `shiftR` 18)),xs))
| otherwise = throwDyn $ HasNoRepresentation x
where
n = ord x
n' = n - 0x10000
utf16enc _ (Put3 w1 w2 w3,xs) = Just (w1,(Put2 w2 w3,xs))
utf16enc _ (Put2 w1 w2,xs) = Just (w1,(Put1 w2,xs))
utf16enc _ (Put1 w1,xs) = Just (w1,(Done,xs))
readBOM :: ByteSource m => m (Either Char UTF16)
readBOM = do
ch <- decodeChar UTF16
case ch of
'\xFEFF' -> return (Right UTF16BE)
'\xFFFE' -> return (Right UTF16LE)
_ -> return (Left ch)
decodeUTF16 :: ByteSource m => (m Word16) -> m Char
decodeUTF16 fetch = do
w1 <- fetch
if w1 < 0xD800 || w1 > 0xDFFF
then return (chr $ fromIntegral w1)
else (if w1 > 0xDBFF
then throwException (IllegalCharacter (fromIntegral (w1 `shiftR` 8)))
else (do
w2 <- fetch
if w2 < 0xDC00 || w2 > 0xDFFF
then throwException (IllegalCharacter (fromIntegral (w2 `shiftR` 8)))
else let v = ((fromIntegral (w1 .&. 0x3FF)) `shiftL` 10)
.|. (fromIntegral (w2 .&. 0x3FF))
in return $ chr (v+0x10000)
)
)
{-# SPECIALIZE utf16dec :: Bool -> Word8 -> Word8 -> Word8 -> Word8 -> (Char,Int) #-}
{-# SPECIALIZE utf16dec :: Bool -> Word8 -> Word8 -> Word8 -> Word8 -> (Char,Int64) #-}
utf16dec :: Num a => Bool -> Word8 -> Word8 -> Word8 -> Word8 -> (Char,a)
utf16dec be s1 s2 s3 s4
| w1< 0xD8 || w1> 0xDF
= (chr $ ((fromIntegral w1) `shiftL` 8) .|. (fromIntegral w2),2)
| w1> 0xDB = throwDyn $ IllegalCharacter w1
| w3< 0xDC || w3>0xDF = throwDyn $ IllegalCharacter w3
| otherwise = (chr $ (((fromIntegral w1 .&. 0x03) `shiftL` 18)
.|. ((fromIntegral w2) `shiftL` 10)
.|. ((fromIntegral w3 .&. 0x03) `shiftL` 8)
.|. (fromIntegral w4)) + 0x10000,4)
where
(w1,w2,w3,w4) = if be then (s1,s2,s3,s4) else (s2,s1,s4,s3)
encodeUTF16 :: ByteSink m => (Word16 -> m ()) -> Char -> m ()
encodeUTF16 push ch
| val<=0xDFFF && val>=0xD800 = throwException (HasNoRepresentation ch)
| val<=0x0000FFFF = push $ fromIntegral val
| val<=0x0010FFFF = let v = val - 0x10000
w1 = (fromIntegral (v `shiftR` 10)) .|. 0xD800
w2 = ((fromIntegral v) .&. 0x3FF) .|. 0xDC00
in push w1 >> push w2
| otherwise = throwException (HasNoRepresentation ch)
where
val = ord ch
instance Encoding UTF16 where
encode enc str = unfoldr (utf16enc (enc/=UTF16LE)) (case enc of
UTF16 -> Put2 0xFE 0xFF
_ -> Done,str)
encodeLazy enc str = LBS.unfoldr (utf16enc (enc/=UTF16LE)) (case enc of
UTF16 -> Put2 0xFE 0xFF
_ -> Done,str)
encodable _ c = ord c <= 0x0010FFFF
decode bo str = case findByteOrder str of
Nothing -> decode' (bo/=UTF16LE) 0
Just big -> decode' big 2
where
l = BS.length str
decode' be i = if i>=l
then []
else c:decode' be (i+took)
where
(c,took) = mapException (\ex -> case ex of
ErrorCall _ -> DynException (toDyn UnexpectedEnd)
_ -> ex) (utf16dec be s1 s2 s3 s4)
s1 = index str i
s2 = index str (i+1)
s3 = index str (i+2)
s4 = index str (i+3)
decodeLazy bo str = case findByteOrderLazy str of
Nothing -> decode' (bo/=UTF16LE) 0
Just big -> decode' big 2
where
l = LBS.length str
decode' be i = if i>=l
then []
else c:decode' be (i+took)
where
(c,took) = mapException (\ex -> case ex of
ErrorCall _ -> DynException (toDyn UnexpectedEnd)
_ -> ex) (utf16dec be s1 s2 s3 s4)
s1 = LBS.index str i
s2 = LBS.index str (i+1)
s3 = LBS.index str (i+2)
s4 = LBS.index str (i+3)
decodable bo str = case findByteOrder str of
Nothing -> check' (bo/=UTF16LE) (length str) 0
Just big -> check' big (length str) 2
where
check' be m i
| m == i = True
| m == i+1 = False
| w1< 0xD8 || w1> 0xDF = check' be m (i+2)
| w1> 0xDB = False
| m <= i+3 = False
| w3< 0xDC || w3>0xDF = False
| otherwise = check' be m (i+4)
where
(w1,w3) = if be then (s1,s3) else (s2,s4)
s1 = index str i
s2 = index str (i+1)
s3 = index str (i+2)
s4 = index str (i+3)
encodeChar UTF16LE = encodeUTF16 pushWord16le
encodeChar _ = encodeUTF16 pushWord16be
decodeChar UTF16LE = decodeUTF16 fetchWord16le
decodeChar _ = decodeUTF16 fetchWord16be
findByteOrder :: ByteString -> Maybe Bool
findByteOrder str
| length str < 2 = Nothing
| w1 == 0xFE && w2 == 0xFF = Just True
| w1 == 0xFF && w2 == 0xFE = Just False
| otherwise = Nothing
where
w1 = index str 0
w2 = index str 1
encode UTF16 str = do
encodeChar UTF16 '\xFEFF'
mapM_ (encodeChar UTF16) str
encode enc str = mapM_ (encodeChar enc) str
findByteOrderLazy :: LBS.ByteString -> Maybe Bool
findByteOrderLazy str = case LBS.unpack (LBS.take 2 str) of
[w1,w2]
| w1 == 0xFE && w2 == 0xFF -> Just True
| w1 == 0xFF && w2 == 0xFE -> Just False
| otherwise -> Nothing
_ -> Nothing
decode UTF16 = do
res <- readBOM
case res of
Left c -> do
cs <- untilM sourceEmpty (decodeChar UTF16BE)
return (c:cs)
Right bom -> decode bom
decode enc = untilM sourceEmpty (decodeChar enc)

View File

@ -3,17 +3,17 @@
See <http://en.wikipedia.org/wiki/UTF-32> for more information.
-}
module Data.Encoding.UTF32
(UTF32(..))
where
(UTF32(..))
where
import Data.Bits
import Data.Char (ord,chr)
import Data.Encoding.Base
import Data.Word
import Control.Exception (throwDyn)
import Data.Encoding.ByteSink
import Data.Encoding.ByteSource
import Data.Encoding.Exception
import Data.Char
import Data.Typeable
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
data UTF32
= UTF32 -- ^ Detects big or little endian through the use of the BOM (Byte Order Mask) character. Defaults to big endian if not present.
@ -21,70 +21,26 @@ data UTF32
| UTF32LE -- ^ Encodes and decodes using the little endian encoding.
deriving (Eq,Show,Typeable)
bom :: Char
bom = '\xFEFF'
instance Encoding UTF32 where
encode UTF32 str = encodeMultibyte encodeUTF32be (bom:str)
encode UTF32LE str = encodeMultibyte encodeUTF32le str
encode UTF32BE str = encodeMultibyte encodeUTF32be str
encodeLazy UTF32 str = encodeMultibyteLazy encodeUTF32be (bom:str)
encodeLazy UTF32LE str = encodeMultibyteLazy encodeUTF32le str
encodeLazy UTF32BE str = encodeMultibyteLazy encodeUTF32be str
encodable _ c = ord c < 0x0010FFFF
decode UTF32 str = let
(start,rest) = BS.splitAt 4 str
in case BS.unpack start of
[0x00,0x00,0xFE,0xFF] -> decode UTF32BE rest
[0xFE,0xFF,0x00,0x00] -> decode UTF32LE rest
_ -> decode UTF32BE str
decode UTF32LE str = decodeMultibyte decodeUTF32le str
decode UTF32BE str = decodeMultibyte decodeUTF32be str
decodeLazy UTF32 str = let
(start,rest) = LBS.splitAt 4 str
in case LBS.unpack start of
[0x00,0x00,0xFE,0xFF] -> decodeLazy UTF32BE rest
[0xFE,0xFF,0x00,0x00] -> decodeLazy UTF32LE rest
_ -> decodeLazy UTF32BE str
decodeLazy UTF32LE str = decodeMultibyteLazy decodeUTF32le str
decodeLazy UTF32BE str = decodeMultibyteLazy decodeUTF32be str
encodeChar UTF32LE ch = pushWord32le (fromIntegral $ ord ch)
encodeChar _ ch = pushWord32be (fromIntegral $ ord ch)
decodeChar UTF32LE = do
wrd <- fetchWord32le
return $ chr $ fromIntegral wrd
decodeChar _ = do
wrd <- fetchWord32be
return $ chr $ fromIntegral wrd
encode UTF32 str = do
encodeChar UTF32 '\xFEFF'
mapM_ (encodeChar UTF32) str
encode enc str = mapM_ (encodeChar enc) str
encodeUTF32be :: Char -> (Word8,EncodeState)
encodeUTF32be ch = let
w = ord ch
w1 = fromIntegral $ w `shiftR` 24
w2 = fromIntegral $ w `shiftR` 16
w3 = fromIntegral $ w `shiftR` 8
w4 = fromIntegral $ w
in (w1,Put3 w2 w3 w4)
encodeUTF32le :: Char -> (Word8,EncodeState)
encodeUTF32le ch = let
w = ord ch
w1 = fromIntegral $ w `shiftR` 24
w2 = fromIntegral $ w `shiftR` 16
w3 = fromIntegral $ w `shiftR` 8
w4 = fromIntegral $ w
in (w4,Put3 w3 w2 w1)
decodeUTF32be :: [Word8] -> (Char,[Word8])
decodeUTF32be (w1:w2:w3:w4:rest) = let
v = (fromIntegral w1 `shiftL` 24) .|.
(fromIntegral w2 `shiftL` 16) .|.
(fromIntegral w3 `shiftL` 8) .|.
(fromIntegral w4)
in if v < 0x0010FFFF
then (chr v,rest)
else throwDyn (IllegalRepresentation [w1,w2,w3,w4])
decodeUTF32be _ = throwDyn UnexpectedEnd
decodeUTF32le :: [Word8] -> (Char,[Word8])
decodeUTF32le (w1:w2:w3:w4:rest) = let
v = (fromIntegral w4 `shiftL` 24) .|.
(fromIntegral w3 `shiftL` 16) .|.
(fromIntegral w2 `shiftL` 8) .|.
(fromIntegral w1)
in if v < 0x0010FFFF
then (chr v,rest)
else throwDyn (IllegalRepresentation [w1,w2,w3,w4])
decodeUTF32le _ = throwDyn UnexpectedEnd
decode UTF32 = do
ch <- fetchWord32be
case ch of
0x0000FEFF -> untilM sourceEmpty (decodeChar UTF32BE)
0xFFFE0000 -> untilM sourceEmpty (decodeChar UTF32LE)
_ -> do
rest <- untilM sourceEmpty (decodeChar UTF32)
return ((chr $ fromIntegral ch):rest)
decode enc = untilM sourceEmpty (decodeChar enc)

View File

@ -2,156 +2,124 @@
{- | This module implements UTF-8 encoding and decoding as in RFC 3629.
See <http://en.wikipedia.org/wiki/UTF-8> for more information.
-}
module Data.Encoding.UTF8
(UTF8(..)) where
module Data.Encoding.UTF8 where
import Control.Throws
import Data.Char
import Data.Bits
import Data.Char (ord,chr)
import Data.Encoding.Base
import Data.ByteString
import Data.Word
import Prelude hiding (length)
import Control.Exception
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Encoding.Exception
import Data.Typeable
data UTF8
= UTF8 -- ^ Very forgiving decoding mechanism, accepts everything that it can make any sense of.
| UTF8Strict -- ^ More strict decoding, doesn\'t accept sequences that have a too long representation and checks bits that aren\'t used in the decoding.
deriving (Eq,Show,Typeable)
encodeUTF8 :: Char -> (Word8,EncodeState)
encodeUTF8 x
| n<=0x0000007F = (v,Done)
| n<=0x000007FF = (fromIntegral $ 0xC0 .|. (n `shiftR` 6)
,Put1 (0x80 .|. (v .&. 0x3F)))
| n<=0x0000FFFF = (fromIntegral $ 0xE0 .|. (n `shiftR` 12)
,Put2 (fromIntegral $
0x80 .|. ((n `shiftR` 6) .&. 0x3F))
(fromIntegral $
0x80 .|. (n .&. 0x3F)))
| n<=0x0010FFFF = (fromIntegral $ 0xF0 .|. (n `shiftR` 18)
,Put3 (fromIntegral $
0x80 .|. ((n `shiftR` 12) .&. 0x3F))
(fromIntegral $
0x80 .|. ((n `shiftR` 6) .&. 0x3F))
(fromIntegral $
0x80 .|. (n .&. 0x3F)))
| otherwise = throwDyn (HasNoRepresentation x)
where
n = ord x
v = fromIntegral $ ord x
decodeUTF8 :: [Word8] -> (Char,[Word8])
decodeUTF8 ~(w1:rest1)
| w1<=0x7F = (chr $ fromIntegral w1,rest1)
| w1<=0xBF = throwDyn (IllegalCharacter w1)
| w1<=0xDF = case rest1 of
(w2:rest2) -> (chr $ ((fromIntegral $ w1 .&. 0x1F) `shiftL` 6)
.|. (fromIntegral (w2 .&. 0x3F)),rest2)
_ -> throwDyn UnexpectedEnd
| w1<=0xEF = case rest1 of
(w2:w3:rest3) -> (chr $ ((fromIntegral $ w1 .&. 0x0F) `shiftL` 12)
.|. ((fromIntegral $ w2 .&. 0x3F) `shiftL` 6)
.|. (fromIntegral $ w3 .&. 0x3F),rest3)
_ -> throwDyn UnexpectedEnd
| w1<=0xF7 = case rest1 of
(w2:w3:w4:rest4) -> (chr $ ((fromIntegral $ w1 .&. 0x07) `shiftL` 18)
.|. ((fromIntegral $ w2 .&. 0x3F) `shiftL` 12)
.|. ((fromIntegral $ w3 .&. 0x3F) `shiftL` 6)
.|. (fromIntegral $ w4 .&. 0x3F),rest4)
_ -> throwDyn UnexpectedEnd
| otherwise = throwDyn (IllegalCharacter w1)
decodeUTF8Strict :: [Word8] -> (Char,[Word8])
decodeUTF8Strict ~(w1:rest1)
| w1<=0x7F = (chr $ fromIntegral w1,rest1)
| w1<=0xBF = throwDyn (IllegalCharacter w1)
| w1<=0xDF = case rest1 of
(w2:rest2)
| invalidExtend w2 -> throwDyn (IllegalCharacter w2)
| otherwise -> let
v1 = w1 .&. 0x1F
in if v1 <= 1
then throwDyn (IllegalRepresentation [w1,w2])
else (chr $ ((fromIntegral v1) `shiftL` 6)
.|. (fromIntegral (w2 .&. 0x3F)),rest2)
_ -> throwDyn UnexpectedEnd
| w1<=0xEF = case rest1 of
(w2:w3:rest3)
| invalidExtend w2 -> throwDyn (IllegalCharacter w2)
| invalidExtend w3 -> throwDyn (IllegalCharacter w3)
| otherwise -> let
v1 = w1 .&. 0x0F
v2 = w2 .&. 0x3F
in if v1 == 0 && v2 < 0x20
then throwDyn (IllegalRepresentation [w1,w2,w3])
else (chr $ ((fromIntegral v1) `shiftL` 12)
.|. ((fromIntegral v2) `shiftL` 6)
.|. (fromIntegral $ w3 .&. 0x3F),rest3)
_ -> throwDyn UnexpectedEnd
| w1<=0xF7 = case rest1 of
(w2:w3:w4:rest4)
| invalidExtend w2 -> throwDyn (IllegalCharacter w2)
| invalidExtend w3 -> throwDyn (IllegalCharacter w3)
| invalidExtend w4 -> throwDyn (IllegalCharacter w4)
| otherwise -> let
v1 = w1 .&. 0x07
v2 = w2 .&. 0x3F
in if v1 == 0 && v2 < 0x10
then throwDyn (IllegalRepresentation [w1,w2,w3,w4])
else (chr $ ((fromIntegral $ w1 .&. 0x07) `shiftL` 18)
.|. ((fromIntegral $ w2 .&. 0x3F) `shiftL` 12)
.|. ((fromIntegral $ w3 .&. 0x3F) `shiftL` 6)
.|. (fromIntegral $ w4 .&. 0x3F),rest4)
_ -> throwDyn UnexpectedEnd
| otherwise = throwDyn (IllegalCharacter w1)
where
invalidExtend wrd = wrd .&. 0xC0 /= 0x80
data UTF8AnalyzeState
= Skip !Int
| CheckAndSkip !Word8 !Int
| Ok
| Failed
deriving Eq
data UTF8 = UTF8 -- ^ Very forgiving decoding mechanism, accepts everything that it can make any sense of.
| UTF8Strict -- ^ More strict decoding, doesn\'t accept sequences that have a too long representation and checks bits that aren\'t used in the decoding
deriving (Eq,Show,Typeable)
instance Encoding UTF8 where
encode _ = encodeMultibyte encodeUTF8
encodeLazy _ = encodeMultibyteLazy encodeUTF8
encodable _ c = ord c <= 0x0010FFFF
decode UTF8 = decodeMultibyte decodeUTF8
decode UTF8Strict = decodeMultibyte decodeUTF8Strict
decodeLazy UTF8 = decodeMultibyteLazy decodeUTF8
decodeLazy UTF8Strict = decodeMultibyteLazy decodeUTF8Strict
decodable UTF8 str = (foldl' (\st w -> case st of
Ok | w<=0x7F -> Ok
| w<=0xBF -> Failed
| w<=0xDF -> Skip 0
| w<=0xEF -> Skip 1
| w<=0xF7 -> Skip 2
| otherwise -> Failed
Failed -> Failed
Skip n -> if w .&. 0xC0 == 0x80
then (if n == 0 then Ok else Skip (n-1))
else Failed) Ok str) == Ok
decodable UTF8Strict str = (foldl' (\st w -> case st of
Ok | w<=0x7F -> Ok
| w<=0xBF -> Failed
| w<=0xDF -> if w .&. 0x1F <= 1
then Failed
else Skip 0
| w<=0xEF -> if w .&. 0x0F == 0
then CheckAndSkip 0x20 1
else Skip 1
| w<=0xF7 -> if w .&. 0x07 == 0
then CheckAndSkip 0x10 2
else Skip 2
| otherwise -> Failed
Failed -> Failed
Skip n -> if w .&. 0xC0 == 0x80
then (if n == 0 then Ok else Skip (n-1))
else Failed
CheckAndSkip chk n -> if w .&. 0xC0 == 0x80 && w .&. 0x3F >= chk
then (if n == 0 then Ok else Skip (n-1))
else Failed
) Ok str) == Ok
encodeChar _ c
| n <= 0x0000007F = p8 n
| n <= 0x000007FF = do
p8 $ 0xC0 .|. (n `shiftR` 6)
p8 $ 0x80 .|. (n .&. 0x3F)
| n <= 0x0000FFFF = do
p8 $ 0xE0 .|. (n `shiftR` 12)
p8 $ 0x80 .|. ((n `shiftR` 6) .&. 0x3F)
p8 $ 0x80 .|. (n .&. 0x3F)
| n <= 0x0010FFFF = do
p8 $ 0xF0 .|. (n `shiftR` 18)
p8 $ 0x80 .|. ((n `shiftR` 12) .&. 0x3F)
p8 $ 0x80 .|. ((n `shiftR` 6) .&. 0x3F)
p8 $ 0x80 .|. (n .&. 0x3F)
| otherwise = throwException (HasNoRepresentation c)
where
n = ord c
p8 = pushWord8.fromIntegral
decodeChar UTF8 = do
w1 <- fetchWord8
case () of
_
| w1 <= 0x7F -> return $ chr $ fromIntegral w1
| w1 <= 0xBF -> throwException (IllegalCharacter w1)
| w1 <= 0xDF -> do
w2 <- fetchWord8
return $ chr $
((fromIntegral $ w1 .&. 0x1F) `shiftL` 6)
.|. (fromIntegral $ w2 .&. 0x3F)
| w1 <= 0xEF -> do
w2 <- fetchWord8
w3 <- fetchWord8
let v1 = w1 .&. 0x0F
v2 = w2 .&. 0x3F
v3 = w3 .&. 0x3F
return $ chr $
((fromIntegral v1) `shiftL` 12)
.|. ((fromIntegral v2) `shiftL` 6)
.|. (fromIntegral v3)
| w1 <= 0xF7 -> do
w2 <- fetchWord8
w3 <- fetchWord8
w4 <- fetchWord8
let v1 = w1 .&. 0x07
v2 = w2 .&. 0x3F
v3 = w3 .&. 0x3F
v4 = w4 .&. 0x3F
return $ chr $
((fromIntegral v1) `shiftL` 18)
.|. ((fromIntegral v2) `shiftL` 12)
.|. ((fromIntegral v3) `shiftL` 6)
.|. (fromIntegral v4)
| otherwise -> throwException (IllegalCharacter w1)
decodeChar UTF8Strict = do
w1 <- fetchWord8
case () of
_
| w1 <= 0x7F -> return $ chr $ fromIntegral w1
| w1 <= 0xBF -> throwException (IllegalCharacter w1)
| w1 <= 0xDF -> do
w2 <- fetchExtend8
let v1 = w1 .&. 0x1F
if v1 <= 1
then throwException (IllegalRepresentation [w1,w2])
else return $ chr $
((fromIntegral v1) `shiftL` 6)
.|. (fromIntegral $ w2 .&. 0x3F)
| w1 <= 0xEF -> do
w2 <- fetchExtend8
w3 <- fetchExtend8
let v1 = w1 .&. 0x0F
v2 = w2 .&. 0x3F
v3 = w3 .&. 0x3F
if v1 == 0 && v2 < 0x20
then throwException (IllegalRepresentation [w1,w2,w3])
else return $ chr $
((fromIntegral v1) `shiftL` 12)
.|. ((fromIntegral v2) `shiftL` 6)
.|. (fromIntegral v3)
| w1 <= 0xF7 -> do
w2 <- fetchExtend8
w3 <- fetchExtend8
w4 <- fetchExtend8
let v1 = w1 .&. 0x07
v2 = w2 .&. 0x3F
v3 = w3 .&. 0x3F
v4 = w4 .&. 0x3F
if v1 == 0 && v2 < 0x10
then throwException (IllegalRepresentation [w1,w2,w3,w4])
else return $ chr $
((fromIntegral v1) `shiftL` 18)
.|. ((fromIntegral v2) `shiftL` 12)
.|. ((fromIntegral v3) `shiftL` 6)
.|. (fromIntegral v4)
| otherwise -> throwException (IllegalCharacter w1)
where
invalidExtend wrd = wrd .&. 0xC0 /= 0x80
fetchExtend8 = do
w <- fetchWord8
if invalidExtend w
then throwException (IllegalCharacter w)
else return w

View File

@ -1,25 +1,80 @@
{-# LANGUAGE ImplicitParams,ForeignFunctionInterface #-}
{- | This module provides a replacement for the normal (unicode unaware) IO functions of haskell.
By using implicit parameters, it can be used almost as a drop-in replacement.
-}
module System.IO.Encoding
(getSystemEncoding
,hPutStr
,hGetContents) where
(getSystemEncoding
,getContents
,hPutStr
,hPutStrLn
,hGetContents
,readFile
,writeFile
,appendFile
,hGetChar
,hGetLine
,hPutChar
,interact
,print) where
import Foreign.C.String
import Data.Encoding
import System.IO hiding (hPutStr,hGetContents)
import qualified Data.ByteString.Lazy as BS
import System.IO (Handle,stdout,stdin)
import Prelude hiding (print,getContents,readFile,writeFile,appendFile,interact)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString as BS
import Control.Monad.Reader (runReaderT)
-- | Like the normal 'System.IO.hGetContents', but decodes the input using an
-- encoding.
hGetContents :: Encoding e => e -> Handle -> IO String
hGetContents enc h = do
str <- BS.hGetContents h
return $ decodeLazy enc str
hGetContents :: (Encoding e,?enc :: e) => Handle -> IO String
hGetContents h = do
str <- LBS.hGetContents h
return $ decodeLazyByteString ?enc str
getContents :: (Encoding e,?enc :: e) => IO String
getContents = do
str <- LBS.getContents
return $ decodeLazyByteString ?enc str
-- | Like the normal 'System.IO.hPutStr', but encodes the output using an
-- encoding.
hPutStr :: Encoding e => e -> Handle -> String -> IO ()
hPutStr enc h str = BS.hPut h (encodeLazy enc str)
hPutStr :: (Encoding e,?enc :: e) => Handle -> String -> IO ()
hPutStr h str = LBS.hPut h (encodeLazyByteString ?enc str)
hPutStrLn :: (Encoding e,?enc :: e) => Handle -> String -> IO ()
hPutStrLn h str = do
LBS.hPut h (encodeLazyByteString ?enc str)
LBS.hPut h (encodeLazyByteString ?enc "\n")
print :: (Encoding e,Show a,?enc :: e) => a -> IO ()
print x = hPutStrLn stdout (show x)
readFile :: (Encoding e,?enc :: e) => FilePath -> IO String
readFile fn = LBS.readFile fn >>= return.(decodeLazyByteString ?enc)
writeFile :: (Encoding e,?enc :: e) => FilePath -> String -> IO ()
writeFile fn str = LBS.writeFile fn $ encodeLazyByteString ?enc str
appendFile :: (Encoding e,?enc :: e) => FilePath -> String -> IO ()
appendFile fn str = LBS.appendFile fn $ encodeLazyByteString ?enc str
hGetChar :: (Encoding e,?enc :: e) => Handle -> IO Char
hGetChar h = runReaderT (decodeChar ?enc) h
hGetLine :: (Encoding e,?enc :: e) => Handle -> IO String
hGetLine h = do
line <- BS.hGetLine h
return $ decodeStrictByteString ?enc line
hPutChar :: (Encoding e,?enc :: e) => Handle -> Char -> IO ()
hPutChar h c = runReaderT (encodeChar ?enc c) h
interact :: (Encoding e,?enc :: e) => (String -> String) -> IO ()
interact f = do
line <- hGetLine stdin
hPutStrLn stdout (f line)
foreign import ccall "system_encoding.h get_system_encoding"
get_system_encoding :: IO CString
@ -29,4 +84,4 @@ getSystemEncoding :: IO DynEncoding
getSystemEncoding = do
enc <- get_system_encoding
str <- peekCString enc
return $ encodingFromString str
return $ encodingFromString str

View File

@ -4,9 +4,10 @@ module Test.Tester where
import Data.Encoding
import Test.HUnit
import Data.Word
import Data.Char
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import Control.Exception (catchDyn,evaluate)
import Test.QuickCheck hiding (Testable)
data EncodingTest
= forall enc. (Encoding enc,Show enc) =>
@ -17,52 +18,60 @@ data EncodingTest
EncodingError enc String EncodingException
instance Testable EncodingTest where
test (EncodingTest enc src trg) = TestList
[TestLabel (show enc ++ " encodable")
(TestCase $ (all (encodable enc) src) @=? True)
,TestLabel (show enc ++ " encoding (strict)")
(TestCase $ bstr @=? (encode enc src))
,TestLabel (show enc ++ " encoding (lazy)")
(TestCase $ lbstr @=? (encodeLazy enc src))
,TestLabel (show enc ++ " decodable")
(TestCase $ (decodable enc bstr) @=? True)
,TestLabel (show enc ++ " decoding (strict)")
(TestCase $ src @=? (decode enc bstr))
,TestLabel (show enc ++ " decoding (lazy)")
(TestCase $ src @=? (decodeLazy enc lbstr))
]
where
bstr = BS.pack trg
lbstr = LBS.pack trg
test (DecodingError enc trg what) = TestList
[TestLabel (show what++" not decodable in "++show enc) $
TestCase $ assert $ not $ decodable enc (BS.pack trg)
,TestLabel (show enc ++ " decoding error (strict)") $ TestCase $
catchDyn (do
mapM_ evaluate (decode enc (BS.pack trg))
assertFailure "No exception thrown"
)
(\exc -> exc @=? what)
,TestLabel (show enc ++ " decoding error (lazy)") $ TestCase $
catchDyn (do
mapM_ evaluate (decodeLazy enc (LBS.pack trg))
assertFailure "No exception thrown"
)
(\exc -> exc @=? what)
]
test (EncodingError enc src what) = TestList
[TestLabel (show src ++ " not encodable in " ++ show enc) $
TestCase $ assert $ not $ all (encodable enc) src
,TestLabel (show enc ++ " encoding error (strict)") $ TestCase $
catchDyn (do
evaluate (encode enc src)
assertFailure "No exception thrown"
)
(\exc -> exc @=? what)
,TestLabel (show enc ++ " encoding error (lazy)") $ TestCase $
catchDyn (do
evaluate (encodeLazy enc src)
assertFailure "No exception thrown"
)
(\exc -> exc @=? what)
]
test (EncodingTest enc src trg)
= TestList
[TestLabel (show enc ++ " encoding")
(TestCase $ encodeStrictByteStringExplicit enc src
@?= Right (BS.pack trg))
,TestLabel (show enc ++ " decoding")
(TestCase $ decodeStrictByteStringExplicit enc (BS.pack trg)
@=? Right src)
]
test (DecodingError enc src ex)
= TestLabel (show enc ++ " decoding error")
(TestCase $ decodeStrictByteStringExplicit enc (BS.pack src) @=? Left ex)
charGen :: Gen Char
charGen = let
ascii = choose (0x00,0x7F) >>= return.chr
oneByte = choose (0x80,0xFF) >>= return.chr
twoByte = choose (0x0100,0xFFFF) >>= return.chr
threeByte = choose (0x010000,0x10FFFF) >>= return.chr
in frequency [(40,ascii),(30,oneByte),(20,twoByte),(10,threeByte)]
instance Arbitrary Char where
arbitrary = charGen
coarbitrary x = id
instance Arbitrary Word8 where
arbitrary = choose (0x00,0xFF::Int) >>= return.fromIntegral
coarbitrary x = id
quickCheckEncoding :: Encoding enc => enc -> IO ()
quickCheckEncoding e = do
quickCheck (encodingIdentity e)
quickCheck (decodingIdentity e)
encodingIdentity :: Encoding enc => enc -> String -> Property
encodingIdentity e str
= trivial (null str)
$ case encoded of
Left err -> trivial True True
Right res -> case decodeStrictByteStringExplicit e res of
Left err -> property False
Right res' -> property (str==res')
where
encoded = encodeStrictByteStringExplicit e str
decodingIdentity :: Encoding enc => enc -> [Word8] -> Property
decodingIdentity e wrd
= trivial (null wrd)
$ case decoded of
Left err -> trivial True True
Right res -> case encodeStrictByteStringExplicit e res of
Left err -> property False
Right res' -> property (bstr==res')
where
bstr = BS.pack wrd
decoded = decodeStrictByteStringExplicit e bstr

View File

@ -1,119 +1,158 @@
module Test.Tests where
import Test.Tester
import Data.Encoding
import Data.Encoding.ASCII
import Data.Encoding.UTF8
import Data.Encoding.UTF16
import Data.Encoding.UTF32
import Data.Encoding.ISO88592
import Data.Encoding.ISO88593
import Data.Encoding.ISO88594
import Data.Encoding.ISO88595
import Data.Encoding.ISO88596
import Data.Encoding.ISO88597
import Data.Encoding.ISO88598
import Data.Encoding.ISO88599
import Data.Encoding.ISO885910
import Data.Encoding.ISO885911
import Data.Encoding.ISO885913
import Data.Encoding.ISO885914
import Data.Encoding.BootString
import Test.Tester
import Test.HUnit
import Test.QuickCheck hiding (test)
import Data.Char (ord)
asciiTests :: Test
asciiTests = TestList $ map test $
[EncodingTest ASCII
"Hello, world!"
[0x48,0x65,0x6C,0x6C,0x6F,0x2C,0x20,0x77,0x6F,0x72,0x6C,0x64,0x21]
,EncodingError ASCII
"\x0041\x2262\x0391\x002E"
(HasNoRepresentation '\x2262')
]
identityTests :: IO ()
identityTests = do
putStrLn "for UTF8..."
quickCheckEncoding UTF8Strict
putStrLn "for UTF16..."
quickCheckEncoding UTF16BE
quickCheckEncoding UTF16LE
putStrLn "for UTF32..."
quickCheck $ encodingIdentity UTF32
putStrLn "for ISO88592..."
quickCheckEncoding ISO88592
putStrLn "for ISO88593..."
quickCheckEncoding ISO88593
putStrLn "for ISO88594..."
quickCheckEncoding ISO88594
putStrLn "for ISO88595..."
quickCheckEncoding ISO88595
putStrLn "for ISO88596..."
quickCheckEncoding ISO88596
putStrLn "for ISO88597..."
quickCheckEncoding ISO88597
putStrLn "for ISO88598..."
quickCheckEncoding ISO88598
putStrLn "for ISO88599..."
quickCheckEncoding ISO88599
putStrLn "for ISO885910..."
quickCheckEncoding ISO885910
putStrLn "for ISO885911..."
quickCheckEncoding ISO885911
putStrLn "for ISO885913..."
quickCheckEncoding ISO885913
putStrLn "for ISO885914..."
quickCheckEncoding ISO885914
putStrLn "for Punycode..."
quickCheck $ encodingIdentity punycode
utf8Tests :: Test
utf8Tests = TestList $ map test $
-- Simple encoding tests
concat [[EncodingTest enc "\x0041\x2262\x0391\x002E"
[0x41,0xE2,0x89,0xA2,0xCE,0x91,0x2E]
,EncodingTest enc "\xD55C\xAD6D\xC5B4"
[0xED,0x95,0x9C,0xEA,0xB5,0xAD,0xEC,0x96,0xB4]
,EncodingTest enc "\x65E5\x672C\x8A9E"
[0xE6,0x97,0xA5,0xE6,0x9C,0xAC,0xE8,0xAA,0x9E]
,EncodingTest enc "\x233B4"
[0xF0,0xA3,0x8E,0xB4]
,EncodingTest enc ""
[]
-- First possible sequence of a certain length
,EncodingTest enc "\x0000"
[0x00]
,EncodingTest enc "\x0080"
[0xC2,0x80]
,EncodingTest enc "\x0800"
[0xE0,0xA0,0x80]
,EncodingTest enc "\x10000"
[0xF0,0x90,0x80,0x80]
-- Last possible sequence of a certain length
,EncodingTest enc "\x007F"
[0x7F]
,EncodingTest enc "\x07FF"
[0xDF,0xBF]
,EncodingTest enc "\xFFFF"
[0xEF,0xBF,0xBF]
-- Other boundaries
,EncodingTest enc "\xD7FF"
[0xED,0x9F,0xBF]
,EncodingTest enc "\xE000"
[0xEE,0x80,0x80]
,EncodingTest enc "\xFFFD"
[0xEF,0xBF,0xBD]
-- Illegal starting characters
,DecodingError enc
[0x65,0x55,0x85]
(IllegalCharacter 0x85)
-- Unexpected end
,DecodingError enc
[0x41,0xE2,0x89,0xA2,0xCE]
UnexpectedEnd
,DecodingError enc
[0x41,0xE2,0x89]
UnexpectedEnd
,DecodingError enc
[0x41,0xE2]
UnexpectedEnd]
| enc <- [UTF8,UTF8Strict]
]++
[DecodingError UTF8 [0xFE] (IllegalCharacter 0xFE)
,DecodingError UTF8 [0xFF] (IllegalCharacter 0xFF)
-- Overlong representations of '/'
,DecodingError UTF8Strict [0xC0,0xAF]
(IllegalRepresentation [0xC0,0xAF])
,DecodingError UTF8Strict [0xE0,0x80,0xAF]
(IllegalRepresentation [0xE0,0x80,0xAF])
,DecodingError UTF8Strict [0xF0,0x80,0x80,0xAF]
(IllegalRepresentation [0xF0,0x80,0x80,0xAF])
-- Maximum overlong sequences
,DecodingError UTF8Strict [0xC1,0xBF]
(IllegalRepresentation [0xC1,0xBF])
,DecodingError UTF8Strict [0xE0,0x9F,0xBF]
(IllegalRepresentation [0xE0,0x9F,0xBF])
,DecodingError UTF8Strict [0xF0,0x8F,0xBF,0xBF]
(IllegalRepresentation [0xF0,0x8F,0xBF,0xBF])
-- Overlong represenations of '\NUL'
,DecodingError UTF8Strict [0xC0,0x80]
(IllegalRepresentation [0xC0,0x80])
,DecodingError UTF8Strict [0xE0,0x80,0x80]
(IllegalRepresentation [0xE0,0x80,0x80])
,DecodingError UTF8Strict [0xF0,0x80,0x80,0x80]
(IllegalRepresentation [0xF0,0x80,0x80,0x80])
-- Invalid extends
-- 2 of 2
,DecodingError UTF8Strict [0xCC,0x1C,0xE0]
(IllegalCharacter 0x1C)
-- 2 of 3
,DecodingError UTF8Strict [0xE3,0x6C,0xB3]
(IllegalCharacter 0x6C)
-- 3 of 3
,DecodingError UTF8Strict [0xE3,0xB4,0x6D]
(IllegalCharacter 0x6D)
-- 2 of 4
,DecodingError UTF8Strict [0xF2,0x6C,0xB3,0xB3]
(IllegalCharacter 0x6C)
-- 3 of 4
,DecodingError UTF8Strict [0xF2,0xB3,0x6C,0xB3]
(IllegalCharacter 0x6C)
-- 4 of 4
,DecodingError UTF8Strict [0xF2,0xB3,0xB3,0x6C]
(IllegalCharacter 0x6C)
]
utf8Tests = TestList $ map test $ concat
[[EncodingTest enc "\x0041\x2262\x0391\x002E"
[0x41,0xE2,0x89,0xA2,0xCE,0x91,0x2E]
,EncodingTest enc "\xD55C\xAD6D\xC5B4"
[0xED,0x95,0x9C,0xEA,0xB5,0xAD,0xEC,0x96,0xB4]
,EncodingTest enc "\x65E5\x672C\x8A9E"
[0xE6,0x97,0xA5,0xE6,0x9C,0xAC,0xE8,0xAA,0x9E]
,EncodingTest enc "\x233B4"
[0xF0,0xA3,0x8E,0xB4]
,EncodingTest enc ""
[]
-- First possible sequence of a certain length
,EncodingTest enc "\x0000"
[0x00]
,EncodingTest enc "\x0080"
[0xC2,0x80]
,EncodingTest enc "\x0800"
[0xE0,0xA0,0x80]
,EncodingTest enc "\x10000"
[0xF0,0x90,0x80,0x80]
-- Last possible sequence of a certain length
,EncodingTest enc "\x007F"
[0x7F]
,EncodingTest enc "\x07FF"
[0xDF,0xBF]
,EncodingTest enc "\xFFFF"
[0xEF,0xBF,0xBF]
-- Other boundaries
,EncodingTest enc "\xD7FF"
[0xED,0x9F,0xBF]
,EncodingTest enc "\xE000"
[0xEE,0x80,0x80]
,EncodingTest enc "\xFFFD"
[0xEF,0xBF,0xBD]
-- Illegal starting characters
,DecodingError enc
[0x65,0x55,0x85]
(IllegalCharacter 0x85)
-- Unexpected end
,DecodingError enc
[0x41,0xE2,0x89,0xA2,0xCE]
UnexpectedEnd
,DecodingError enc
[0x41,0xE2,0x89]
UnexpectedEnd
,DecodingError enc
[0x41,0xE2]
UnexpectedEnd
]
| enc <- [UTF8,UTF8Strict]
]++
[DecodingError UTF8 [0xFE] (IllegalCharacter 0xFE)
,DecodingError UTF8 [0xFF] (IllegalCharacter 0xFF)
-- Overlong representations of '/'
,DecodingError UTF8Strict [0xC0,0xAF]
(IllegalRepresentation [0xC0,0xAF])
,DecodingError UTF8Strict [0xE0,0x80,0xAF]
(IllegalRepresentation [0xE0,0x80,0xAF])
,DecodingError UTF8Strict [0xF0,0x80,0x80,0xAF]
(IllegalRepresentation [0xF0,0x80,0x80,0xAF])
-- Maximum overlong sequences
,DecodingError UTF8Strict [0xC1,0xBF]
(IllegalRepresentation [0xC1,0xBF])
,DecodingError UTF8Strict [0xE0,0x9F,0xBF]
(IllegalRepresentation [0xE0,0x9F,0xBF])
,DecodingError UTF8Strict [0xF0,0x8F,0xBF,0xBF]
(IllegalRepresentation [0xF0,0x8F,0xBF,0xBF])
-- Overlong represenations of '\NUL'
,DecodingError UTF8Strict [0xC0,0x80]
(IllegalRepresentation [0xC0,0x80])
,DecodingError UTF8Strict [0xE0,0x80,0x80]
(IllegalRepresentation [0xE0,0x80,0x80])
,DecodingError UTF8Strict [0xF0,0x80,0x80,0x80]
(IllegalRepresentation [0xF0,0x80,0x80,0x80])
-- Invalid extends
-- 2 of 2
,DecodingError UTF8Strict [0xCC,0x1C,0xE0]
(IllegalCharacter 0x1C)
-- 2 of 3
,DecodingError UTF8Strict [0xE3,0x6C,0xB3]
(IllegalCharacter 0x6C)
-- 3 of 3
,DecodingError UTF8Strict [0xE3,0xB4,0x6D]
(IllegalCharacter 0x6D)
-- 2 of 4
,DecodingError UTF8Strict [0xF2,0x6C,0xB3,0xB3]
(IllegalCharacter 0x6C)
-- 3 of 4
,DecodingError UTF8Strict [0xF2,0xB3,0x6C,0xB3]
(IllegalCharacter 0x6C)
-- 4 of 4
,DecodingError UTF8Strict [0xF2,0xB3,0xB3,0x6C]
(IllegalCharacter 0x6C)
]
utf16Tests :: Test
utf16Tests = TestList $ map test $
@ -129,9 +168,9 @@ utf16Tests = TestList $ map test $
[0x02,0xE8]
,DecodingError UTF16LE [0x65,0xDC]
(IllegalCharacter 0xDC)
,DecodingError UTF16BE [0xDC]
,DecodingError UTF16BE [0xDC,0x33]
(IllegalCharacter 0xDC)
,DecodingError UTF16BE [0xD9,0x78,0xDA]
,DecodingError UTF16BE [0xD9,0x78,0xDA,0x33]
(IllegalCharacter 0xDA)
,DecodingError UTF16BE [0xD9,0x78,0xDA,0x66]
(IllegalCharacter 0xDA)
@ -183,5 +222,9 @@ punycodeTests = TestList $ map test $
-- Vietnamese
,punyTest "TạisaohọkhôngthểchỉnóitiếngViệt"
"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"
{-,punyTest "foo\xC2\xAD\xCD\x8F\xE1\xA0\x86\xE1\xA0\x8B\
\bar\xE2\x80\x8B\xE2\x81\xA0\
\baz\xEF\xB8\x80\xEF\xB8\x88\xEF\xB8\x8F\xEF\xBB\xBF"
"foobarbaz"-}
]
where punyTest str outp = EncodingTest punycode str (map (fromIntegral.ord) outp)
where punyTest str outp = EncodingTest punycode str (map (fromIntegral.ord) outp)

View File

@ -1,63 +1,37 @@
Name: encoding
Version: 0.4.1
Version: 0.5.0
Author: Henning Günther
Maintainer: h.guenther@tu-bs.de
License: BSD3
Synopsis: A library for various character encodings
Stability: alpha
Description:
Haskell has excellect handling of unicode, the Char type covers all unicode chars. Unfortunatly, there's no possibility to read or write something to the outer world in an encoding other than ascii due to the lack of support for encodings. This library should help with that.
Category: Codec
Homepage: http://code.haskell.org/encoding/
Cabal-Version: >=1.2
Build-Type: Simple
Extra-Source-Files:
8859-2.TXT
8859-3.TXT
8859-4.TXT
8859-5.TXT
8859-6.TXT
8859-7.TXT
8859-8.TXT
8859-9.TXT
8859-10.TXT
8859-11.TXT
8859-13.TXT
8859-14.TXT
8859-15.TXT
8859-16.TXT
CP1250.TXT
CP1251.TXT
CP1252.TXT
CP1253.TXT
CP1254.TXT
CP1255.TXT
CP1256.TXT
CP1257.TXT
CP1258.TXT
gb-18030-2000.xml
system_encoding.h
create_gb18030_data.sh
NEWS
Flag splitBase
description: Choose the new smaller, split-up base package.
Library
if flag(splitBase)
Build-Depends: bytestring, base >= 3, template-haskell, containers, array, regex-compat
Build-Depends: bytestring, base >= 3, binary, mtl, containers, extensible-exceptions, array, template-haskell, regex-compat
else
Build-Depends: base < 3, template-haskell, regex-compat
Extensions: TemplateHaskell,CPP,ExistentialQuantification,ForeignFunctionInterface
C-Sources: system_encoding.c
Include-Dirs: .
Install-Includes: system_encoding.h
Build-Depends: base < 3, binary, extensible-exceptions, template-haskell
Exposed-Modules:
Data.Encoding
Data.Encoding.ByteSource
Data.Encoding.ByteSink
Data.Encoding.Exception
Control.Throws
Data.Encoding.ASCII
Data.Encoding.UTF8
Data.Encoding.UTF16
Data.Encoding.UTF32
Data.Encoding.KOI8R
Data.Encoding.KOI8U
Data.Encoding.ISO88591
Data.Encoding.ISO88592
Data.Encoding.ISO88593
@ -73,7 +47,6 @@ Library
Data.Encoding.ISO885914
Data.Encoding.ISO885915
Data.Encoding.ISO885916
Data.Encoding.BootString
Data.Encoding.CP1250
Data.Encoding.CP1251
Data.Encoding.CP1252
@ -83,12 +56,7 @@ Library
Data.Encoding.CP1256
Data.Encoding.CP1257
Data.Encoding.CP1258
Data.Encoding.KOI8R
Data.Encoding.KOI8U
Data.Encoding.GB18030
System.IO.Encoding
Other-Modules:
Data.Encoding.Base
Data.Encoding.GB18030Data
Data.Encoding.Helper.Template
Data.Encoding.BootString
System.IO.Encoding