Rewrite to support more sources and changing the encoding dynamically

Now it's possible to change the character encoding while de-/encoding.
Also, it's possible to use any data structure as a source or target of the de-/encoding process.

darcs-hash:20090221203100-a4fee-6da31f2e37c30a3f5cd5f10af71984209488bb0b
This commit is contained in:
Henning Guenther 2009-02-21 12:31:00 -08:00
parent 8b16078e5e
commit dee13764d9
44 changed files with 1414 additions and 1769 deletions

28
Control/Throws.hs Normal file
View File

@ -0,0 +1,28 @@
{-# LANGUAGE FlexibleInstances,MultiParamTypeClasses #-}
module Control.Throws where
import Control.Exception.Extensible
import Control.Monad.Identity
import Control.Monad.Reader
import Control.Monad.State
class Throws e m where
throwException :: e -> m a
instance Exception e => Throws e Identity where
throwException = throw
{-instance MonadError e m => Throws e m where
throwException = throwError-}
instance Throws e (Either e) where
throwException = Left
instance Exception e => Throws e IO where
throwException = throw
instance Throws e m => Throws e (StateT s m) where
throwException x = StateT (\s -> throwException x)
instance Throws e m => Throws e (ReaderT s m) where
throwException x = ReaderT (\s -> throwException x)

View File

@ -1,24 +1,45 @@
{-# LANGUAGE ExistentialQuantification,CPP #-} {-# LANGUAGE FlexibleContexts,ExistentialQuantification #-}
module Data.Encoding module Data.Encoding
(Encoding(..) (module Data.Encoding.Exception
,EncodingException(..) ,module Data.Encoding.ByteSource
,DecodingException(..) ,module Data.Encoding.ByteSink
,recode ,Encoding(..)
,recodeLazy ,DynEncoding
,DynEncoding() ,recode
#ifndef USE_HPC ,encodeString
,encodingFromString ,encodeStringExplicit
,encodingFromStringMaybe ,decodeString
#endif ,decodeStringExplicit
) ,encodeLazyByteString
where ,encodeLazyByteStringExplicit
,decodeLazyByteString
,decodeLazyByteStringExplicit
,encodeStrictByteString
,encodeStrictByteStringExplicit
,decodeStrictByteString
,decodeStrictByteStringExplicit
,encodingFromString
)
where
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as Lazy (ByteString)
import Data.Typeable
import Data.Encoding.Base import Data.Encoding.Base
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Encoding.Exception
import Data.Sequence
import Data.Foldable(toList)
import Data.Char
import Control.Monad.State
import Control.Monad.Identity
import Control.Monad.Error.Class
import Data.Binary.Put
import Data.Binary.Get
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
#ifndef USE_HPC
import Data.Encoding.ASCII import Data.Encoding.ASCII
import Data.Encoding.UTF8 import Data.Encoding.UTF8
import Data.Encoding.UTF16 import Data.Encoding.UTF16
@ -52,36 +73,61 @@ import Data.Encoding.KOI8U
import Data.Encoding.GB18030 import Data.Encoding.GB18030
import Data.Char import Data.Char
import Text.Regex import Text.Regex
#endif
-- | An untyped encoding. Used in 'System.IO.Encoding.getSystemEncoding'. data DynEncoding = forall enc. Encoding enc => DynEncoding enc
data DynEncoding = forall t. (Encoding t,Show t,Typeable t,Eq t)
=> DynEncoding t
instance Encoding DynEncoding where instance Encoding DynEncoding where
encode (DynEncoding enc) = encode enc decodeChar (DynEncoding e) = decodeChar e
encodeLazy (DynEncoding enc) = encodeLazy enc encodeChar (DynEncoding e) = encodeChar e
encodable (DynEncoding enc) = encodable enc decode (DynEncoding e) = decode e
decode (DynEncoding enc) = decode enc encode (DynEncoding e) = encode e
decodeLazy (DynEncoding enc) = decodeLazy enc
decodable (DynEncoding enc) = decodable enc
instance Show DynEncoding where recode :: (Encoding enc1,Encoding enc2,ByteSource m,ByteSink m) => enc1 -> enc2 -> m ()
show (DynEncoding enc) = "DynEncoding "++show enc recode e1 e2 = untilM_ sourceEmpty (decodeChar e1 >>= encodeChar e2)
instance Eq DynEncoding where encodeString :: Encoding enc => enc -> String -> String
(DynEncoding enc1) == (DynEncoding enc2) = case cast enc2 of encodeString e str = toList $ viewl $ execState (encode e str) empty
Nothing -> False
Just renc2 -> enc1 == renc2
-- | This decodes a string from one encoding and encodes it into another. encodeStringExplicit :: Encoding enc => enc -> String -> Either EncodingException String
recode :: (Encoding from,Encoding to) => from -> to -> ByteString -> ByteString encodeStringExplicit e str = execStateT (encode e str) empty >>= return.toList.viewl
recode enc_f enc_t bs = encode enc_t (decode enc_f bs)
recodeLazy :: (Encoding from,Encoding to) => from -> to -> Lazy.ByteString -> Lazy.ByteString decodeString :: Encoding enc => enc -> String -> String
recodeLazy enc_f enc_t bs = encodeLazy enc_t (decodeLazy enc_f bs) decodeString e str = evalState (decode e) str
decodeStringExplicit :: Encoding enc => enc -> String -> Either DecodingException String
decodeStringExplicit e str = evalStateT (decode e) str
encodeLazyByteString :: Encoding enc => enc -> String -> LBS.ByteString
encodeLazyByteString e str = runPut $ encode e str
encodeLazyByteStringExplicit :: Encoding enc => enc -> String -> Either EncodingException LBS.ByteString
encodeLazyByteStringExplicit e str = let PutME g = encode e str
in case g of
Left err -> Left err
Right (p,()) -> Right $ runPut p
decodeLazyByteString :: Encoding enc => enc -> LBS.ByteString -> String
decodeLazyByteString e str = runGet (decode e) str
decodeLazyByteStringExplicit :: Encoding enc => enc -> LBS.ByteString -> Either DecodingException String
decodeLazyByteStringExplicit e str = evalStateT (decode e) str
encodeStrictByteString :: Encoding enc => enc -> String -> BS.ByteString
encodeStrictByteString e str = snd $ createStrict $ encode e str
encodeStrictByteStringExplicit :: Encoding enc => enc -> String -> Either EncodingException BS.ByteString
encodeStrictByteStringExplicit e str = let StrictSinkE g = encode e str
(r,bstr) = createStrict g
in case r of
Left err -> Left err
Right _ -> Right bstr
decodeStrictByteString :: Encoding enc => enc -> BS.ByteString -> String
decodeStrictByteString e str = evalState (decode e) str
decodeStrictByteStringExplicit :: Encoding enc => enc -> BS.ByteString -> Either DecodingException String
decodeStrictByteStringExplicit e str = evalStateT (decode e) str
#ifndef USE_HPC
-- | Like 'encodingFromString' but returns 'Nothing' instead of throwing an error -- | Like 'encodingFromString' but returns 'Nothing' instead of throwing an error
encodingFromStringMaybe :: String -> Maybe DynEncoding encodingFromStringMaybe :: String -> Maybe DynEncoding
encodingFromStringMaybe codeName = case (normalizeEncoding codeName) of encodingFromStringMaybe codeName = case (normalizeEncoding codeName) of
@ -266,12 +312,9 @@ encodingFromStringMaybe codeName = case (normalizeEncoding codeName) of
normalizeEncoding s = map toLower $ subRegex sep s "_" normalizeEncoding s = map toLower $ subRegex sep s "_"
sep = mkRegex "[^0-9A-Za-z]+" sep = mkRegex "[^0-9A-Za-z]+"
-- | Takes the name of an encoding and creates a dynamic encoding from it. -- | Takes the name of an encoding and creates a dynamic encoding from it.
encodingFromString :: String -> DynEncoding encodingFromString :: String -> DynEncoding
encodingFromString str = maybe encodingFromString str = maybe
(error $ "Data.Encoding.encodingFromString: Unknown encoding: "++show str) (error $ "Data.Encoding.encodingFromString: Unknown encoding: "++show str)
id id
(encodingFromStringMaybe str) (encodingFromStringMaybe str)
#endif

View File

@ -1,31 +1,17 @@
{-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE DeriveDataTypeable #-}
-- | ASCII (American Standard Code for Information Interchange) is the module Data.Encoding.ASCII where
-- \"normal\" computer encoding using the byte values 0-127 to represent
-- characters. Refer to <http://en.wikipedia.org/wiki/ASCII> for
-- more information.
module Data.Encoding.ASCII
(ASCII(..)) where
import Control.Exception (throwDyn) import Data.Char
import Data.ByteString (pack)
import qualified Data.ByteString.Lazy as Lazy (pack)
import Data.ByteString.Char8 (unpack)
import Data.Char (ord)
import qualified Data.ByteString.Lazy as Lazy
import Data.Encoding.Base import Data.Encoding.Base
import Data.Word import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Typeable import Data.Typeable
data ASCII = ASCII deriving (Show,Eq,Typeable) data ASCII = ASCII deriving (Show,Eq,Typeable)
charToASCII :: Char -> Word8
charToASCII ch = if ch < '\128'
then fromIntegral $ ord ch
else throwDyn (HasNoRepresentation ch)
instance Encoding ASCII where instance Encoding ASCII where
encode _ str = pack (map charToASCII str) decodeChar _ = do
encodeLazy _ str = Lazy.pack (map charToASCII str) w <- fetchWord8
encodable _ ch = ch < '\128' return $ chr $ fromIntegral w
decode _ = unpack encodeChar _ c = do
decodable _ = const True pushWord8 $ fromIntegral $ ord c

View File

@ -1,157 +1,45 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} module Data.Encoding.Base where
module Data.Encoding.Base import Data.Encoding.Exception
(Encoding(..) import Data.Encoding.ByteSource
,EncodeState(..) import Data.Encoding.ByteSink
,encodeMultibyte
,encodeMultibyteLazy
,decodeMultibyte
,decodeMultibyteLazy
,encodeSinglebyte
,encodeSinglebyteLazy
,decodeSinglebyte
,EncodingException(..)
,DecodingException(..)
,decodingArray
,encodingMap)
where
import Data.Array(array) import Control.Throws
import Data.ByteString (ByteString,unfoldrN,unfoldr,length,index,unpack) import Data.Array as Array
import qualified Data.ByteString.Lazy as LBS import Data.Map as Map hiding ((!))
import Data.Encoding.Helper.Template
#if __GLASGOW_HASKELL__>=608
import Data.ByteString.Unsafe(unsafeIndex)
#else
import Data.ByteString.Base(unsafeIndex)
#endif
import Data.Map (Map,fromList,lookup)
import Data.Char(chr)
import Data.Maybe(mapMaybe)
import Data.Typeable
import Data.Word import Data.Word
import Prelude hiding (lookup,length) import Data.Char
import qualified Prelude
import Control.Exception
import Data.Dynamic(toDyn)
import Language.Haskell.TH
{- | Represents an encoding, supporting various methods of de- and encoding.
Minimal complete definition: encode, decode
-}
class Encoding enc where class Encoding enc where
-- | Encode a 'String' into a strict 'ByteString'. Throws the decodeChar :: ByteSource m => enc -> m Char
-- 'HasNoRepresentation'-Exception if it encounters an unrepresentable encodeChar :: ByteSink m => enc -> Char -> m ()
-- character. decode :: ByteSource m => enc -> m String
encode :: enc -> String -> ByteString decode e = untilM sourceEmpty (decodeChar e)
-- | Encode a 'String' into a lazy 'Data.ByteString.Lazy.ByteString'. encode :: ByteSink m => enc -> String -> m ()
encodeLazy :: enc -> String -> LBS.ByteString encode e = mapM_ (encodeChar e)
encodeLazy e str = LBS.fromChunks [encode e str]
-- | Whether or not the given 'Char' is representable in this encoding. Default: 'True'.
encodable :: enc -> Char -> Bool
encodable _ _ = True
-- | Decode a strict 'ByteString' into a 'String'. If the string is not
-- decodable, a 'DecodingException' is thrown.
decode :: enc -> ByteString -> String
decodeLazy :: enc -> LBS.ByteString -> String
decodeLazy e str = concatMap (decode e) (LBS.toChunks str)
-- | Whether or no a given 'ByteString' is decodable. Default: 'True'.
decodable :: enc -> ByteString -> Bool
decodable _ _ = True
encodeMultibyte :: (Char -> (Word8,EncodeState)) -> String -> ByteString untilM :: Monad m => m Bool -> m a -> m [a]
encodeMultibyte f str = unfoldr (\st -> case st of untilM check act = do
(Done,[]) -> Nothing end <- check
(Done,x:xs) -> let (w,st) = f x in Just (w,(st,xs)) if end
(Put1 w1,xs) -> Just (w1,(Done,xs)) then return []
(Put2 w1 w2,xs) -> Just (w1,(Put1 w2,xs)) else (do
(Put3 w1 w2 w3,xs) -> Just (w1,(Put2 w2 w3,xs))) (Done,str) x <- act
xs <- untilM check act
return (x:xs)
)
encodeMultibyteLazy :: (Char -> (Word8,EncodeState)) -> String -> LBS.ByteString untilM_ :: Monad m => m Bool -> m a -> m ()
encodeMultibyteLazy f str = LBS.unfoldr (\ ~(st,rest) -> case st of untilM_ check act = untilM check act >> return ()
Done -> case rest of
[] -> Nothing
x:xs -> let ~(w,st) = f x in Just (w,(st,xs))
Put1 w1 -> Just (w1,(Done,rest))
Put2 w1 w2 -> Just (w1,(Put1 w2,rest))
Put3 w1 w2 w3 -> Just (w1,(Put2 w2 w3,rest))) (Done,str)
decodeMultibyte :: ([Word8] -> (Char,[Word8])) -> ByteString -> String encodeWithMap :: ByteSink m => Map Char Word8 -> Char -> m ()
decodeMultibyte f str = decode (unpack str) encodeWithMap mp c = case Map.lookup c mp of
where Nothing -> throwException $ HasNoRepresentation c
decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst) Just v -> pushWord8 v
decodeMultibyteLazy :: ([Word8] -> (Char,[Word8])) -> LBS.ByteString -> String
decodeMultibyteLazy f str = decode (LBS.unpack str)
where
decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst)
encodeSinglebyte :: (Char -> Word8) -> String -> ByteString
encodeSinglebyte f str = fst $ unfoldrN (Prelude.length str) (\st -> case st of
[] -> Nothing
(x:xs) -> Just (f x,xs)) str
encodeSinglebyteLazy :: (Char -> Word8) -> String -> LBS.ByteString
encodeSinglebyteLazy f str = LBS.unfoldr (\st -> case st of
[] -> Nothing
(x:xs) -> Just (f x,xs)) str
decodeSinglebyte :: (Word8 -> Char) -> ByteString -> String
decodeSinglebyte f str = map f (unpack str)
data EncodeState
= Done
| Put1 !Word8
| Put2 !Word8 !Word8
| Put3 !Word8 !Word8 !Word8
deriving Show
-- | This exception type is thrown whenever something went wrong during the
-- encoding-process.
data EncodingException
= HasNoRepresentation Char -- ^ Thrown if a specific character
-- is not representable in an encoding.
deriving (Eq,Show,Typeable)
-- | This exception type is thrown whenever something went wrong during the
-- decoding-process.
data DecodingException
= IllegalCharacter Word8 -- ^ The sequence contained an illegal
-- byte that couldn't be decoded.
| UnexpectedEnd -- ^ more bytes were needed to allow a
-- successfull decoding.
| OutOfRange -- ^ the decoded value was out of the unicode range
| IllegalRepresentation [Word8] -- ^ The character sequence encodes a
-- character, but is illegal.
deriving (Eq,Show,Typeable)
decodingArray :: FilePath -> Q Exp
decodingArray file = do
trans <- runIO (readTranslation file)
createCharArray trans 0 255
encodingMap :: FilePath -> Q Exp
#ifndef __HADDOCK__
encodingMap file = do
trans <- runIO (readTranslation file)
return $ AppE
(VarE 'fromList)
(ListE [ TupE [LitE $ CharL to,LitE $ IntegerL from]
| (from,to) <- trans])
#endif
readTranslation :: FilePath -> IO [(Integer,Char)]
readTranslation file = do
cont <- readFile file
return $ mapMaybe (\ln -> case ln of
[] -> Nothing
('#':xs) -> Nothing
_ -> case words ln of
(src:"#UNDEFINED":_) -> Just (read src,'\xFFFD') -- XXX: Find a better way to handle this
(src:trg:_) -> Just (read src,chr $ read trg)
_ -> Nothing
) (lines cont)
decodeWithArray :: ByteSource m => Array Word8 (Maybe Char) -> m Char
decodeWithArray arr = do
w <- fetchWord8
case arr!w of
Nothing -> throwException $ IllegalCharacter w
Just c -> return c

View File

@ -6,11 +6,15 @@ module Data.Encoding.BootString
,punycode) where ,punycode) where
import Data.Encoding.Base import Data.Encoding.Base
import Data.ByteString.Char8 (pack,unpack) import Data.Encoding.Exception
import Data.List (unfoldr,partition) import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Control.Throws
import Data.Word
import Data.List (unfoldr,partition,find)
import Data.Char (ord,chr) import Data.Char (ord,chr)
import Data.Typeable import Data.Typeable
import Control.Exception (throwDyn) import Control.Monad (when)
data BootString = BootString data BootString = BootString
{base :: Int {base :: Int
@ -34,27 +38,27 @@ punycode = BootString
,init_n = 0x80 ,init_n = 0x80
} }
punyValue :: Char -> Int punyValue :: ByteSource m => Word8 -> m Int
punyValue c punyValue c
| n < 0x30 = norep | n < 0x30 = norep
| n <= 0x39 = n-0x30+26 | n <= 0x39 = return $ n-0x30+26
| n < 0x41 = norep | n < 0x41 = norep
| n <= 0x5A = n-0x41 | n <= 0x5A = return $ n-0x41
| n < 0x61 = norep | n < 0x61 = norep
| n <= 0x7A = n-0x61 | n <= 0x7A = return $ n-0x61
| otherwise = norep | otherwise = norep
where where
n = ord c n = fromIntegral c
norep = throwDyn (HasNoRepresentation c) norep = throwException (IllegalCharacter c)
punyChar :: Int -> Char punyChar :: ByteSink m => Int -> m Word8
punyChar c punyChar c
| c < 0 = norep | c < 0 = norep
| c < 26 = chr $ 0x61+c | c < 26 = return $ fromIntegral $ 0x61+c
| c < 36 = chr $ 0x30+c-26 | c < 36 = return $ fromIntegral $ 0x30+c-26
| otherwise = norep | otherwise = norep
where where
norep = throwDyn OutOfRange norep = throwException (HasNoRepresentation (chr c))
getT :: BootString -> Int -> Int -> Int getT :: BootString -> Int -> Int -> Int
getT bs k bias getT bs k bias
@ -73,40 +77,46 @@ adapt bs delta numpoints firsttime = let
$ iterate (\(d,k) -> (d `div` (base bs - tmin bs),k+(base bs))) (delta2,0) $ iterate (\(d,k) -> (d `div` (base bs - tmin bs),k+(base bs))) (delta2,0)
in rk + (((base bs - tmin bs +1) * rd) `div` (rd + skew bs)) in rk + (((base bs - tmin bs +1) * rd) `div` (rd + skew bs))
decodeValue :: BootString -> Int -> Int -> Int -> Int -> [Int] -> (Int,[Int]) decodeValue :: ByteSource m => BootString -> Int -> Int -> Int -> Int -> [Int] -> m (Int,[Int])
decodeValue bs bias i k w (x:xs) decodeValue bs bias i k w (x:xs)
| x >= base bs = throwDyn OutOfRange | x >= base bs = throwException OutOfRange
| x > (maxBound - i) `div` w = throwDyn OutOfRange | x > (maxBound - i) `div` w = throwException OutOfRange
| x < t = (ni,xs) | x < t = return (ni,xs)
| w > maxBound `div` (base bs - t) = throwDyn OutOfRange | w > maxBound `div` (base bs - t) = throwException OutOfRange
| null xs = throwException OutOfRange
| otherwise = decodeValue bs bias ni (k+base bs) (w*(base bs - t)) xs | otherwise = decodeValue bs bias ni (k+base bs) (w*(base bs - t)) xs
where where
ni = i + x*w ni = i + x*w
t = getT bs k bias t = getT bs k bias
decodeValues :: BootString -> Int -> [Int] -> [(Char,Int)] decodeValues :: ByteSource m => BootString -> Int -> [Int] -> m [(Char,Int)]
decodeValues bs len xs = decodeValues' bs (init_n bs) 0 (init_bias bs) len xs decodeValues bs len xs = decodeValues' bs (init_n bs) 0 (init_bias bs) len xs
decodeValues' :: BootString -> Int -> Int -> Int -> Int -> [Int] -> [(Char,Int)] decodeValues' :: ByteSource m => BootString -> Int -> Int -> Int -> Int -> [Int] -> m [(Char,Int)]
decodeValues' bs n i bias len [] = [] decodeValues' bs n i bias len [] = return []
decodeValues' bs n i bias len xs decodeValues' bs n i bias len xs = do
| dn > maxBound - n = throwDyn OutOfRange (ni,rst) <- decodeValue bs bias i (base bs) 1 xs
| otherwise = (chr $ nn,nni):decodeValues' bs nn (nni+1) let (dn,nni) = ni `divMod` (len+1)
(adapt bs (ni-i) (len+1) (i==0)) (len+1) rst let nn = n+dn
where if dn > maxBound - n
(ni,rst) = decodeValue bs bias i (base bs) 1 xs then throwException OutOfRange
(dn,nni) = ni `divMod` (len+1) else (do
nn = n + dn rest <- decodeValues' bs nn (nni+1) (adapt bs (ni-i) (len+1) (i==0)) (len+1) rst
return $ (chr $ nn,nni):rest
insertDeltas :: [(Char,Int)] -> String -> String )
insertDeltas :: [(a,Int)] -> [a] -> [a]
insertDeltas [] str = str insertDeltas [] str = str
insertDeltas ((c,p):xs) str = let insertDeltas ((c,p):xs) str = let
(l,r) = splitAt p str (l,r) = splitAt p str
in insertDeltas xs (l++[c]++r) in insertDeltas xs (l++[c]++r)
punyDecode :: String -> String -> String punyDecode :: ByteSource m => [Word8] -> [Word8] -> m String
punyDecode base ext = insertDeltas (decodeValues punycode (length base) (map punyValue ext)) base punyDecode base ext = do
pvals <- mapM punyValue ext
vals <- decodeValues punycode (length base) pvals
return $ insertDeltas vals (map (chr.fromIntegral) base)
encodeValue :: BootString -> Int -> Int -> Int -> Int -> [Int] encodeValue :: BootString -> Int -> Int -> Int -> Int -> [Int]
encodeValue bs bias delta n c = unfoldr (\(q,k,out) -> let encodeValue bs bias delta n c = unfoldr (\(q,k,out) -> let
t = getT bs k bias t = getT bs k bias
@ -136,15 +146,39 @@ encodeValues bs b l h bias delta n cps
m = minimum (filter (>=n) cps) m = minimum (filter (>=n) cps)
(ndelta,nh,nbias,outp) = encodeValues' bs b h bias (delta + (m - n)*(h + 1)) m cps (ndelta,nh,nbias,outp) = encodeValues' bs b h bias (delta + (m - n)*(h + 1)) m cps
breakLast :: (a -> Bool) -> [a] -> Maybe ([a],[a])
breakLast p xs = do
(bf,af,ind) <- breakLast' 0 Nothing p xs
return (bf,af)
where
breakLast' n r p [] = do
v <- r
return ([],[],v)
breakLast' n r p (x:xs) = let res = if p x
then breakLast' (n+1) (Just n) p xs
else breakLast' (n+1) r p xs
in do
(bf,af,v) <- res
return $ if n<v then (x:bf,af,v) else (bf,x:af,v)
instance Encoding BootString where instance Encoding BootString where
encode bs str = let encodeChar _ c = error "Data.Encoding.BootString.encodeChar: Please use 'encode' for encoding BootStrings"
(base,nbase) = partition (\c -> ord c < init_n bs) str decodeChar _ = error "Data.Encoding.BootString.decodeChar: Please use 'decode' for decoding BootStrings"
b = length base encode bs str = let (base,nbase) = partition (\c -> ord c < init_n bs) str
res = map punyChar $ b = length base
encodeValues bs b (length str) b (init_bias bs) 0 (init_n bs) (map ord str) in do
in pack $ if null base res <- mapM punyChar $ encodeValues bs b (length str) b (init_bias bs) 0 (init_n bs) (map ord str)
then res when (not $ null base) $ do
else base++"-"++res mapM_ (pushWord8.fromIntegral.ord) base
decode bs str = case break (=='-') (unpack str) of pushWord8 (fromIntegral $ ord '-')
(base,'-':nbase) -> punyDecode base nbase mapM_ pushWord8 res
(nbase,"") -> punyDecode "" nbase decode bs = do
wrds <- untilM sourceEmpty fetchWord8
let m = fromIntegral $ ord '-'
case breakLast (==m) wrds of
Just ([],_) -> throwException (IllegalCharacter m)
Just (base,_:nbase) -> case find (\w -> fromIntegral w > init_n bs) base of
Nothing -> punyDecode base nbase
Just ww -> throwException (IllegalCharacter ww)
Nothing -> punyDecode [] wrds

195
Data/Encoding/ByteSink.hs Normal file
View File

@ -0,0 +1,195 @@
{-# LANGUAGE FlexibleInstances,FlexibleContexts,MultiParamTypeClasses #-}
module Data.Encoding.ByteSink where
import Data.Encoding.Exception
import Data.Binary.Put
import Data.Bits
import Data.Char
import Data.Sequence
import Data.Word
import Data.Foldable (toList)
import Control.Throws
import Control.Exception.Extensible
import Control.Monad.State
import Control.Monad.Identity
import Control.Monad.Reader
import Foreign.Ptr (Ptr,plusPtr,minusPtr)
import Foreign.Marshal.Alloc (mallocBytes,reallocBytes,free)
import Foreign.Storable (poke)
import System.IO
import System.IO.Unsafe (unsafePerformIO)
import qualified Data.ByteString as BS
import Data.ByteString.Unsafe (unsafePackCStringFinalizer)
class (Monad m,Throws EncodingException m) => ByteSink m where
pushWord8 :: Word8 -> m ()
pushWord16be :: Word16 -> m ()
pushWord16be w = do
pushWord8 (fromIntegral $ w `shiftR` 8)
pushWord8 (fromIntegral $ w)
pushWord16le :: Word16 -> m ()
pushWord16le w = do
pushWord8 (fromIntegral $ w)
pushWord8 (fromIntegral $ w `shiftR` 8)
pushWord32be :: Word32 -> m ()
pushWord32be w = do
pushWord8 (fromIntegral $ w `shiftR` 24)
pushWord8 (fromIntegral $ w `shiftR` 16)
pushWord8 (fromIntegral $ w `shiftR` 8)
pushWord8 (fromIntegral $ w)
pushWord32le :: Word32 -> m ()
pushWord32le w = do
pushWord8 (fromIntegral $ w)
pushWord8 (fromIntegral $ w `shiftR` 8)
pushWord8 (fromIntegral $ w `shiftR` 16)
pushWord8 (fromIntegral $ w `shiftR` 24)
pushWord64be :: Word64 -> m ()
pushWord64be w = do
pushWord8 (fromIntegral $ w `shiftR` 56)
pushWord8 (fromIntegral $ w `shiftR` 48)
pushWord8 (fromIntegral $ w `shiftR` 40)
pushWord8 (fromIntegral $ w `shiftR` 32)
pushWord8 (fromIntegral $ w `shiftR` 24)
pushWord8 (fromIntegral $ w `shiftR` 16)
pushWord8 (fromIntegral $ w `shiftR` 8)
pushWord8 (fromIntegral $ w)
pushWord64le :: Word64 -> m ()
pushWord64le w = do
pushWord8 (fromIntegral $ w)
pushWord8 (fromIntegral $ w `shiftR` 8)
pushWord8 (fromIntegral $ w `shiftR` 16)
pushWord8 (fromIntegral $ w `shiftR` 24)
pushWord8 (fromIntegral $ w `shiftR` 32)
pushWord8 (fromIntegral $ w `shiftR` 40)
pushWord8 (fromIntegral $ w `shiftR` 48)
pushWord8 (fromIntegral $ w `shiftR` 56)
instance Throws EncodingException PutM where
throwException = throw
instance ByteSink PutM where
pushWord8 = putWord8
pushWord16be = putWord16be
pushWord16le = putWord16le
pushWord32be = putWord32be
pushWord32le = putWord32le
pushWord64be = putWord64be
pushWord64le = putWord64le
newtype PutME a = PutME (Either EncodingException (PutM (),a))
instance Monad PutME where
return x = PutME $ Right (return (),x)
(PutME x) >>= g = PutME $ do
(m,r) <- x
let (PutME ng) = g r
case ng of
Left err -> Left err
Right (m',nr) -> Right (m>>m',nr)
instance Throws EncodingException PutME where
throwException = PutME . Left
instance ByteSink PutME where
pushWord8 w = PutME $ Right (putWord8 w,())
pushWord16be w = PutME $ Right (putWord16be w,())
pushWord16le w = PutME $ Right (putWord16le w,())
pushWord32be w = PutME $ Right (putWord32be w,())
pushWord32le w = PutME $ Right (putWord32le w,())
pushWord64be w = PutME $ Right (putWord64be w,())
pushWord64le w = PutME $ Right (putWord64le w,())
instance Monad (Either EncodingException) where
return x = Right x
Left err >>= g = Left err
Right x >>= g = g x
instance Throws EncodingException (State (Seq Char)) where
throwException = throw
instance ByteSink (State (Seq Char)) where
pushWord8 x = modify (|> (chr $ fromIntegral x))
instance ByteSink (StateT (Seq Char) (Either EncodingException)) where
pushWord8 x = modify (|> (chr $ fromIntegral x))
newtype StrictSink a = StrictS (Ptr Word8 -> Int -> Int -> IO (a,Ptr Word8,Int,Int))
instance Monad StrictSink where
return x = StrictS $ \cstr pos max -> return (x,cstr,pos,max)
(StrictS f) >>= g = StrictS (\cstr pos max -> do
(res,ncstr,npos,nmax) <- f cstr pos max
let StrictS g' = g res
g' ncstr npos nmax
)
instance Throws EncodingException StrictSink where
throwException = throw
instance ByteSink StrictSink where
pushWord8 x = StrictS (\cstr pos max -> do
(ncstr,nmax) <- if pos < max
then return (cstr,max)
else (do
let nmax = max + 32
nptr <- reallocBytes cstr nmax
return (nptr,nmax)
)
poke (ncstr `plusPtr` pos) x
return ((),ncstr,pos+1,nmax)
)
newtype StrictSinkE a = StrictSinkE (StrictSink (Either EncodingException a))
instance Monad StrictSinkE where
return = StrictSinkE . return . Right
(StrictSinkE s) >>= g = StrictSinkE $ do
res <- s
case res of
Left err -> return $ Left err
Right res' -> let StrictSinkE g' = g res'
in g'
instance Throws EncodingException StrictSinkE where
throwException = StrictSinkE . return . Left
instance ByteSink StrictSinkE where
pushWord8 x = StrictSinkE $ pushWord8 x >>= return . Right
createStrictWithLen :: StrictSink a -> Int -> (a,BS.ByteString)
createStrictWithLen (StrictS f) max = unsafePerformIO $ do
ptr <- mallocBytes max
(r,nptr,len,_) <- f ptr 0 max
str <- unsafePackCStringFinalizer nptr len (free nptr)
return (r,str)
createStrict :: StrictSink a -> (a,BS.ByteString)
createStrict sink = createStrictWithLen sink 32
newtype StrictSinkExplicit a = StrictSinkExplicit (StrictSink (Either EncodingException a))
instance Monad StrictSinkExplicit where
return = (StrictSinkExplicit).return.Right
(StrictSinkExplicit sink) >>= f
= StrictSinkExplicit (do
res <- sink
case res of
Left err -> return $ Left err
Right x -> let StrictSinkExplicit sink2 = f x
in sink2)
instance Throws EncodingException StrictSinkExplicit where
throwException = StrictSinkExplicit . return . Left
instance ByteSink StrictSinkExplicit where
pushWord8 x = StrictSinkExplicit $ do
pushWord8 x
return $ Right ()
instance ByteSink (ReaderT Handle IO) where
pushWord8 x = do
h <- ask
liftIO $ do
hPutChar h (chr $ fromIntegral x)

161
Data/Encoding/ByteSource.hs Normal file
View File

@ -0,0 +1,161 @@
{-# LANGUAGE FlexibleInstances,FlexibleContexts,MultiParamTypeClasses #-}
module Data.Encoding.ByteSource where
import Data.Encoding.Exception
import Data.Bits
import Data.Binary.Get
import Data.Char
import Data.Word
import Control.Monad.State
import Control.Monad.Identity
import Control.Monad.Reader
import Control.Exception.Extensible
import Control.Throws
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import System.IO
class (Monad m,Throws DecodingException m) => ByteSource m where
sourceEmpty :: m Bool
fetchWord8 :: m Word8
fetchWord16be :: m Word16
fetchWord16be = do
w1 <- fetchWord8
w2 <- fetchWord8
return $ ((fromIntegral w1) `shiftL` 8)
.|. (fromIntegral w2)
fetchWord16le :: m Word16
fetchWord16le = do
w1 <- fetchWord8
w2 <- fetchWord8
return $ ((fromIntegral w2) `shiftL` 8)
.|. (fromIntegral w1)
fetchWord32be :: m Word32
fetchWord32be = do
w1 <- fetchWord8
w2 <- fetchWord8
w3 <- fetchWord8
w4 <- fetchWord8
return $ ((fromIntegral w1) `shiftL` 24)
.|. ((fromIntegral w2) `shiftL` 16)
.|. ((fromIntegral w3) `shiftL` 8)
.|. (fromIntegral w4)
fetchWord32le :: m Word32
fetchWord32le = do
w1 <- fetchWord8
w2 <- fetchWord8
w3 <- fetchWord8
w4 <- fetchWord8
return $ ((fromIntegral w4) `shiftL` 24)
.|. ((fromIntegral w3) `shiftL` 16)
.|. ((fromIntegral w2) `shiftL` 8)
.|. (fromIntegral w1)
fetchWord64be :: m Word64
fetchWord64be = do
w1 <- fetchWord8
w2 <- fetchWord8
w3 <- fetchWord8
w4 <- fetchWord8
w5 <- fetchWord8
w6 <- fetchWord8
w7 <- fetchWord8
w8 <- fetchWord8
return $ ((fromIntegral w1) `shiftL` 56)
.|. ((fromIntegral w2) `shiftL` 48)
.|. ((fromIntegral w3) `shiftL` 40)
.|. ((fromIntegral w4) `shiftL` 32)
.|. ((fromIntegral w5) `shiftL` 24)
.|. ((fromIntegral w6) `shiftL` 16)
.|. ((fromIntegral w7) `shiftL` 8)
.|. (fromIntegral w8)
fetchWord64le :: m Word64
fetchWord64le = do
w1 <- fetchWord8
w2 <- fetchWord8
w3 <- fetchWord8
w4 <- fetchWord8
w5 <- fetchWord8
w6 <- fetchWord8
w7 <- fetchWord8
w8 <- fetchWord8
return $ ((fromIntegral w8) `shiftL` 56)
.|. ((fromIntegral w7) `shiftL` 48)
.|. ((fromIntegral w6) `shiftL` 40)
.|. ((fromIntegral w5) `shiftL` 32)
.|. ((fromIntegral w4) `shiftL` 24)
.|. ((fromIntegral w3) `shiftL` 16)
.|. ((fromIntegral w2) `shiftL` 8)
.|. (fromIntegral w1)
instance Throws DecodingException Get where
throwException = throw
instance ByteSource Get where
sourceEmpty = isEmpty
fetchWord8 = getWord8
fetchWord16be = getWord16be
fetchWord16le = getWord16le
fetchWord32be = getWord32be
fetchWord32le = getWord32le
fetchWord64be = getWord64be
fetchWord64le = getWord64le
instance Throws DecodingException (State [Char]) where
throwException = throw
instance ByteSource (State [Char]) where
sourceEmpty = gets null
fetchWord8 = do
chs <- get
case chs of
[] -> throw UnexpectedEnd
c:cs -> do
put cs
return (fromIntegral $ ord c)
instance Monad (Either DecodingException) where
return = Right
(Left err) >>= g = Left err
(Right x) >>= g = g x
instance ByteSource (StateT [Char] (Either DecodingException)) where
sourceEmpty = gets null
fetchWord8 = do
chs <- get
case chs of
[] -> throwException UnexpectedEnd --handleDecodingError UnexpectedEnd
c:cs -> do
put cs
return (fromIntegral $ ord c)
instance Throws DecodingException (State BS.ByteString) where
throwException = throw
instance ByteSource (State BS.ByteString) where
sourceEmpty = gets BS.null
fetchWord8 = State (\str -> case BS.uncons str of
Nothing -> throw UnexpectedEnd
Just (c,cs) -> (c,cs))
instance ByteSource (StateT BS.ByteString (Either DecodingException)) where
sourceEmpty = gets BS.null
fetchWord8 = StateT (\str -> case BS.uncons str of
Nothing -> Left UnexpectedEnd
Just ns -> Right ns)
instance ByteSource (StateT LBS.ByteString (Either DecodingException)) where
sourceEmpty = gets LBS.null
fetchWord8 = StateT (\str -> case LBS.uncons str of
Nothing -> Left UnexpectedEnd
Just ns -> Right ns)
instance ByteSource (ReaderT Handle IO) where
sourceEmpty = do
h <- ask
liftIO (hIsEOF h)
fetchWord8 = do
h <- ask
liftIO $ do
ch <- hGetChar h
return (fromIntegral $ ord ch)

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
{- | This module implements Windows Codepage number 1250 which encodes languages that use latin script. module Data.Encoding.CP1250 where
See <http://en.wikipedia.org/wiki/CP1250> for more information.
-}
module Data.Encoding.CP1250
(CP1250(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
data CP1250 = CP1250 deriving (Eq,Show,Typeable) $( makeISOInstance "CP1250" "CP1250.TXT" )
instance Encoding CP1250 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1250.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1250.TXT")
#endif

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
{- | This module implements Windows Codepage number 1251 which encodes languages that use the cyrillic alphabet. module Data.Encoding.CP1251 where
See <http://en.wikipedia.org/wiki/CP1251> for more information.
-}
module Data.Encoding.CP1251
(CP1251(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
data CP1251 = CP1251 deriving (Eq,Show,Typeable) $( makeISOInstance "CP1251" "CP1251.TXT" )
instance Encoding CP1251 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1251.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1251.TXT")
#endif

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
{- | This module implements Windows Codepage number 1252 which is a superset of ISO 8859-1. module Data.Encoding.CP1252 where
See <http://en.wikipedia.org/wiki/CP1252> for more information.
-}
module Data.Encoding.CP1252
(CP1252(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
data CP1252 = CP1252 deriving (Eq,Show,Typeable) $( makeISOInstance "CP1252" "CP1252.TXT" )
instance Encoding CP1252 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1252.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1252.TXT")
#endif

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
{- | This module implements Windows Codepage number 1253 which encodes modern greek. module Data.Encoding.CP1253 where
See <http://en.wikipedia.org/wiki/CP1253> for more information.
-}
module Data.Encoding.CP1253
(CP1253(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
data CP1253 = CP1253 deriving (Eq,Show,Typeable) $( makeISOInstance "CP1253" "CP1253.TXT" )
instance Encoding CP1253 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1253.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1253.TXT")
#endif

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
{- | This module implements Windows Codepage number 1254 which encodes the turkish language. module Data.Encoding.CP1254 where
See <http://en.wikipedia.org/wiki/CP1254> for more information.
-}
module Data.Encoding.CP1254
(CP1254(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
data CP1254 = CP1254 deriving (Eq,Show,Typeable) $( makeISOInstance "CP1254" "CP1254.TXT" )
instance Encoding CP1254 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1254.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1254.TXT")
#endif

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
{- | This module implements Windows Codepage number 1255 which encodes the hebrew language. module Data.Encoding.CP1255 where
See <http://en.wikipedia.org/wiki/CP1255> for more information.
-}
module Data.Encoding.CP1255
(CP1255(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
data CP1255 = CP1255 deriving (Eq,Show,Typeable) $( makeISOInstance "CP1255" "CP1255.TXT" )
instance Encoding CP1255 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1255.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1255.TXT")
#endif

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
{- | This module implements Windows Codepage number 1256 which encodes languages which use the arabic script. module Data.Encoding.CP1256 where
See <http://en.wikipedia.org/wiki/CP1256> for more information.
-}
module Data.Encoding.CP1256
(CP1256(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
data CP1256 = CP1256 deriving (Eq,Show,Typeable) $( makeISOInstance "CP1256" "CP1256.TXT" )
instance Encoding CP1256 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1256.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1256.TXT")
#endif

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
{- | This module implements Windows Codepage number 1257 which encodes the estonian, latvian and lithuanian language. module Data.Encoding.CP1257 where
See <http://en.wikipedia.org/wiki/CP1257> for more information.
-}
module Data.Encoding.CP1257
(CP1257(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
data CP1257 = CP1257 deriving (Eq,Show,Typeable) $( makeISOInstance "CP1257" "CP1257.TXT" )
instance Encoding CP1257 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1257.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1257.TXT")
#endif

View File

@ -1,35 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
{- | This module implements Windows Codepage number 1258 which encodes the vietnamese language. module Data.Encoding.CP1258 where
See <http://en.wikipedia.org/wiki/CP1258> for more information.
-}
module Data.Encoding.CP1258
(CP1258(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.ByteString (all)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup,all)
import Control.Exception (throwDyn)
import Data.Typeable
data CP1258 = CP1258 deriving (Eq,Show,Typeable) $( makeISOInstance "CP1258" "CP1258.TXT" )
instance Encoding CP1258 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "CP1258.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "CP1258.TXT")
#endif

View File

@ -0,0 +1,30 @@
{-# LANGUAGE DeriveDataTypeable #-}
module Data.Encoding.Exception where
import Control.Exception.Extensible
import Data.Word
import Data.Typeable
import Control.Monad.Identity
-- | This exception type is thrown whenever something went wrong during the
-- encoding-process.
data EncodingException
= HasNoRepresentation Char -- ^ Thrown if a specific character
-- is not representable in an encoding.
deriving (Eq,Show,Typeable)
instance Exception EncodingException
-- | This exception type is thrown whenever something went wrong during the
-- decoding-process.
data DecodingException
= IllegalCharacter Word8 -- ^ The sequence contained an illegal
-- byte that couldn't be decoded.
| UnexpectedEnd -- ^ more bytes were needed to allow a
-- successfull decoding.
| OutOfRange -- ^ the decoded value was out of the unicode range
| IllegalRepresentation [Word8] -- ^ The character sequence encodes a
-- character, but is illegal.
deriving (Eq,Show,Typeable)
instance Exception DecodingException

View File

@ -7,11 +7,14 @@ module Data.Encoding.GB18030
(GB18030(..)) (GB18030(..))
where where
import Control.Exception import Control.Throws
import Data.Char (chr,ord) import Data.Char (chr,ord)
import Data.Word import Data.Word
import Data.Bits import Data.Bits
import Data.Encoding.Base import Data.Encoding.Base
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Encoding.Exception
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import qualified Data.ByteString as BS import qualified Data.ByteString as BS
import Data.Typeable import Data.Typeable
@ -27,111 +30,103 @@ import Data.Encoding.GB18030Data
data GB18030 = GB18030 deriving (Eq,Show,Typeable) data GB18030 = GB18030 deriving (Eq,Show,Typeable)
instance Encoding GB18030 where instance Encoding GB18030 where
encode _ = encodeMultibyte encodeGB decodeChar _ = do
encodeLazy _ = encodeMultibyteLazy encodeGB w1 <- fetchWord8
decode _ = decodeMultibyte decodeGB case () of
decodeLazy _ = decodeMultibyteLazy decodeGB _
encodable _ ch = ch <= '\x10FFFF' | w1 <= 0x80 -> return (chr $ fromIntegral w1) -- it's ascii
decodable _ = checkValidity | w1 <= 0xFE -> do
w2 <- fetchWord8
data DecodingState case () of
= Valid _
| Invalid | w2 < 0x30 -> throwException (IllegalCharacter w2)
| Second | w2 <= 0x39 -> do
| Third w3 <- fetchWord8
| Fourth case () of
deriving Eq _
| w3 < 0x81 -> throwException (IllegalCharacter w3)
checkValidity :: ByteString -> Bool | w3 <= 0xFE -> do
checkValidity bs = BS.foldl' (\st w -> case st of w4 <- fetchWord8
Invalid -> Invalid case () of
Valid | w<=0x80 -> Valid _
| w<=0xFE -> Second | w4 < 0x30 -> throwException (IllegalCharacter w4)
| otherwise -> Invalid | w4 <= 0x39 -> decodeGBFour $ linear w1 w2 w3 w4
Second | w< 0x30 -> Invalid | otherwise -> throwException (IllegalCharacter w4)
| w<=0x39 -> Third | otherwise -> throwException (IllegalCharacter w3)
| w<=0x7E -> Valid | w2 <= 0x7E -> return $ decodeGBTwo $ linear2 w1 w2
| w==0x7F -> Invalid | w2 == 0x7F -> throwException (IllegalCharacter w2)
| w<=0xFE -> Valid | w2 <= 0xFE -> return $ decodeGBTwo $ linear2 w1 w2
| otherwise -> Invalid | otherwise -> throwException (IllegalCharacter w2)
Third | w< 0x81 -> Invalid | otherwise -> throwException (IllegalCharacter w1)
| w<=0xFE -> Fourth {- How this works: The nested if-structures form an binary tree over the
| otherwise -> Invalid - encoding range.
Fourth | w< 0x30 -> Invalid -}
| w<=0x39 -> Valid encodeChar _ ch = if ch<='\x4946' -- 1
| otherwise -> Invalid then (if ch<='\x4055' -- 2
) Valid bs == Valid then (if ch<='\x2E80' -- 3
then (if ch<='\x200F' -- 4
{- How this works: The nested if-structures form an binary tree over the then (if ch<'\x0452'
- encoding range. then arr 0x0000 arr1
-} else range range1)
encodeGB :: Char -> (Word8,EncodeState) else (if ch<'\x2643'
encodeGB ch = if ch<='\x4946' -- 1 then arr 0x2010 arr2
then (if ch<='\x4055' -- 2
then (if ch<='\x2E80' -- 3
then (if ch<='\x200F' -- 4
then (if ch<'\x0452'
then arr 0x0000 arr1
else range range1)
else (if ch<'\x2643'
then arr 0x2010 arr2
else range range2)) else range range2))
else (if ch<='\x3917' -- 4 else (if ch<='\x3917' -- 4
then (if ch<'\x361B' then (if ch<'\x361B'
then arr 0x2E81 arr3 then arr 0x2E81 arr3
else range range3) else range range3)
else (if ch<'\x3CE1' else (if ch<'\x3CE1'
then arr 0x3918 arr4 then arr 0x3918 arr4
else range range4))) else range range4)))
else (if ch<='\x464B' -- 3 else (if ch<='\x464B' -- 3
then (if ch<='\x4336' -- 4 then (if ch<='\x4336' -- 4
then (if ch<'\x4160' then (if ch<'\x4160'
then arr 0x4056 arr5 then arr 0x4056 arr5
else range range5) else range range5)
else (if ch<'\x44D7' else (if ch<'\x44D7'
then arr 0x4337 arr6 then arr 0x4337 arr6
else range range6)) else range range6))
else (if ch<'\x478E' else (if ch<'\x478E'
then arr 0x464C arr7 then arr 0x464C arr7
else range range7))) else range range7)))
else (if ch<='\xF92B' -- 2 else (if ch<='\xF92B' -- 2
then (if ch<='\xD7FF' -- 3 then (if ch<='\xD7FF' -- 3
then (if ch<='\x4C76' -- 4 then (if ch<='\x4C76' -- 4
then (if ch<'\x49B8' then (if ch<'\x49B8'
then arr 0x4947 arr8 then arr 0x4947 arr8
else range range8) else range range8)
else (if ch<'\x9FA6' else (if ch<'\x9FA6'
then arr 0x4C77 arr9 then arr 0x4C77 arr9
else range range9)) else range range9))
else (if ch<'\xE865' else (if ch<'\xE865'
then arr 0xD800 arr10 then arr 0xD800 arr10
else range range10)) else range range10))
else (if ch<='\xFFFF' -- 3 else (if ch<='\xFFFF' -- 3
then (if ch<='\xFE2F' -- 4 then (if ch<='\xFE2F' -- 4
then (if ch<'\xFA2A' then (if ch<'\xFA2A'
then arr 0xF92C arr11 then arr 0xF92C arr11
else range range11) else range range11)
else (if ch<'\xFFE6' else (if ch<'\xFFE6'
then arr 0xFE30 arr12 then arr 0xFE30 arr12
else range range12)) else range range12))
else (if ch<='\x10FFFF' -- 4 else (if ch<='\x10FFFF' -- 4
then range range13 then range range13
else throwDyn (HasNoRepresentation ch)))) else throwException (HasNoRepresentation ch))))
where where
range r = let range r = let (w1,w2,w3,w4) = delinear (ord ch + r)
(w1,w2,w3,w4) = delinear (ord ch + r) in pushWord8 w1 >> pushWord8 w2 >> pushWord8 w3 >> pushWord8 w4
in (w1,Put3 w2 w3 w4) arr off a = let ind = (ord ch - off)*5
arr off a = let w1 = unsafeIndex a (ind+1)
ind = (ord ch - off)*5 w2 = unsafeIndex a (ind+2)
w1 = unsafeIndex a (ind+1) w3 = unsafeIndex a (ind+3)
w2 = unsafeIndex a (ind+2) w4 = unsafeIndex a (ind+4)
w3 = unsafeIndex a (ind+3) in do
w4 = unsafeIndex a (ind+4) pushWord8 w1
in (w1,case unsafeIndex a ind of case unsafeIndex a ind of
1 -> Done 1 -> return ()
2 -> Put1 w2 2 -> pushWord8 w2
3 -> Put2 w2 w3 3 -> pushWord8 w2 >> pushWord8 w3
4 -> Put3 w2 w3 w4) 4 -> pushWord8 w2 >> pushWord8 w3 >> pushWord8 w4
linear :: Word8 -> Word8 -> Word8 -> Word8 -> Int linear :: Word8 -> Word8 -> Word8 -> Word8 -> Int
linear w1 w2 w3 w4 linear w1 w2 w3 w4
@ -157,32 +152,6 @@ delinear n = let
,fromIntegral w3+0x81 ,fromIntegral w3+0x81
,fromIntegral w4+0x30) ,fromIntegral w4+0x30)
decodeGB :: [Word8] -> (Char,[Word8])
decodeGB (w1:rst)
| w1 <=0x80 = (chr $ fromIntegral w1,rst) -- it's ascii
| w1 <=0xFE = case rst of
w2:rst2
| w2 < 0x30 -> throwDyn (IllegalCharacter w2)
| w2 <=0x39 -> case rst2 of
w3:rst3
| w3 < 0x81 -> throwDyn (IllegalCharacter w3)
| w3 <=0xFE -> case rst3 of
w4:rst4
| w4 < 0x30 -> throwDyn (IllegalCharacter w4)
| w4 <=0x39 -> let
v = linear w1 w2 w3 w4
in (decodeGBFour v,rst4)
| otherwise -> throwDyn (IllegalCharacter w4)
[] -> throwDyn UnexpectedEnd
| otherwise -> throwDyn (IllegalCharacter w3)
[] -> throwDyn UnexpectedEnd
| w2 <=0x7E -> (decodeGBTwo (linear2 w1 w2),rst2)
| w2 ==0x7F -> throwDyn (IllegalCharacter w2)
| w2 <=0xFE -> (decodeGBTwo (linear2 w1 w2),rst2)
| otherwise -> throwDyn (IllegalCharacter w2)
[] -> throwDyn UnexpectedEnd
| otherwise = throwDyn (IllegalCharacter w1)
decodeGBTwo :: Int -> Char decodeGBTwo :: Int -> Char
decodeGBTwo n = let decodeGBTwo n = let
rn = n*2 rn = n*2
@ -190,7 +159,7 @@ decodeGBTwo n = let
w2 = unsafeIndex rrarr (rn+1) w2 = unsafeIndex rrarr (rn+1)
in chr $ ((fromIntegral w1) `shiftL` 8) .|. (fromIntegral w2) in chr $ ((fromIntegral w1) `shiftL` 8) .|. (fromIntegral w2)
decodeGBFour :: Int -> Char decodeGBFour :: ByteSource m => Int -> m Char
decodeGBFour v = if v<=17858 -- 1 decodeGBFour v = if v<=17858 -- 1
then (if v<=15582 -- 2 then (if v<=15582 -- 2
then (if v<=11328 -- 3 then (if v<=11328 -- 3
@ -241,15 +210,15 @@ decodeGBFour v = if v<=17858 -- 1
else range range12)) else range range12))
else (if v<=1237575 && v>=189000 else (if v<=1237575 && v>=189000
then range range13 then range range13
else throwDyn OutOfRange))) else throwException OutOfRange)))
where where
arr off a = let arr off a = let
v' = (v-off)*2 v' = (v-off)*2
w1 = unsafeIndex a v' w1 = unsafeIndex a v'
w2 = unsafeIndex a (v'+1) w2 = unsafeIndex a (v'+1)
in chr $ ((fromIntegral w1) `shiftL` 8) in return $ chr $ ((fromIntegral w1) `shiftL` 8)
.|. (fromIntegral w2) .|. (fromIntegral w2)
range r = chr (v-r) range r = return $ chr (v-r)
range1,range2,range3,range4,range5,range6,range7,range8,range9,range10,range11,range12,range13 :: Int range1,range2,range3,range4,range5,range6,range7,range8,range9,range10,range11,range12,range13 :: Int
range1 = -286 range1 = -286

View File

@ -1,4 +1,4 @@
{-# LANGUAGE CPP, MagicHash #-} {-# LANGUAGE CPP,MagicHash #-}
module Data.Encoding.GB18030Data where module Data.Encoding.GB18030Data where
import Data.ByteString(ByteString) import Data.ByteString(ByteString)

View File

@ -1,34 +1,75 @@
{-# LANGUAGE CPP,TemplateHaskell #-} {-# LANGUAGE TemplateHaskell #-}
{- This module is used to create arrays from lists in template haskell -}
module Data.Encoding.Helper.Template where module Data.Encoding.Helper.Template where
import Data.Encoding.Base
import Data.Char import Data.Char
import Data.Word import Data.Maybe (mapMaybe)
import Data.Array.IArray (Array,array) import Data.Map as Map (fromList,lookup)
import Data.Array
import Language.Haskell.TH import Language.Haskell.TH
createCharArray :: [(Integer,Char)] -> Integer -> Integer -> Q Exp makeISOInstance :: String -> FilePath -> Q [Dec]
#ifndef __HADDOCK__ makeISOInstance name file = do
createCharArray lst = createArray (map (\(x,y) -> (x,LitE $ CharL y)) lst) let rname = mkName name
#endif trans <- runIO (readTranslation file)
mp <- encodingMap (validTranslations trans)
arr <- decodingArray (fillTranslations trans)
return [ DataD [] rname [] [NormalC rname []] [''Show]
, InstanceD [] (AppT (ConT ''Encoding) (ConT rname))
[FunD 'encodeChar
[Clause [WildP] (NormalB $ AppE (VarE 'encodeWithMap) (VarE $ mkName "mp"))
[ValD (VarP $ mkName "mp") (NormalB mp) []]
]
,FunD 'decodeChar
[Clause [WildP] (NormalB $ AppE (VarE 'decodeWithArray) (VarE $ mkName "arr"))
[ValD (VarP $ mkName "arr") (NormalB arr) []]
]
]
]
createCharArray :: [(Integer,Maybe Char)] -> Integer -> Integer -> Q Exp
createCharArray lst = createArray (map (\(x,y) -> (x,case y of
Nothing -> ConE 'Nothing
Just c -> AppE (ConE 'Just) (LitE $ CharL c))
) lst)
createArray :: [(Integer,Exp)] -> Integer -> Integer -> Q Exp createArray :: [(Integer,Exp)] -> Integer -> Integer -> Q Exp
#ifndef __HADDOCK__
createArray lst from to = return $ AppE createArray lst from to = return $ AppE
(AppE (AppE
(VarE 'array) (VarE 'array)
(TupE [LitE $ IntegerL from,LitE $ IntegerL to])) (TupE [LitE $ IntegerL from,LitE $ IntegerL to]))
(ListE [ TupE [LitE $ IntegerL x,y] (ListE [ TupE [LitE $ IntegerL x,y]
| (x,y) <- lst ]) | (x,y) <- lst ])
#endif
xmlArray :: [(Char,[Word8])] -> Integer -> Integer -> Q Exp decodingArray :: [(Integer,Maybe Char)] -> Q Exp
#ifndef __HADDOCK__ decodingArray trans = createCharArray trans 0 255
xmlArray lst l u = do
let trans = map (\(ch,bin) -> encodingMap :: [(Integer,Char)] -> Q Exp
(toInteger $ ord ch encodingMap trans = return $ AppE
,TupE [LitE $ IntegerL (toInteger $ length bin),TupE $ map (\b -> LitE $ IntegerL (fromIntegral b)) bin ++ replicate (4-length bin) (LitE $ IntegerL 0)] (VarE 'fromList)
)) (filter (\(c,_) -> ord c <= fromInteger u && ord c >= fromInteger l) lst) (ListE [ TupE [LitE $ CharL to,LitE $ IntegerL from]
createArray trans l u | (from,to) <- trans])
#endif
readTranslation :: FilePath -> IO [(Integer,Maybe Char)]
readTranslation file = do
cont <- readFile file
return $ mapMaybe (\ln -> case ln of
[] -> Nothing
('#':xs) -> Nothing
_ -> case words ln of
(src:"#UNDEFINED":_) -> Just (read src,Nothing) -- XXX: Find a better way to handle this
(src:trg:_) -> Just (read src,Just $ chr $ read trg)
_ -> Nothing
) (lines cont)
fillTranslations :: [(Integer,Maybe Char)] -> [(Integer,Maybe Char)]
fillTranslations = fillTranslations' (-1)
where
fillTranslations' n ((n',c):cs) = (map (\i -> (i,Nothing)) [n+1..n'-1])++((n',c):fillTranslations' n' cs)
fillTranslations' n [] = map (\i -> (i,Nothing)) [n+1..255]
validTranslations :: [(Integer,Maybe Char)] -> [(Integer,Char)]
validTranslations = mapMaybe (\(n,mc) -> case mc of
Nothing -> Nothing
Just c -> Just (n,c))

View File

@ -1,26 +1,18 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} module Data.Encoding.ISO88591 where
{- | Implements ISO\/IEC 8859-1 alias latin-1 encoding. See
<http://en.wikipedia.org/wiki/ISO/IEC_8859-1> for further information.
-}
module Data.Encoding.ISO88591
(ISO88591(..)
) where
import Control.Throws
import Data.Encoding.Base import Data.Encoding.Base
import Data.Char(ord,chr) import Data.Encoding.Exception
import Data.Word import Data.Encoding.ByteSource
import Control.Exception import Data.Encoding.ByteSink
import Data.Typeable import Data.Char (ord,chr)
data ISO88591 = ISO88591 deriving (Eq,Show,Typeable) data ISO88591 = ISO88591 deriving (Show)
enc :: Char -> Word8
enc c = if ord c < 256
then fromIntegral $ ord c
else throwDyn (HasNoRepresentation c)
instance Encoding ISO88591 where instance Encoding ISO88591 where
encode _ = encodeSinglebyte enc encodeChar _ c
encodeLazy _ = encodeSinglebyteLazy enc | c > '\255' = throwException (HasNoRepresentation c)
encodable _ c = ord c < 256 | otherwise = pushWord8 (fromIntegral $ ord c)
decode _ = decodeSinglebyte (chr.fromIntegral) decodeChar _ = do
w <- fetchWord8
return (chr $ fromIntegral w)

View File

@ -1,34 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO885910 module Data.Encoding.ISO885910 where
(ISO885910(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
data ISO885910 = ISO885910 deriving (Eq,Show,Typeable) $( makeISOInstance "ISO885910" "8859-10.TXT" )
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO885910 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-10.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-10.TXT")
#endif

View File

@ -1,34 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO885911 module Data.Encoding.ISO885911 where
(ISO885911(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
data ISO885911 = ISO885911 deriving (Eq,Show,Typeable) $( makeISOInstance "ISO885911" "8859-11.TXT" )
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO885911 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-11.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-11.TXT")
#endif

View File

@ -1,34 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO885913 module Data.Encoding.ISO885913 where
(ISO885913(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
data ISO885913 = ISO885913 deriving (Eq,Show,Typeable) $( makeISOInstance "ISO885913" "8859-13.TXT" )
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO885913 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-13.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-13.TXT")
#endif

View File

@ -1,34 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO885914 module Data.Encoding.ISO885914 where
(ISO885914(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
data ISO885914 = ISO885914 deriving (Eq,Show,Typeable) $( makeISOInstance "ISO885914" "8859-14.TXT" )
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO885914 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-14.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-14.TXT")
#endif

View File

@ -1,34 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO885915 module Data.Encoding.ISO885915 where
(ISO885915(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
data ISO885915 = ISO885915 deriving (Eq,Show,Typeable) $( makeISOInstance "ISO885915" "8859-15.TXT" )
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO885915 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-15.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-15.TXT")
#endif

View File

@ -1,34 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO885916 module Data.Encoding.ISO885916 where
(ISO885916(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
data ISO885916 = ISO885916 deriving (Eq,Show,Typeable) $( makeISOInstance "ISO885916" "8859-16.TXT" )
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO885916 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-16.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-16.TXT")
#endif

View File

@ -1,38 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
{- | Implements ISO\/IEC 8859-2 alias latin-2 encoding. See module Data.Encoding.ISO88592 where
<http://en.wikipedia.org/wiki/ISO/IEC_8859-2> for further informations.
-}
module Data.Encoding.ISO88592
(ISO88592(..)) where
import Data.Array import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Map hiding ((!))
import Data.Word
import Data.Encoding.Base
import Data.ByteString hiding (length,map)
import Prelude hiding (lookup,all)
import Control.Exception
import Data.Typeable
data ISO88592 = ISO88592 deriving (Eq,Show,Typeable) $( makeISOInstance "ISO88592" "8859-2.TXT" )
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO88592 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (\w -> decodeArr!w)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-2.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-2.TXT")
#endif

View File

@ -1,36 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
{- | Implements ISO 8859-3 encoding, alias latin-3, alias south european module Data.Encoding.ISO88593 where
-}
module Data.Encoding.ISO88593
(ISO88593(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
data ISO88593 = ISO88593 deriving (Eq,Show,Typeable) $( makeISOInstance "ISO88593" "8859-3.TXT" )
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO88593 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-3.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-3.TXT")
#endif

View File

@ -1,34 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO88594 module Data.Encoding.ISO88594 where
(ISO88594(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
data ISO88594 = ISO88594 deriving (Eq,Show,Typeable) $( makeISOInstance "ISO88594" "8859-4.TXT" )
enc :: Char -> Word8
enc c = case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c)
instance Encoding ISO88594 where
encode _ = encodeSinglebyte enc
encodeLazy _ = encodeSinglebyteLazy enc
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-4.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-4.TXT")
#endif

View File

@ -1,30 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO88595 module Data.Encoding.ISO88595 where
(ISO88595(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
data ISO88595 = ISO88595 deriving (Eq,Show,Typeable) $( makeISOInstance "ISO88595" "8859-5.TXT" )
instance Encoding ISO88595 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-5.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-5.TXT")
#endif

View File

@ -1,30 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO88596 module Data.Encoding.ISO88596 where
(ISO88596(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
data ISO88596 = ISO88596 deriving (Eq,Show,Typeable) $( makeISOInstance "ISO88596" "8859-6.TXT" )
instance Encoding ISO88596 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-6.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-6.TXT")
#endif

View File

@ -1,30 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO88597 module Data.Encoding.ISO88597 where
(ISO88597(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
data ISO88597 = ISO88597 deriving (Eq,Show,Typeable) $( makeISOInstance "ISO88597" "8859-7.TXT" )
instance Encoding ISO88597 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-7.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-7.TXT")
#endif

View File

@ -1,30 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO88598 module Data.Encoding.ISO88598 where
(ISO88598(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
data ISO88598 = ISO88598 deriving (Eq,Show,Typeable) $( makeISOInstance "ISO88598" "8859-8.TXT" )
instance Encoding ISO88598 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-8.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-8.TXT")
#endif

View File

@ -1,30 +1,6 @@
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-} {-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.ISO88599 module Data.Encoding.ISO88599 where
(ISO88599(..)) where
import Data.Array ((!),Array) import Data.Encoding.Helper.Template (makeISOInstance)
import Data.Word (Word8)
import Data.Map (Map,lookup,member)
import Data.Encoding.Base
import Prelude hiding (lookup)
import Control.Exception (throwDyn)
import Data.Typeable
data ISO88599 = ISO88599 deriving (Eq,Show,Typeable) $( makeISOInstance "ISO88599" "8859-9.TXT" )
instance Encoding ISO88599 where
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
Just v -> v
Nothing -> throwDyn (HasNoRepresentation c))
encodable _ c = member c encodeMap
decode _ = decodeSinglebyte (decodeArr!)
decodeArr :: Array Word8 Char
#ifndef __HADDOCK__
decodeArr = $(decodingArray "8859-9.TXT")
#endif
encodeMap :: Map Char Word8
#ifndef __HADDOCK__
encodeMap = $(encodingMap "8859-9.TXT")
#endif

View File

@ -5,16 +5,18 @@
module Data.Encoding.KOI8R module Data.Encoding.KOI8R
(KOI8R(..)) where (KOI8R(..)) where
import Control.Exception (throwDyn) import Control.Throws
import Data.Array.Unboxed import Data.Array.Unboxed
import Data.Char (ord,chr) import Data.Char (ord,chr)
import qualified Data.ByteString.Lazy as Lazy
import Data.Map hiding (map,(!)) import Data.Map hiding (map,(!))
import Data.Word import Data.Word
import Prelude hiding (lookup) import Prelude hiding (lookup)
import Data.Typeable import Data.Typeable
import Data.Encoding.Base import Data.Encoding.Base
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Encoding.Exception
data KOI8R = KOI8R deriving (Eq,Show,Typeable) data KOI8R = KOI8R deriving (Eq,Show,Typeable)
@ -44,22 +46,14 @@ koi8rList =
,'\x042c','\x042b','\x0417','\x0428','\x042d','\x0429','\x0427','\x042a' ,'\x042c','\x042b','\x0417','\x0428','\x042d','\x0429','\x0427','\x042a'
] ]
koi8rDecode :: Word8 -> Char
koi8rDecode ch
| ch < 128 = chr $ fromIntegral ch
| otherwise = koi8rArr!ch
koi8rEncode :: Char -> Word8
koi8rEncode ch
| ch < '\128' = fromIntegral $ ord ch
| otherwise = case lookup ch koi8rMap of
Just w -> w
Nothing -> throwDyn (HasNoRepresentation ch)
instance Encoding KOI8R where instance Encoding KOI8R where
encode _ = encodeSinglebyte koi8rEncode decodeChar _ = do
encodeLazy _ = encodeSinglebyteLazy koi8rEncode w <- fetchWord8
encodable _ c = (c < '\128') || (member c koi8rMap) if w < 128
decode _ = decodeSinglebyte koi8rDecode then return $ chr $ fromIntegral w
decodeLazy _ str = concatMap (decodeSinglebyte koi8rDecode) (Lazy.toChunks str) else return $ koi8rArr!w
decodable _ = const True encodeChar _ ch
| ch < '\128' = pushWord8 $ fromIntegral $ ord ch
| otherwise = case lookup ch koi8rMap of
Just w -> pushWord8 w
Nothing -> throwException (HasNoRepresentation ch)

View File

@ -5,13 +5,16 @@
module Data.Encoding.KOI8U module Data.Encoding.KOI8U
(KOI8U(..)) where (KOI8U(..)) where
import Control.Exception (throwDyn) import Data.Encoding.Base
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Encoding.Exception
import Control.Throws
import Data.Word import Data.Word
import Data.Array.Unboxed import Data.Array.Unboxed
import Data.Encoding.Base
import Data.Char (chr,ord) import Data.Char (chr,ord)
import Data.Map (Map,fromList,lookup,member) import Data.Map (Map,fromList,lookup,member)
import qualified Data.ByteString.Lazy as Lazy
import Prelude hiding (lookup) import Prelude hiding (lookup)
import Data.Typeable import Data.Typeable
@ -43,22 +46,14 @@ koi8uList =
,'\x042c','\x042b','\x0417','\x0428','\x042d','\x0429','\x0427','\x042a' ,'\x042c','\x042b','\x0417','\x0428','\x042d','\x0429','\x0427','\x042a'
] ]
koi8uDecode :: Word8 -> Char
koi8uDecode ch
| ch < 128 = chr $ fromIntegral ch
| otherwise = koi8uArr!ch
koi8uEncode :: Char -> Word8
koi8uEncode ch
| ch < '\128' = fromIntegral $ ord ch
| otherwise = case lookup ch koi8uMap of
Just w -> w
Nothing -> throwDyn (HasNoRepresentation ch)
instance Encoding KOI8U where instance Encoding KOI8U where
encode _ = encodeSinglebyte koi8uEncode decodeChar _ = do
encodeLazy _ = encodeSinglebyteLazy koi8uEncode w <- fetchWord8
encodable _ c = (c < '\128') || (member c koi8uMap) if w < 128
decode _ = decodeSinglebyte koi8uDecode then return $ chr $ fromIntegral w
decodeLazy _ str = concatMap (decodeSinglebyte koi8uDecode) (Lazy.toChunks str) else return $ koi8uArr!w
decodable _ = const True encodeChar _ ch
| ch < '\128' = pushWord8 $ fromIntegral $ ord ch
| otherwise = case lookup ch koi8uMap of
Just w -> pushWord8 w
Nothing -> throwException (HasNoRepresentation ch)

View File

@ -3,150 +3,79 @@
See <http://en.wikipedia.org/wiki/UTF-16> for more information. See <http://en.wikipedia.org/wiki/UTF-16> for more information.
-} -}
module Data.Encoding.UTF16 module Data.Encoding.UTF16
(UTF16(..) (UTF16(..)
) where ) where
import Data.Encoding.Base import Data.Encoding.Base
import Data.Char(ord,chr) import Data.Encoding.ByteSink
import Data.Encoding.ByteSource
import Data.Encoding.Exception
import Control.Throws
import Data.Bits import Data.Bits
import Data.Int import Data.Char
import Data.Word
import Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import Prelude hiding (length)
import Control.Exception
import Data.Dynamic (toDyn)
import Data.Typeable import Data.Typeable
import Data.Word
data UTF16 data UTF16
= UTF16 -- ^ Decodes big and little endian, encodes big endian. = UTF16 -- ^ Decodes big and little endian, encodes big endian.
| UTF16BE -- ^ Big endian decoding and encoding, fails if the string isn\'t actually big endian. | UTF16BE -- ^ Big endian decoding and encoding, fails if the string isn\'t actually big endian.
| UTF16LE -- ^ Little endian decoding and encoding. | UTF16LE -- ^ Little endian decoding and encoding.
deriving (Eq,Show,Typeable) deriving (Eq,Show,Typeable)
utf16enc :: Bool -> (EncodeState,String) -> Maybe (Word8,(EncodeState,String)) readBOM :: ByteSource m => m (Either Char UTF16)
utf16enc _ (Done,[]) = Nothing readBOM = do
utf16enc True (Done,x:xs) ch <- decodeChar UTF16
| n<=0x0000FFFF = Just case ch of
(fromIntegral $ n `shiftR` 8 '\xFEFF' -> return (Right UTF16BE)
,(Put1 (fromIntegral $ n),xs)) '\xFFFE' -> return (Right UTF16LE)
| n<=0x0010FFFF = Just _ -> return (Left ch)
(fromIntegral $ 0xD8 .|. (n' `shiftR` 18)
,(Put3 (fromIntegral $ (n' `shiftR` 10)) decodeUTF16 :: ByteSource m => (m Word16) -> m Char
(fromIntegral $ decodeUTF16 fetch = do
0xDC .|. ((n' `shiftR` 8) .&. 0x03)) w1 <- fetch
(fromIntegral n'),xs)) if w1 < 0xD800 || w1 > 0xDFFF
| otherwise = throwDyn $ HasNoRepresentation x then return (chr $ fromIntegral w1)
where else (if w1 > 0xDBFF
n = ord x then throwException (IllegalCharacter (fromIntegral (w1 `shiftR` 8)))
n' = n - 0x10000 else (do
utf16enc False (Done,x:xs) w2 <- fetch
| n<=0x0000FFFF = Just if w2 < 0xDC00 || w2 > 0xDFFF
(fromIntegral $ n then throwException (IllegalCharacter (fromIntegral (w2 `shiftR` 8)))
,(Put1 (fromIntegral $ n `shiftR` 8),xs)) else let v = ((fromIntegral (w1 .&. 0x3FF)) `shiftL` 10)
| n<=0x0010FFFF = Just .|. (fromIntegral (w2 .&. 0x3FF))
(fromIntegral n' in return $ chr (v+0x10000)
,(Put3 (fromIntegral $ )
0xDC .|. ((n' `shiftR` 8) .&. 0x03)) )
(fromIntegral $ (n' `shiftR` 10))
(fromIntegral $ 0xD8 .|. (n' `shiftR` 18)),xs))
| otherwise = throwDyn $ HasNoRepresentation x
where
n = ord x
n' = n - 0x10000
utf16enc _ (Put3 w1 w2 w3,xs) = Just (w1,(Put2 w2 w3,xs))
utf16enc _ (Put2 w1 w2,xs) = Just (w1,(Put1 w2,xs))
utf16enc _ (Put1 w1,xs) = Just (w1,(Done,xs))
{-# SPECIALIZE utf16dec :: Bool -> Word8 -> Word8 -> Word8 -> Word8 -> (Char,Int) #-} encodeUTF16 :: ByteSink m => (Word16 -> m ()) -> Char -> m ()
{-# SPECIALIZE utf16dec :: Bool -> Word8 -> Word8 -> Word8 -> Word8 -> (Char,Int64) #-} encodeUTF16 push ch
utf16dec :: Num a => Bool -> Word8 -> Word8 -> Word8 -> Word8 -> (Char,a) | val<=0xDFFF && val>=0xD800 = throwException (HasNoRepresentation ch)
utf16dec be s1 s2 s3 s4 | val<=0x0000FFFF = push $ fromIntegral val
| w1< 0xD8 || w1> 0xDF | val<=0x0010FFFF = let v = val - 0x10000
= (chr $ ((fromIntegral w1) `shiftL` 8) .|. (fromIntegral w2),2) w1 = (fromIntegral (v `shiftR` 10)) .|. 0xD800
| w1> 0xDB = throwDyn $ IllegalCharacter w1 w2 = ((fromIntegral v) .&. 0x3FF) .|. 0xDC00
| w3< 0xDC || w3>0xDF = throwDyn $ IllegalCharacter w3 in push w1 >> push w2
| otherwise = (chr $ (((fromIntegral w1 .&. 0x03) `shiftL` 18) | otherwise = throwException (HasNoRepresentation ch)
.|. ((fromIntegral w2) `shiftL` 10) where
.|. ((fromIntegral w3 .&. 0x03) `shiftL` 8) val = ord ch
.|. (fromIntegral w4)) + 0x10000,4)
where
(w1,w2,w3,w4) = if be then (s1,s2,s3,s4) else (s2,s1,s4,s3)
instance Encoding UTF16 where instance Encoding UTF16 where
encode enc str = unfoldr (utf16enc (enc/=UTF16LE)) (case enc of encodeChar UTF16LE = encodeUTF16 pushWord16le
UTF16 -> Put2 0xFE 0xFF encodeChar _ = encodeUTF16 pushWord16be
_ -> Done,str) decodeChar UTF16LE = decodeUTF16 fetchWord16le
encodeLazy enc str = LBS.unfoldr (utf16enc (enc/=UTF16LE)) (case enc of decodeChar _ = decodeUTF16 fetchWord16be
UTF16 -> Put2 0xFE 0xFF
_ -> Done,str)
encodable _ c = ord c <= 0x0010FFFF
decode bo str = case findByteOrder str of
Nothing -> decode' (bo/=UTF16LE) 0
Just big -> decode' big 2
where
l = BS.length str
decode' be i = if i>=l
then []
else c:decode' be (i+took)
where
(c,took) = mapException (\ex -> case ex of
ErrorCall _ -> DynException (toDyn UnexpectedEnd)
_ -> ex) (utf16dec be s1 s2 s3 s4)
s1 = index str i
s2 = index str (i+1)
s3 = index str (i+2)
s4 = index str (i+3)
decodeLazy bo str = case findByteOrderLazy str of
Nothing -> decode' (bo/=UTF16LE) 0
Just big -> decode' big 2
where
l = LBS.length str
decode' be i = if i>=l
then []
else c:decode' be (i+took)
where
(c,took) = mapException (\ex -> case ex of
ErrorCall _ -> DynException (toDyn UnexpectedEnd)
_ -> ex) (utf16dec be s1 s2 s3 s4)
s1 = LBS.index str i
s2 = LBS.index str (i+1)
s3 = LBS.index str (i+2)
s4 = LBS.index str (i+3)
decodable bo str = case findByteOrder str of
Nothing -> check' (bo/=UTF16LE) (length str) 0
Just big -> check' big (length str) 2
where
check' be m i
| m == i = True
| m == i+1 = False
| w1< 0xD8 || w1> 0xDF = check' be m (i+2)
| w1> 0xDB = False
| m <= i+3 = False
| w3< 0xDC || w3>0xDF = False
| otherwise = check' be m (i+4)
where
(w1,w3) = if be then (s1,s3) else (s2,s4)
s1 = index str i
s2 = index str (i+1)
s3 = index str (i+2)
s4 = index str (i+3)
findByteOrder :: ByteString -> Maybe Bool encode UTF16 str = do
findByteOrder str encodeChar UTF16 '\xFEFF'
| length str < 2 = Nothing mapM_ (encodeChar UTF16) str
| w1 == 0xFE && w2 == 0xFF = Just True encode enc str = mapM_ (encodeChar enc) str
| w1 == 0xFF && w2 == 0xFE = Just False
| otherwise = Nothing
where
w1 = index str 0
w2 = index str 1
decode UTF16 = do
findByteOrderLazy :: LBS.ByteString -> Maybe Bool res <- readBOM
findByteOrderLazy str = case LBS.unpack (LBS.take 2 str) of case res of
[w1,w2] Left c -> do
| w1 == 0xFE && w2 == 0xFF -> Just True cs <- untilM sourceEmpty (decodeChar UTF16BE)
| w1 == 0xFF && w2 == 0xFE -> Just False return (c:cs)
| otherwise -> Nothing Right bom -> decode bom
_ -> Nothing decode enc = untilM sourceEmpty (decodeChar enc)

View File

@ -3,17 +3,17 @@
See <http://en.wikipedia.org/wiki/UTF-32> for more information. See <http://en.wikipedia.org/wiki/UTF-32> for more information.
-} -}
module Data.Encoding.UTF32 module Data.Encoding.UTF32
(UTF32(..)) (UTF32(..))
where where
import Data.Bits
import Data.Char (ord,chr)
import Data.Encoding.Base import Data.Encoding.Base
import Data.Word import Data.Encoding.ByteSink
import Control.Exception (throwDyn) import Data.Encoding.ByteSource
import Data.Encoding.Exception
import Data.Char
import Data.Typeable import Data.Typeable
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
data UTF32 data UTF32
= UTF32 -- ^ Detects big or little endian through the use of the BOM (Byte Order Mask) character. Defaults to big endian if not present. = UTF32 -- ^ Detects big or little endian through the use of the BOM (Byte Order Mask) character. Defaults to big endian if not present.
@ -21,70 +21,26 @@ data UTF32
| UTF32LE -- ^ Encodes and decodes using the little endian encoding. | UTF32LE -- ^ Encodes and decodes using the little endian encoding.
deriving (Eq,Show,Typeable) deriving (Eq,Show,Typeable)
bom :: Char
bom = '\xFEFF'
instance Encoding UTF32 where instance Encoding UTF32 where
encode UTF32 str = encodeMultibyte encodeUTF32be (bom:str) encodeChar UTF32LE ch = pushWord32le (fromIntegral $ ord ch)
encode UTF32LE str = encodeMultibyte encodeUTF32le str encodeChar _ ch = pushWord32be (fromIntegral $ ord ch)
encode UTF32BE str = encodeMultibyte encodeUTF32be str decodeChar UTF32LE = do
encodeLazy UTF32 str = encodeMultibyteLazy encodeUTF32be (bom:str) wrd <- fetchWord32le
encodeLazy UTF32LE str = encodeMultibyteLazy encodeUTF32le str return $ chr $ fromIntegral wrd
encodeLazy UTF32BE str = encodeMultibyteLazy encodeUTF32be str decodeChar _ = do
encodable _ c = ord c < 0x0010FFFF wrd <- fetchWord32be
decode UTF32 str = let return $ chr $ fromIntegral wrd
(start,rest) = BS.splitAt 4 str encode UTF32 str = do
in case BS.unpack start of encodeChar UTF32 '\xFEFF'
[0x00,0x00,0xFE,0xFF] -> decode UTF32BE rest mapM_ (encodeChar UTF32) str
[0xFE,0xFF,0x00,0x00] -> decode UTF32LE rest encode enc str = mapM_ (encodeChar enc) str
_ -> decode UTF32BE str
decode UTF32LE str = decodeMultibyte decodeUTF32le str
decode UTF32BE str = decodeMultibyte decodeUTF32be str
decodeLazy UTF32 str = let
(start,rest) = LBS.splitAt 4 str
in case LBS.unpack start of
[0x00,0x00,0xFE,0xFF] -> decodeLazy UTF32BE rest
[0xFE,0xFF,0x00,0x00] -> decodeLazy UTF32LE rest
_ -> decodeLazy UTF32BE str
decodeLazy UTF32LE str = decodeMultibyteLazy decodeUTF32le str
decodeLazy UTF32BE str = decodeMultibyteLazy decodeUTF32be str
encodeUTF32be :: Char -> (Word8,EncodeState) decode UTF32 = do
encodeUTF32be ch = let ch <- fetchWord32be
w = ord ch case ch of
w1 = fromIntegral $ w `shiftR` 24 0x0000FEFF -> untilM sourceEmpty (decodeChar UTF32BE)
w2 = fromIntegral $ w `shiftR` 16 0xFFFE0000 -> untilM sourceEmpty (decodeChar UTF32LE)
w3 = fromIntegral $ w `shiftR` 8 _ -> do
w4 = fromIntegral $ w rest <- untilM sourceEmpty (decodeChar UTF32)
in (w1,Put3 w2 w3 w4) return ((chr $ fromIntegral ch):rest)
decode enc = untilM sourceEmpty (decodeChar enc)
encodeUTF32le :: Char -> (Word8,EncodeState)
encodeUTF32le ch = let
w = ord ch
w1 = fromIntegral $ w `shiftR` 24
w2 = fromIntegral $ w `shiftR` 16
w3 = fromIntegral $ w `shiftR` 8
w4 = fromIntegral $ w
in (w4,Put3 w3 w2 w1)
decodeUTF32be :: [Word8] -> (Char,[Word8])
decodeUTF32be (w1:w2:w3:w4:rest) = let
v = (fromIntegral w1 `shiftL` 24) .|.
(fromIntegral w2 `shiftL` 16) .|.
(fromIntegral w3 `shiftL` 8) .|.
(fromIntegral w4)
in if v < 0x0010FFFF
then (chr v,rest)
else throwDyn (IllegalRepresentation [w1,w2,w3,w4])
decodeUTF32be _ = throwDyn UnexpectedEnd
decodeUTF32le :: [Word8] -> (Char,[Word8])
decodeUTF32le (w1:w2:w3:w4:rest) = let
v = (fromIntegral w4 `shiftL` 24) .|.
(fromIntegral w3 `shiftL` 16) .|.
(fromIntegral w2 `shiftL` 8) .|.
(fromIntegral w1)
in if v < 0x0010FFFF
then (chr v,rest)
else throwDyn (IllegalRepresentation [w1,w2,w3,w4])
decodeUTF32le _ = throwDyn UnexpectedEnd

View File

@ -2,156 +2,124 @@
{- | This module implements UTF-8 encoding and decoding as in RFC 3629. {- | This module implements UTF-8 encoding and decoding as in RFC 3629.
See <http://en.wikipedia.org/wiki/UTF-8> for more information. See <http://en.wikipedia.org/wiki/UTF-8> for more information.
-} -}
module Data.Encoding.UTF8 module Data.Encoding.UTF8 where
(UTF8(..)) where
import Control.Throws
import Data.Char
import Data.Bits import Data.Bits
import Data.Char (ord,chr)
import Data.Encoding.Base import Data.Encoding.Base
import Data.ByteString import Data.Encoding.ByteSource
import Data.Word import Data.Encoding.ByteSink
import Prelude hiding (length) import Data.Encoding.Exception
import Control.Exception
import Data.Typeable import Data.Typeable
data UTF8 data UTF8 = UTF8 -- ^ Very forgiving decoding mechanism, accepts everything that it can make any sense of.
= UTF8 -- ^ Very forgiving decoding mechanism, accepts everything that it can make any sense of. | UTF8Strict -- ^ More strict decoding, doesn\'t accept sequences that have a too long representation and checks bits that aren\'t used in the decoding
| UTF8Strict -- ^ More strict decoding, doesn\'t accept sequences that have a too long representation and checks bits that aren\'t used in the decoding. deriving (Eq,Show,Typeable)
deriving (Eq,Show,Typeable)
encodeUTF8 :: Char -> (Word8,EncodeState)
encodeUTF8 x
| n<=0x0000007F = (v,Done)
| n<=0x000007FF = (fromIntegral $ 0xC0 .|. (n `shiftR` 6)
,Put1 (0x80 .|. (v .&. 0x3F)))
| n<=0x0000FFFF = (fromIntegral $ 0xE0 .|. (n `shiftR` 12)
,Put2 (fromIntegral $
0x80 .|. ((n `shiftR` 6) .&. 0x3F))
(fromIntegral $
0x80 .|. (n .&. 0x3F)))
| n<=0x0010FFFF = (fromIntegral $ 0xF0 .|. (n `shiftR` 18)
,Put3 (fromIntegral $
0x80 .|. ((n `shiftR` 12) .&. 0x3F))
(fromIntegral $
0x80 .|. ((n `shiftR` 6) .&. 0x3F))
(fromIntegral $
0x80 .|. (n .&. 0x3F)))
| otherwise = throwDyn (HasNoRepresentation x)
where
n = ord x
v = fromIntegral $ ord x
decodeUTF8 :: [Word8] -> (Char,[Word8])
decodeUTF8 ~(w1:rest1)
| w1<=0x7F = (chr $ fromIntegral w1,rest1)
| w1<=0xBF = throwDyn (IllegalCharacter w1)
| w1<=0xDF = case rest1 of
(w2:rest2) -> (chr $ ((fromIntegral $ w1 .&. 0x1F) `shiftL` 6)
.|. (fromIntegral (w2 .&. 0x3F)),rest2)
_ -> throwDyn UnexpectedEnd
| w1<=0xEF = case rest1 of
(w2:w3:rest3) -> (chr $ ((fromIntegral $ w1 .&. 0x0F) `shiftL` 12)
.|. ((fromIntegral $ w2 .&. 0x3F) `shiftL` 6)
.|. (fromIntegral $ w3 .&. 0x3F),rest3)
_ -> throwDyn UnexpectedEnd
| w1<=0xF7 = case rest1 of
(w2:w3:w4:rest4) -> (chr $ ((fromIntegral $ w1 .&. 0x07) `shiftL` 18)
.|. ((fromIntegral $ w2 .&. 0x3F) `shiftL` 12)
.|. ((fromIntegral $ w3 .&. 0x3F) `shiftL` 6)
.|. (fromIntegral $ w4 .&. 0x3F),rest4)
_ -> throwDyn UnexpectedEnd
| otherwise = throwDyn (IllegalCharacter w1)
decodeUTF8Strict :: [Word8] -> (Char,[Word8])
decodeUTF8Strict ~(w1:rest1)
| w1<=0x7F = (chr $ fromIntegral w1,rest1)
| w1<=0xBF = throwDyn (IllegalCharacter w1)
| w1<=0xDF = case rest1 of
(w2:rest2)
| invalidExtend w2 -> throwDyn (IllegalCharacter w2)
| otherwise -> let
v1 = w1 .&. 0x1F
in if v1 <= 1
then throwDyn (IllegalRepresentation [w1,w2])
else (chr $ ((fromIntegral v1) `shiftL` 6)
.|. (fromIntegral (w2 .&. 0x3F)),rest2)
_ -> throwDyn UnexpectedEnd
| w1<=0xEF = case rest1 of
(w2:w3:rest3)
| invalidExtend w2 -> throwDyn (IllegalCharacter w2)
| invalidExtend w3 -> throwDyn (IllegalCharacter w3)
| otherwise -> let
v1 = w1 .&. 0x0F
v2 = w2 .&. 0x3F
in if v1 == 0 && v2 < 0x20
then throwDyn (IllegalRepresentation [w1,w2,w3])
else (chr $ ((fromIntegral v1) `shiftL` 12)
.|. ((fromIntegral v2) `shiftL` 6)
.|. (fromIntegral $ w3 .&. 0x3F),rest3)
_ -> throwDyn UnexpectedEnd
| w1<=0xF7 = case rest1 of
(w2:w3:w4:rest4)
| invalidExtend w2 -> throwDyn (IllegalCharacter w2)
| invalidExtend w3 -> throwDyn (IllegalCharacter w3)
| invalidExtend w4 -> throwDyn (IllegalCharacter w4)
| otherwise -> let
v1 = w1 .&. 0x07
v2 = w2 .&. 0x3F
in if v1 == 0 && v2 < 0x10
then throwDyn (IllegalRepresentation [w1,w2,w3,w4])
else (chr $ ((fromIntegral $ w1 .&. 0x07) `shiftL` 18)
.|. ((fromIntegral $ w2 .&. 0x3F) `shiftL` 12)
.|. ((fromIntegral $ w3 .&. 0x3F) `shiftL` 6)
.|. (fromIntegral $ w4 .&. 0x3F),rest4)
_ -> throwDyn UnexpectedEnd
| otherwise = throwDyn (IllegalCharacter w1)
where
invalidExtend wrd = wrd .&. 0xC0 /= 0x80
data UTF8AnalyzeState
= Skip !Int
| CheckAndSkip !Word8 !Int
| Ok
| Failed
deriving Eq
instance Encoding UTF8 where instance Encoding UTF8 where
encode _ = encodeMultibyte encodeUTF8 encodeChar _ c
encodeLazy _ = encodeMultibyteLazy encodeUTF8 | n <= 0x0000007F = p8 n
encodable _ c = ord c <= 0x0010FFFF | n <= 0x000007FF = do
decode UTF8 = decodeMultibyte decodeUTF8 p8 $ 0xC0 .|. (n `shiftR` 6)
decode UTF8Strict = decodeMultibyte decodeUTF8Strict p8 $ 0x80 .|. (n .&. 0x3F)
decodeLazy UTF8 = decodeMultibyteLazy decodeUTF8 | n <= 0x0000FFFF = do
decodeLazy UTF8Strict = decodeMultibyteLazy decodeUTF8Strict p8 $ 0xE0 .|. (n `shiftR` 12)
decodable UTF8 str = (foldl' (\st w -> case st of p8 $ 0x80 .|. ((n `shiftR` 6) .&. 0x3F)
Ok | w<=0x7F -> Ok p8 $ 0x80 .|. (n .&. 0x3F)
| w<=0xBF -> Failed | n <= 0x0010FFFF = do
| w<=0xDF -> Skip 0 p8 $ 0xF0 .|. (n `shiftR` 18)
| w<=0xEF -> Skip 1 p8 $ 0x80 .|. ((n `shiftR` 12) .&. 0x3F)
| w<=0xF7 -> Skip 2 p8 $ 0x80 .|. ((n `shiftR` 6) .&. 0x3F)
| otherwise -> Failed p8 $ 0x80 .|. (n .&. 0x3F)
Failed -> Failed | otherwise = throwException (HasNoRepresentation c)
Skip n -> if w .&. 0xC0 == 0x80 where
then (if n == 0 then Ok else Skip (n-1)) n = ord c
else Failed) Ok str) == Ok p8 = pushWord8.fromIntegral
decodable UTF8Strict str = (foldl' (\st w -> case st of decodeChar UTF8 = do
Ok | w<=0x7F -> Ok w1 <- fetchWord8
| w<=0xBF -> Failed case () of
| w<=0xDF -> if w .&. 0x1F <= 1 _
then Failed | w1 <= 0x7F -> return $ chr $ fromIntegral w1
else Skip 0 | w1 <= 0xBF -> throwException (IllegalCharacter w1)
| w<=0xEF -> if w .&. 0x0F == 0 | w1 <= 0xDF -> do
then CheckAndSkip 0x20 1 w2 <- fetchWord8
else Skip 1 return $ chr $
| w<=0xF7 -> if w .&. 0x07 == 0 ((fromIntegral $ w1 .&. 0x1F) `shiftL` 6)
then CheckAndSkip 0x10 2 .|. (fromIntegral $ w2 .&. 0x3F)
else Skip 2
| otherwise -> Failed | w1 <= 0xEF -> do
Failed -> Failed w2 <- fetchWord8
Skip n -> if w .&. 0xC0 == 0x80 w3 <- fetchWord8
then (if n == 0 then Ok else Skip (n-1)) let v1 = w1 .&. 0x0F
else Failed v2 = w2 .&. 0x3F
CheckAndSkip chk n -> if w .&. 0xC0 == 0x80 && w .&. 0x3F >= chk v3 = w3 .&. 0x3F
then (if n == 0 then Ok else Skip (n-1)) return $ chr $
else Failed ((fromIntegral v1) `shiftL` 12)
) Ok str) == Ok .|. ((fromIntegral v2) `shiftL` 6)
.|. (fromIntegral v3)
| w1 <= 0xF7 -> do
w2 <- fetchWord8
w3 <- fetchWord8
w4 <- fetchWord8
let v1 = w1 .&. 0x07
v2 = w2 .&. 0x3F
v3 = w3 .&. 0x3F
v4 = w4 .&. 0x3F
return $ chr $
((fromIntegral v1) `shiftL` 18)
.|. ((fromIntegral v2) `shiftL` 12)
.|. ((fromIntegral v3) `shiftL` 6)
.|. (fromIntegral v4)
| otherwise -> throwException (IllegalCharacter w1)
decodeChar UTF8Strict = do
w1 <- fetchWord8
case () of
_
| w1 <= 0x7F -> return $ chr $ fromIntegral w1
| w1 <= 0xBF -> throwException (IllegalCharacter w1)
| w1 <= 0xDF -> do
w2 <- fetchExtend8
let v1 = w1 .&. 0x1F
if v1 <= 1
then throwException (IllegalRepresentation [w1,w2])
else return $ chr $
((fromIntegral v1) `shiftL` 6)
.|. (fromIntegral $ w2 .&. 0x3F)
| w1 <= 0xEF -> do
w2 <- fetchExtend8
w3 <- fetchExtend8
let v1 = w1 .&. 0x0F
v2 = w2 .&. 0x3F
v3 = w3 .&. 0x3F
if v1 == 0 && v2 < 0x20
then throwException (IllegalRepresentation [w1,w2,w3])
else return $ chr $
((fromIntegral v1) `shiftL` 12)
.|. ((fromIntegral v2) `shiftL` 6)
.|. (fromIntegral v3)
| w1 <= 0xF7 -> do
w2 <- fetchExtend8
w3 <- fetchExtend8
w4 <- fetchExtend8
let v1 = w1 .&. 0x07
v2 = w2 .&. 0x3F
v3 = w3 .&. 0x3F
v4 = w4 .&. 0x3F
if v1 == 0 && v2 < 0x10
then throwException (IllegalRepresentation [w1,w2,w3,w4])
else return $ chr $
((fromIntegral v1) `shiftL` 18)
.|. ((fromIntegral v2) `shiftL` 12)
.|. ((fromIntegral v3) `shiftL` 6)
.|. (fromIntegral v4)
| otherwise -> throwException (IllegalCharacter w1)
where
invalidExtend wrd = wrd .&. 0xC0 /= 0x80
fetchExtend8 = do
w <- fetchWord8
if invalidExtend w
then throwException (IllegalCharacter w)
else return w

View File

@ -1,25 +1,80 @@
{-# LANGUAGE ImplicitParams,ForeignFunctionInterface #-}
{- | This module provides a replacement for the normal (unicode unaware) IO functions of haskell.
By using implicit parameters, it can be used almost as a drop-in replacement.
-}
module System.IO.Encoding module System.IO.Encoding
(getSystemEncoding (getSystemEncoding
,hPutStr ,getContents
,hGetContents) where ,hPutStr
,hPutStrLn
,hGetContents
,readFile
,writeFile
,appendFile
,hGetChar
,hGetLine
,hPutChar
,interact
,print) where
import Foreign.C.String import Foreign.C.String
import Data.Encoding import Data.Encoding
import System.IO hiding (hPutStr,hGetContents) import System.IO (Handle,stdout,stdin)
import qualified Data.ByteString.Lazy as BS import Prelude hiding (print,getContents,readFile,writeFile,appendFile,interact)
import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString as BS
import Control.Monad.Reader (runReaderT)
-- | Like the normal 'System.IO.hGetContents', but decodes the input using an -- | Like the normal 'System.IO.hGetContents', but decodes the input using an
-- encoding. -- encoding.
hGetContents :: Encoding e => e -> Handle -> IO String hGetContents :: (Encoding e,?enc :: e) => Handle -> IO String
hGetContents enc h = do hGetContents h = do
str <- BS.hGetContents h str <- LBS.hGetContents h
return $ decodeLazy enc str return $ decodeLazyByteString ?enc str
getContents :: (Encoding e,?enc :: e) => IO String
getContents = do
str <- LBS.getContents
return $ decodeLazyByteString ?enc str
-- | Like the normal 'System.IO.hPutStr', but encodes the output using an -- | Like the normal 'System.IO.hPutStr', but encodes the output using an
-- encoding. -- encoding.
hPutStr :: Encoding e => e -> Handle -> String -> IO () hPutStr :: (Encoding e,?enc :: e) => Handle -> String -> IO ()
hPutStr enc h str = BS.hPut h (encodeLazy enc str) hPutStr h str = LBS.hPut h (encodeLazyByteString ?enc str)
hPutStrLn :: (Encoding e,?enc :: e) => Handle -> String -> IO ()
hPutStrLn h str = do
LBS.hPut h (encodeLazyByteString ?enc str)
LBS.hPut h (encodeLazyByteString ?enc "\n")
print :: (Encoding e,Show a,?enc :: e) => a -> IO ()
print x = hPutStrLn stdout (show x)
readFile :: (Encoding e,?enc :: e) => FilePath -> IO String
readFile fn = LBS.readFile fn >>= return.(decodeLazyByteString ?enc)
writeFile :: (Encoding e,?enc :: e) => FilePath -> String -> IO ()
writeFile fn str = LBS.writeFile fn $ encodeLazyByteString ?enc str
appendFile :: (Encoding e,?enc :: e) => FilePath -> String -> IO ()
appendFile fn str = LBS.appendFile fn $ encodeLazyByteString ?enc str
hGetChar :: (Encoding e,?enc :: e) => Handle -> IO Char
hGetChar h = runReaderT (decodeChar ?enc) h
hGetLine :: (Encoding e,?enc :: e) => Handle -> IO String
hGetLine h = do
line <- BS.hGetLine h
return $ decodeStrictByteString ?enc line
hPutChar :: (Encoding e,?enc :: e) => Handle -> Char -> IO ()
hPutChar h c = runReaderT (encodeChar ?enc c) h
interact :: (Encoding e,?enc :: e) => (String -> String) -> IO ()
interact f = do
line <- hGetLine stdin
hPutStrLn stdout (f line)
foreign import ccall "system_encoding.h get_system_encoding" foreign import ccall "system_encoding.h get_system_encoding"
get_system_encoding :: IO CString get_system_encoding :: IO CString
@ -29,4 +84,4 @@ getSystemEncoding :: IO DynEncoding
getSystemEncoding = do getSystemEncoding = do
enc <- get_system_encoding enc <- get_system_encoding
str <- peekCString enc str <- peekCString enc
return $ encodingFromString str return $ encodingFromString str

View File

@ -4,9 +4,10 @@ module Test.Tester where
import Data.Encoding import Data.Encoding
import Test.HUnit import Test.HUnit
import Data.Word import Data.Word
import Data.Char
import qualified Data.ByteString as BS import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS import qualified Data.ByteString.Lazy as LBS
import Control.Exception (catchDyn,evaluate) import Test.QuickCheck hiding (Testable)
data EncodingTest data EncodingTest
= forall enc. (Encoding enc,Show enc) => = forall enc. (Encoding enc,Show enc) =>
@ -17,52 +18,60 @@ data EncodingTest
EncodingError enc String EncodingException EncodingError enc String EncodingException
instance Testable EncodingTest where instance Testable EncodingTest where
test (EncodingTest enc src trg) = TestList test (EncodingTest enc src trg)
[TestLabel (show enc ++ " encodable") = TestList
(TestCase $ (all (encodable enc) src) @=? True) [TestLabel (show enc ++ " encoding")
,TestLabel (show enc ++ " encoding (strict)") (TestCase $ encodeStrictByteStringExplicit enc src
(TestCase $ bstr @=? (encode enc src)) @?= Right (BS.pack trg))
,TestLabel (show enc ++ " encoding (lazy)") ,TestLabel (show enc ++ " decoding")
(TestCase $ lbstr @=? (encodeLazy enc src)) (TestCase $ decodeStrictByteStringExplicit enc (BS.pack trg)
,TestLabel (show enc ++ " decodable") @=? Right src)
(TestCase $ (decodable enc bstr) @=? True) ]
,TestLabel (show enc ++ " decoding (strict)") test (DecodingError enc src ex)
(TestCase $ src @=? (decode enc bstr)) = TestLabel (show enc ++ " decoding error")
,TestLabel (show enc ++ " decoding (lazy)") (TestCase $ decodeStrictByteStringExplicit enc (BS.pack src) @=? Left ex)
(TestCase $ src @=? (decodeLazy enc lbstr))
]
where charGen :: Gen Char
bstr = BS.pack trg charGen = let
lbstr = LBS.pack trg ascii = choose (0x00,0x7F) >>= return.chr
test (DecodingError enc trg what) = TestList oneByte = choose (0x80,0xFF) >>= return.chr
[TestLabel (show what++" not decodable in "++show enc) $ twoByte = choose (0x0100,0xFFFF) >>= return.chr
TestCase $ assert $ not $ decodable enc (BS.pack trg) threeByte = choose (0x010000,0x10FFFF) >>= return.chr
,TestLabel (show enc ++ " decoding error (strict)") $ TestCase $ in frequency [(40,ascii),(30,oneByte),(20,twoByte),(10,threeByte)]
catchDyn (do
mapM_ evaluate (decode enc (BS.pack trg)) instance Arbitrary Char where
assertFailure "No exception thrown" arbitrary = charGen
) coarbitrary x = id
(\exc -> exc @=? what)
,TestLabel (show enc ++ " decoding error (lazy)") $ TestCase $ instance Arbitrary Word8 where
catchDyn (do arbitrary = choose (0x00,0xFF::Int) >>= return.fromIntegral
mapM_ evaluate (decodeLazy enc (LBS.pack trg)) coarbitrary x = id
assertFailure "No exception thrown"
) quickCheckEncoding :: Encoding enc => enc -> IO ()
(\exc -> exc @=? what) quickCheckEncoding e = do
] quickCheck (encodingIdentity e)
test (EncodingError enc src what) = TestList quickCheck (decodingIdentity e)
[TestLabel (show src ++ " not encodable in " ++ show enc) $
TestCase $ assert $ not $ all (encodable enc) src encodingIdentity :: Encoding enc => enc -> String -> Property
,TestLabel (show enc ++ " encoding error (strict)") $ TestCase $ encodingIdentity e str
catchDyn (do = trivial (null str)
evaluate (encode enc src) $ case encoded of
assertFailure "No exception thrown" Left err -> trivial True True
) Right res -> case decodeStrictByteStringExplicit e res of
(\exc -> exc @=? what) Left err -> property False
,TestLabel (show enc ++ " encoding error (lazy)") $ TestCase $ Right res' -> property (str==res')
catchDyn (do where
evaluate (encodeLazy enc src) encoded = encodeStrictByteStringExplicit e str
assertFailure "No exception thrown"
) decodingIdentity :: Encoding enc => enc -> [Word8] -> Property
(\exc -> exc @=? what) decodingIdentity e wrd
] = trivial (null wrd)
$ case decoded of
Left err -> trivial True True
Right res -> case encodeStrictByteStringExplicit e res of
Left err -> property False
Right res' -> property (bstr==res')
where
bstr = BS.pack wrd
decoded = decodeStrictByteStringExplicit e bstr

View File

@ -1,119 +1,158 @@
module Test.Tests where module Test.Tests where
import Test.Tester
import Data.Encoding import Data.Encoding
import Data.Encoding.ASCII
import Data.Encoding.UTF8 import Data.Encoding.UTF8
import Data.Encoding.UTF16 import Data.Encoding.UTF16
import Data.Encoding.UTF32
import Data.Encoding.ISO88592
import Data.Encoding.ISO88593
import Data.Encoding.ISO88594
import Data.Encoding.ISO88595
import Data.Encoding.ISO88596
import Data.Encoding.ISO88597
import Data.Encoding.ISO88598
import Data.Encoding.ISO88599
import Data.Encoding.ISO885910
import Data.Encoding.ISO885911
import Data.Encoding.ISO885913
import Data.Encoding.ISO885914
import Data.Encoding.BootString import Data.Encoding.BootString
import Test.Tester
import Test.HUnit import Test.HUnit
import Test.QuickCheck hiding (test)
import Data.Char (ord) import Data.Char (ord)
asciiTests :: Test identityTests :: IO ()
asciiTests = TestList $ map test $ identityTests = do
[EncodingTest ASCII putStrLn "for UTF8..."
"Hello, world!" quickCheckEncoding UTF8Strict
[0x48,0x65,0x6C,0x6C,0x6F,0x2C,0x20,0x77,0x6F,0x72,0x6C,0x64,0x21] putStrLn "for UTF16..."
,EncodingError ASCII quickCheckEncoding UTF16BE
"\x0041\x2262\x0391\x002E" quickCheckEncoding UTF16LE
(HasNoRepresentation '\x2262') putStrLn "for UTF32..."
] quickCheck $ encodingIdentity UTF32
putStrLn "for ISO88592..."
quickCheckEncoding ISO88592
putStrLn "for ISO88593..."
quickCheckEncoding ISO88593
putStrLn "for ISO88594..."
quickCheckEncoding ISO88594
putStrLn "for ISO88595..."
quickCheckEncoding ISO88595
putStrLn "for ISO88596..."
quickCheckEncoding ISO88596
putStrLn "for ISO88597..."
quickCheckEncoding ISO88597
putStrLn "for ISO88598..."
quickCheckEncoding ISO88598
putStrLn "for ISO88599..."
quickCheckEncoding ISO88599
putStrLn "for ISO885910..."
quickCheckEncoding ISO885910
putStrLn "for ISO885911..."
quickCheckEncoding ISO885911
putStrLn "for ISO885913..."
quickCheckEncoding ISO885913
putStrLn "for ISO885914..."
quickCheckEncoding ISO885914
putStrLn "for Punycode..."
quickCheck $ encodingIdentity punycode
utf8Tests :: Test utf8Tests :: Test
utf8Tests = TestList $ map test $ utf8Tests = TestList $ map test $ concat
-- Simple encoding tests [[EncodingTest enc "\x0041\x2262\x0391\x002E"
concat [[EncodingTest enc "\x0041\x2262\x0391\x002E" [0x41,0xE2,0x89,0xA2,0xCE,0x91,0x2E]
[0x41,0xE2,0x89,0xA2,0xCE,0x91,0x2E] ,EncodingTest enc "\xD55C\xAD6D\xC5B4"
,EncodingTest enc "\xD55C\xAD6D\xC5B4" [0xED,0x95,0x9C,0xEA,0xB5,0xAD,0xEC,0x96,0xB4]
[0xED,0x95,0x9C,0xEA,0xB5,0xAD,0xEC,0x96,0xB4] ,EncodingTest enc "\x65E5\x672C\x8A9E"
,EncodingTest enc "\x65E5\x672C\x8A9E" [0xE6,0x97,0xA5,0xE6,0x9C,0xAC,0xE8,0xAA,0x9E]
[0xE6,0x97,0xA5,0xE6,0x9C,0xAC,0xE8,0xAA,0x9E] ,EncodingTest enc "\x233B4"
,EncodingTest enc "\x233B4" [0xF0,0xA3,0x8E,0xB4]
[0xF0,0xA3,0x8E,0xB4] ,EncodingTest enc ""
,EncodingTest enc "" []
[] -- First possible sequence of a certain length
-- First possible sequence of a certain length ,EncodingTest enc "\x0000"
,EncodingTest enc "\x0000" [0x00]
[0x00] ,EncodingTest enc "\x0080"
,EncodingTest enc "\x0080" [0xC2,0x80]
[0xC2,0x80] ,EncodingTest enc "\x0800"
,EncodingTest enc "\x0800" [0xE0,0xA0,0x80]
[0xE0,0xA0,0x80] ,EncodingTest enc "\x10000"
,EncodingTest enc "\x10000" [0xF0,0x90,0x80,0x80]
[0xF0,0x90,0x80,0x80] -- Last possible sequence of a certain length
-- Last possible sequence of a certain length ,EncodingTest enc "\x007F"
,EncodingTest enc "\x007F" [0x7F]
[0x7F] ,EncodingTest enc "\x07FF"
,EncodingTest enc "\x07FF" [0xDF,0xBF]
[0xDF,0xBF] ,EncodingTest enc "\xFFFF"
,EncodingTest enc "\xFFFF" [0xEF,0xBF,0xBF]
[0xEF,0xBF,0xBF] -- Other boundaries
-- Other boundaries ,EncodingTest enc "\xD7FF"
,EncodingTest enc "\xD7FF" [0xED,0x9F,0xBF]
[0xED,0x9F,0xBF] ,EncodingTest enc "\xE000"
,EncodingTest enc "\xE000" [0xEE,0x80,0x80]
[0xEE,0x80,0x80] ,EncodingTest enc "\xFFFD"
,EncodingTest enc "\xFFFD" [0xEF,0xBF,0xBD]
[0xEF,0xBF,0xBD] -- Illegal starting characters
-- Illegal starting characters ,DecodingError enc
,DecodingError enc [0x65,0x55,0x85]
[0x65,0x55,0x85] (IllegalCharacter 0x85)
(IllegalCharacter 0x85) -- Unexpected end
-- Unexpected end ,DecodingError enc
,DecodingError enc [0x41,0xE2,0x89,0xA2,0xCE]
[0x41,0xE2,0x89,0xA2,0xCE] UnexpectedEnd
UnexpectedEnd ,DecodingError enc
,DecodingError enc [0x41,0xE2,0x89]
[0x41,0xE2,0x89] UnexpectedEnd
UnexpectedEnd ,DecodingError enc
,DecodingError enc [0x41,0xE2]
[0x41,0xE2] UnexpectedEnd
UnexpectedEnd] ]
| enc <- [UTF8,UTF8Strict] | enc <- [UTF8,UTF8Strict]
]++ ]++
[DecodingError UTF8 [0xFE] (IllegalCharacter 0xFE) [DecodingError UTF8 [0xFE] (IllegalCharacter 0xFE)
,DecodingError UTF8 [0xFF] (IllegalCharacter 0xFF) ,DecodingError UTF8 [0xFF] (IllegalCharacter 0xFF)
-- Overlong representations of '/' -- Overlong representations of '/'
,DecodingError UTF8Strict [0xC0,0xAF] ,DecodingError UTF8Strict [0xC0,0xAF]
(IllegalRepresentation [0xC0,0xAF]) (IllegalRepresentation [0xC0,0xAF])
,DecodingError UTF8Strict [0xE0,0x80,0xAF] ,DecodingError UTF8Strict [0xE0,0x80,0xAF]
(IllegalRepresentation [0xE0,0x80,0xAF]) (IllegalRepresentation [0xE0,0x80,0xAF])
,DecodingError UTF8Strict [0xF0,0x80,0x80,0xAF] ,DecodingError UTF8Strict [0xF0,0x80,0x80,0xAF]
(IllegalRepresentation [0xF0,0x80,0x80,0xAF]) (IllegalRepresentation [0xF0,0x80,0x80,0xAF])
-- Maximum overlong sequences -- Maximum overlong sequences
,DecodingError UTF8Strict [0xC1,0xBF] ,DecodingError UTF8Strict [0xC1,0xBF]
(IllegalRepresentation [0xC1,0xBF]) (IllegalRepresentation [0xC1,0xBF])
,DecodingError UTF8Strict [0xE0,0x9F,0xBF] ,DecodingError UTF8Strict [0xE0,0x9F,0xBF]
(IllegalRepresentation [0xE0,0x9F,0xBF]) (IllegalRepresentation [0xE0,0x9F,0xBF])
,DecodingError UTF8Strict [0xF0,0x8F,0xBF,0xBF] ,DecodingError UTF8Strict [0xF0,0x8F,0xBF,0xBF]
(IllegalRepresentation [0xF0,0x8F,0xBF,0xBF]) (IllegalRepresentation [0xF0,0x8F,0xBF,0xBF])
-- Overlong represenations of '\NUL' -- Overlong represenations of '\NUL'
,DecodingError UTF8Strict [0xC0,0x80] ,DecodingError UTF8Strict [0xC0,0x80]
(IllegalRepresentation [0xC0,0x80]) (IllegalRepresentation [0xC0,0x80])
,DecodingError UTF8Strict [0xE0,0x80,0x80] ,DecodingError UTF8Strict [0xE0,0x80,0x80]
(IllegalRepresentation [0xE0,0x80,0x80]) (IllegalRepresentation [0xE0,0x80,0x80])
,DecodingError UTF8Strict [0xF0,0x80,0x80,0x80] ,DecodingError UTF8Strict [0xF0,0x80,0x80,0x80]
(IllegalRepresentation [0xF0,0x80,0x80,0x80]) (IllegalRepresentation [0xF0,0x80,0x80,0x80])
-- Invalid extends -- Invalid extends
-- 2 of 2 -- 2 of 2
,DecodingError UTF8Strict [0xCC,0x1C,0xE0] ,DecodingError UTF8Strict [0xCC,0x1C,0xE0]
(IllegalCharacter 0x1C) (IllegalCharacter 0x1C)
-- 2 of 3 -- 2 of 3
,DecodingError UTF8Strict [0xE3,0x6C,0xB3] ,DecodingError UTF8Strict [0xE3,0x6C,0xB3]
(IllegalCharacter 0x6C) (IllegalCharacter 0x6C)
-- 3 of 3 -- 3 of 3
,DecodingError UTF8Strict [0xE3,0xB4,0x6D] ,DecodingError UTF8Strict [0xE3,0xB4,0x6D]
(IllegalCharacter 0x6D) (IllegalCharacter 0x6D)
-- 2 of 4 -- 2 of 4
,DecodingError UTF8Strict [0xF2,0x6C,0xB3,0xB3] ,DecodingError UTF8Strict [0xF2,0x6C,0xB3,0xB3]
(IllegalCharacter 0x6C) (IllegalCharacter 0x6C)
-- 3 of 4 -- 3 of 4
,DecodingError UTF8Strict [0xF2,0xB3,0x6C,0xB3] ,DecodingError UTF8Strict [0xF2,0xB3,0x6C,0xB3]
(IllegalCharacter 0x6C) (IllegalCharacter 0x6C)
-- 4 of 4 -- 4 of 4
,DecodingError UTF8Strict [0xF2,0xB3,0xB3,0x6C] ,DecodingError UTF8Strict [0xF2,0xB3,0xB3,0x6C]
(IllegalCharacter 0x6C) (IllegalCharacter 0x6C)
] ]
utf16Tests :: Test utf16Tests :: Test
utf16Tests = TestList $ map test $ utf16Tests = TestList $ map test $
@ -129,9 +168,9 @@ utf16Tests = TestList $ map test $
[0x02,0xE8] [0x02,0xE8]
,DecodingError UTF16LE [0x65,0xDC] ,DecodingError UTF16LE [0x65,0xDC]
(IllegalCharacter 0xDC) (IllegalCharacter 0xDC)
,DecodingError UTF16BE [0xDC] ,DecodingError UTF16BE [0xDC,0x33]
(IllegalCharacter 0xDC) (IllegalCharacter 0xDC)
,DecodingError UTF16BE [0xD9,0x78,0xDA] ,DecodingError UTF16BE [0xD9,0x78,0xDA,0x33]
(IllegalCharacter 0xDA) (IllegalCharacter 0xDA)
,DecodingError UTF16BE [0xD9,0x78,0xDA,0x66] ,DecodingError UTF16BE [0xD9,0x78,0xDA,0x66]
(IllegalCharacter 0xDA) (IllegalCharacter 0xDA)
@ -183,5 +222,9 @@ punycodeTests = TestList $ map test $
-- Vietnamese -- Vietnamese
,punyTest "TạisaohọkhôngthểchỉnóitiếngViệt" ,punyTest "TạisaohọkhôngthểchỉnóitiếngViệt"
"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g" "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"
{-,punyTest "foo\xC2\xAD\xCD\x8F\xE1\xA0\x86\xE1\xA0\x8B\
\bar\xE2\x80\x8B\xE2\x81\xA0\
\baz\xEF\xB8\x80\xEF\xB8\x88\xEF\xB8\x8F\xEF\xBB\xBF"
"foobarbaz"-}
] ]
where punyTest str outp = EncodingTest punycode str (map (fromIntegral.ord) outp) where punyTest str outp = EncodingTest punycode str (map (fromIntegral.ord) outp)

View File

@ -1,63 +1,37 @@
Name: encoding Name: encoding
Version: 0.4.1 Version: 0.5.0
Author: Henning Günther Author: Henning Günther
Maintainer: h.guenther@tu-bs.de Maintainer: h.guenther@tu-bs.de
License: BSD3 License: BSD3
Synopsis: A library for various character encodings Synopsis: A library for various character encodings
Stability: alpha
Description: Description:
Haskell has excellect handling of unicode, the Char type covers all unicode chars. Unfortunatly, there's no possibility to read or write something to the outer world in an encoding other than ascii due to the lack of support for encodings. This library should help with that. Haskell has excellect handling of unicode, the Char type covers all unicode chars. Unfortunatly, there's no possibility to read or write something to the outer world in an encoding other than ascii due to the lack of support for encodings. This library should help with that.
Category: Codec Category: Codec
Homepage: http://code.haskell.org/encoding/ Homepage: http://code.haskell.org/encoding/
Cabal-Version: >=1.2 Cabal-Version: >=1.2
Build-Type: Simple Build-Type: Simple
Extra-Source-Files:
8859-2.TXT
8859-3.TXT
8859-4.TXT
8859-5.TXT
8859-6.TXT
8859-7.TXT
8859-8.TXT
8859-9.TXT
8859-10.TXT
8859-11.TXT
8859-13.TXT
8859-14.TXT
8859-15.TXT
8859-16.TXT
CP1250.TXT
CP1251.TXT
CP1252.TXT
CP1253.TXT
CP1254.TXT
CP1255.TXT
CP1256.TXT
CP1257.TXT
CP1258.TXT
gb-18030-2000.xml
system_encoding.h
create_gb18030_data.sh
NEWS
Flag splitBase Flag splitBase
description: Choose the new smaller, split-up base package. description: Choose the new smaller, split-up base package.
Library Library
if flag(splitBase) if flag(splitBase)
Build-Depends: bytestring, base >= 3, template-haskell, containers, array, regex-compat Build-Depends: bytestring, base >= 3, binary, mtl, containers, extensible-exceptions, array, template-haskell, regex-compat
else else
Build-Depends: base < 3, template-haskell, regex-compat Build-Depends: base < 3, binary, extensible-exceptions, template-haskell
Extensions: TemplateHaskell,CPP,ExistentialQuantification,ForeignFunctionInterface
C-Sources: system_encoding.c
Include-Dirs: .
Install-Includes: system_encoding.h
Exposed-Modules: Exposed-Modules:
Data.Encoding Data.Encoding
Data.Encoding.ByteSource
Data.Encoding.ByteSink
Data.Encoding.Exception
Control.Throws
Data.Encoding.ASCII Data.Encoding.ASCII
Data.Encoding.UTF8 Data.Encoding.UTF8
Data.Encoding.UTF16 Data.Encoding.UTF16
Data.Encoding.UTF32 Data.Encoding.UTF32
Data.Encoding.KOI8R
Data.Encoding.KOI8U
Data.Encoding.ISO88591 Data.Encoding.ISO88591
Data.Encoding.ISO88592 Data.Encoding.ISO88592
Data.Encoding.ISO88593 Data.Encoding.ISO88593
@ -73,7 +47,6 @@ Library
Data.Encoding.ISO885914 Data.Encoding.ISO885914
Data.Encoding.ISO885915 Data.Encoding.ISO885915
Data.Encoding.ISO885916 Data.Encoding.ISO885916
Data.Encoding.BootString
Data.Encoding.CP1250 Data.Encoding.CP1250
Data.Encoding.CP1251 Data.Encoding.CP1251
Data.Encoding.CP1252 Data.Encoding.CP1252
@ -83,12 +56,7 @@ Library
Data.Encoding.CP1256 Data.Encoding.CP1256
Data.Encoding.CP1257 Data.Encoding.CP1257
Data.Encoding.CP1258 Data.Encoding.CP1258
Data.Encoding.KOI8R
Data.Encoding.KOI8U
Data.Encoding.GB18030 Data.Encoding.GB18030
System.IO.Encoding
Other-Modules:
Data.Encoding.Base
Data.Encoding.GB18030Data
Data.Encoding.Helper.Template Data.Encoding.Helper.Template
Data.Encoding.BootString
System.IO.Encoding