Rewrite to support more sources and changing the encoding dynamically
Now it's possible to change the character encoding while de-/encoding. Also, it's possible to use any data structure as a source or target of the de-/encoding process. darcs-hash:20090221203100-a4fee-6da31f2e37c30a3f5cd5f10af71984209488bb0b
This commit is contained in:
parent
8b16078e5e
commit
eeee054f1e
28
Control/Throws.hs
Normal file
28
Control/Throws.hs
Normal file
@ -0,0 +1,28 @@
|
||||
{-# LANGUAGE FlexibleInstances,MultiParamTypeClasses #-}
|
||||
module Control.Throws where
|
||||
|
||||
import Control.Exception.Extensible
|
||||
import Control.Monad.Identity
|
||||
import Control.Monad.Reader
|
||||
import Control.Monad.State
|
||||
|
||||
class Throws e m where
|
||||
throwException :: e -> m a
|
||||
|
||||
instance Exception e => Throws e Identity where
|
||||
throwException = throw
|
||||
|
||||
{-instance MonadError e m => Throws e m where
|
||||
throwException = throwError-}
|
||||
|
||||
instance Throws e (Either e) where
|
||||
throwException = Left
|
||||
|
||||
instance Exception e => Throws e IO where
|
||||
throwException = throw
|
||||
|
||||
instance Throws e m => Throws e (StateT s m) where
|
||||
throwException x = StateT (\s -> throwException x)
|
||||
|
||||
instance Throws e m => Throws e (ReaderT s m) where
|
||||
throwException x = ReaderT (\s -> throwException x)
|
||||
129
Data/Encoding.hs
129
Data/Encoding.hs
@ -1,24 +1,45 @@
|
||||
{-# LANGUAGE ExistentialQuantification,CPP #-}
|
||||
{-# LANGUAGE FlexibleContexts,ExistentialQuantification #-}
|
||||
module Data.Encoding
|
||||
(Encoding(..)
|
||||
,EncodingException(..)
|
||||
,DecodingException(..)
|
||||
,recode
|
||||
,recodeLazy
|
||||
,DynEncoding()
|
||||
#ifndef USE_HPC
|
||||
,encodingFromString
|
||||
,encodingFromStringMaybe
|
||||
#endif
|
||||
)
|
||||
where
|
||||
(module Data.Encoding.Exception
|
||||
,module Data.Encoding.ByteSource
|
||||
,module Data.Encoding.ByteSink
|
||||
,Encoding(..)
|
||||
,DynEncoding
|
||||
,recode
|
||||
,encodeString
|
||||
,encodeStringExplicit
|
||||
,decodeString
|
||||
,decodeStringExplicit
|
||||
,encodeLazyByteString
|
||||
,encodeLazyByteStringExplicit
|
||||
,decodeLazyByteString
|
||||
,decodeLazyByteStringExplicit
|
||||
,encodeStrictByteString
|
||||
,encodeStrictByteStringExplicit
|
||||
,decodeStrictByteString
|
||||
,decodeStrictByteStringExplicit
|
||||
,encodingFromString
|
||||
)
|
||||
where
|
||||
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString.Lazy as Lazy (ByteString)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Base
|
||||
import Data.Encoding.ByteSource
|
||||
import Data.Encoding.ByteSink
|
||||
import Data.Encoding.Exception
|
||||
|
||||
import Data.Sequence
|
||||
import Data.Foldable(toList)
|
||||
import Data.Char
|
||||
|
||||
import Control.Monad.State
|
||||
import Control.Monad.Identity
|
||||
import Control.Monad.Error.Class
|
||||
import Data.Binary.Put
|
||||
import Data.Binary.Get
|
||||
|
||||
import qualified Data.ByteString as BS
|
||||
import qualified Data.ByteString.Lazy as LBS
|
||||
|
||||
#ifndef USE_HPC
|
||||
import Data.Encoding.ASCII
|
||||
import Data.Encoding.UTF8
|
||||
import Data.Encoding.UTF16
|
||||
@ -52,36 +73,61 @@ import Data.Encoding.KOI8U
|
||||
import Data.Encoding.GB18030
|
||||
import Data.Char
|
||||
import Text.Regex
|
||||
#endif
|
||||
|
||||
-- | An untyped encoding. Used in 'System.IO.Encoding.getSystemEncoding'.
|
||||
data DynEncoding = forall t. (Encoding t,Show t,Typeable t,Eq t)
|
||||
=> DynEncoding t
|
||||
data DynEncoding = forall enc. Encoding enc => DynEncoding enc
|
||||
|
||||
instance Encoding DynEncoding where
|
||||
encode (DynEncoding enc) = encode enc
|
||||
encodeLazy (DynEncoding enc) = encodeLazy enc
|
||||
encodable (DynEncoding enc) = encodable enc
|
||||
decode (DynEncoding enc) = decode enc
|
||||
decodeLazy (DynEncoding enc) = decodeLazy enc
|
||||
decodable (DynEncoding enc) = decodable enc
|
||||
decodeChar (DynEncoding e) = decodeChar e
|
||||
encodeChar (DynEncoding e) = encodeChar e
|
||||
decode (DynEncoding e) = decode e
|
||||
encode (DynEncoding e) = encode e
|
||||
|
||||
instance Show DynEncoding where
|
||||
show (DynEncoding enc) = "DynEncoding "++show enc
|
||||
recode :: (Encoding enc1,Encoding enc2,ByteSource m,ByteSink m) => enc1 -> enc2 -> m ()
|
||||
recode e1 e2 = untilM_ sourceEmpty (decodeChar e1 >>= encodeChar e2)
|
||||
|
||||
instance Eq DynEncoding where
|
||||
(DynEncoding enc1) == (DynEncoding enc2) = case cast enc2 of
|
||||
Nothing -> False
|
||||
Just renc2 -> enc1 == renc2
|
||||
encodeString :: Encoding enc => enc -> String -> String
|
||||
encodeString e str = toList $ viewl $ execState (encode e str) empty
|
||||
|
||||
-- | This decodes a string from one encoding and encodes it into another.
|
||||
recode :: (Encoding from,Encoding to) => from -> to -> ByteString -> ByteString
|
||||
recode enc_f enc_t bs = encode enc_t (decode enc_f bs)
|
||||
encodeStringExplicit :: Encoding enc => enc -> String -> Either EncodingException String
|
||||
encodeStringExplicit e str = execStateT (encode e str) empty >>= return.toList.viewl
|
||||
|
||||
recodeLazy :: (Encoding from,Encoding to) => from -> to -> Lazy.ByteString -> Lazy.ByteString
|
||||
recodeLazy enc_f enc_t bs = encodeLazy enc_t (decodeLazy enc_f bs)
|
||||
decodeString :: Encoding enc => enc -> String -> String
|
||||
decodeString e str = evalState (decode e) str
|
||||
|
||||
decodeStringExplicit :: Encoding enc => enc -> String -> Either DecodingException String
|
||||
decodeStringExplicit e str = evalStateT (decode e) str
|
||||
|
||||
encodeLazyByteString :: Encoding enc => enc -> String -> LBS.ByteString
|
||||
encodeLazyByteString e str = runPut $ encode e str
|
||||
|
||||
encodeLazyByteStringExplicit :: Encoding enc => enc -> String -> Either EncodingException LBS.ByteString
|
||||
encodeLazyByteStringExplicit e str = let PutME g = encode e str
|
||||
in case g of
|
||||
Left err -> Left err
|
||||
Right (p,()) -> Right $ runPut p
|
||||
|
||||
decodeLazyByteString :: Encoding enc => enc -> LBS.ByteString -> String
|
||||
decodeLazyByteString e str = runGet (decode e) str
|
||||
|
||||
decodeLazyByteStringExplicit :: Encoding enc => enc -> LBS.ByteString -> Either DecodingException String
|
||||
decodeLazyByteStringExplicit e str = evalStateT (decode e) str
|
||||
|
||||
encodeStrictByteString :: Encoding enc => enc -> String -> BS.ByteString
|
||||
encodeStrictByteString e str = snd $ createStrict $ encode e str
|
||||
|
||||
encodeStrictByteStringExplicit :: Encoding enc => enc -> String -> Either EncodingException BS.ByteString
|
||||
encodeStrictByteStringExplicit e str = let StrictSinkE g = encode e str
|
||||
(r,bstr) = createStrict g
|
||||
in case r of
|
||||
Left err -> Left err
|
||||
Right _ -> Right bstr
|
||||
|
||||
decodeStrictByteString :: Encoding enc => enc -> BS.ByteString -> String
|
||||
decodeStrictByteString e str = evalState (decode e) str
|
||||
|
||||
decodeStrictByteStringExplicit :: Encoding enc => enc -> BS.ByteString -> Either DecodingException String
|
||||
decodeStrictByteStringExplicit e str = evalStateT (decode e) str
|
||||
|
||||
#ifndef USE_HPC
|
||||
-- | Like 'encodingFromString' but returns 'Nothing' instead of throwing an error
|
||||
encodingFromStringMaybe :: String -> Maybe DynEncoding
|
||||
encodingFromStringMaybe codeName = case (normalizeEncoding codeName) of
|
||||
@ -266,12 +312,9 @@ encodingFromStringMaybe codeName = case (normalizeEncoding codeName) of
|
||||
normalizeEncoding s = map toLower $ subRegex sep s "_"
|
||||
sep = mkRegex "[^0-9A-Za-z]+"
|
||||
|
||||
|
||||
|
||||
-- | Takes the name of an encoding and creates a dynamic encoding from it.
|
||||
encodingFromString :: String -> DynEncoding
|
||||
encodingFromString str = maybe
|
||||
(error $ "Data.Encoding.encodingFromString: Unknown encoding: "++show str)
|
||||
id
|
||||
(encodingFromStringMaybe str)
|
||||
#endif
|
||||
(encodingFromStringMaybe str)
|
||||
@ -1,31 +1,17 @@
|
||||
{-# LANGUAGE DeriveDataTypeable #-}
|
||||
-- | ASCII (American Standard Code for Information Interchange) is the
|
||||
-- \"normal\" computer encoding using the byte values 0-127 to represent
|
||||
-- characters. Refer to <http://en.wikipedia.org/wiki/ASCII> for
|
||||
-- more information.
|
||||
module Data.Encoding.ASCII
|
||||
(ASCII(..)) where
|
||||
module Data.Encoding.ASCII where
|
||||
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.ByteString (pack)
|
||||
import qualified Data.ByteString.Lazy as Lazy (pack)
|
||||
import Data.ByteString.Char8 (unpack)
|
||||
import Data.Char (ord)
|
||||
import qualified Data.ByteString.Lazy as Lazy
|
||||
import Data.Char
|
||||
import Data.Encoding.Base
|
||||
import Data.Word
|
||||
import Data.Encoding.ByteSource
|
||||
import Data.Encoding.ByteSink
|
||||
import Data.Typeable
|
||||
|
||||
data ASCII = ASCII deriving (Show,Eq,Typeable)
|
||||
|
||||
charToASCII :: Char -> Word8
|
||||
charToASCII ch = if ch < '\128'
|
||||
then fromIntegral $ ord ch
|
||||
else throwDyn (HasNoRepresentation ch)
|
||||
|
||||
instance Encoding ASCII where
|
||||
encode _ str = pack (map charToASCII str)
|
||||
encodeLazy _ str = Lazy.pack (map charToASCII str)
|
||||
encodable _ ch = ch < '\128'
|
||||
decode _ = unpack
|
||||
decodable _ = const True
|
||||
decodeChar _ = do
|
||||
w <- fetchWord8
|
||||
return $ chr $ fromIntegral w
|
||||
encodeChar _ c = do
|
||||
pushWord8 $ fromIntegral $ ord c
|
||||
@ -1,157 +1,45 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
module Data.Encoding.Base where
|
||||
|
||||
module Data.Encoding.Base
|
||||
(Encoding(..)
|
||||
,EncodeState(..)
|
||||
,encodeMultibyte
|
||||
,encodeMultibyteLazy
|
||||
,decodeMultibyte
|
||||
,decodeMultibyteLazy
|
||||
,encodeSinglebyte
|
||||
,encodeSinglebyteLazy
|
||||
,decodeSinglebyte
|
||||
,EncodingException(..)
|
||||
,DecodingException(..)
|
||||
,decodingArray
|
||||
,encodingMap)
|
||||
where
|
||||
import Data.Encoding.Exception
|
||||
import Data.Encoding.ByteSource
|
||||
import Data.Encoding.ByteSink
|
||||
|
||||
import Data.Array(array)
|
||||
import Data.ByteString (ByteString,unfoldrN,unfoldr,length,index,unpack)
|
||||
import qualified Data.ByteString.Lazy as LBS
|
||||
import Data.Encoding.Helper.Template
|
||||
|
||||
#if __GLASGOW_HASKELL__>=608
|
||||
import Data.ByteString.Unsafe(unsafeIndex)
|
||||
#else
|
||||
import Data.ByteString.Base(unsafeIndex)
|
||||
#endif
|
||||
|
||||
import Data.Map (Map,fromList,lookup)
|
||||
import Data.Char(chr)
|
||||
import Data.Maybe(mapMaybe)
|
||||
import Data.Typeable
|
||||
import Control.Throws
|
||||
import Data.Array as Array
|
||||
import Data.Map as Map hiding ((!))
|
||||
import Data.Word
|
||||
import Prelude hiding (lookup,length)
|
||||
import qualified Prelude
|
||||
import Control.Exception
|
||||
import Data.Dynamic(toDyn)
|
||||
import Data.Char
|
||||
|
||||
import Language.Haskell.TH
|
||||
|
||||
{- | Represents an encoding, supporting various methods of de- and encoding.
|
||||
Minimal complete definition: encode, decode
|
||||
-}
|
||||
class Encoding enc where
|
||||
-- | Encode a 'String' into a strict 'ByteString'. Throws the
|
||||
-- 'HasNoRepresentation'-Exception if it encounters an unrepresentable
|
||||
-- character.
|
||||
encode :: enc -> String -> ByteString
|
||||
-- | Encode a 'String' into a lazy 'Data.ByteString.Lazy.ByteString'.
|
||||
encodeLazy :: enc -> String -> LBS.ByteString
|
||||
encodeLazy e str = LBS.fromChunks [encode e str]
|
||||
-- | Whether or not the given 'Char' is representable in this encoding. Default: 'True'.
|
||||
encodable :: enc -> Char -> Bool
|
||||
encodable _ _ = True
|
||||
-- | Decode a strict 'ByteString' into a 'String'. If the string is not
|
||||
-- decodable, a 'DecodingException' is thrown.
|
||||
decode :: enc -> ByteString -> String
|
||||
decodeLazy :: enc -> LBS.ByteString -> String
|
||||
decodeLazy e str = concatMap (decode e) (LBS.toChunks str)
|
||||
-- | Whether or no a given 'ByteString' is decodable. Default: 'True'.
|
||||
decodable :: enc -> ByteString -> Bool
|
||||
decodable _ _ = True
|
||||
decodeChar :: ByteSource m => enc -> m Char
|
||||
encodeChar :: ByteSink m => enc -> Char -> m ()
|
||||
decode :: ByteSource m => enc -> m String
|
||||
decode e = untilM sourceEmpty (decodeChar e)
|
||||
encode :: ByteSink m => enc -> String -> m ()
|
||||
encode e = mapM_ (encodeChar e)
|
||||
|
||||
encodeMultibyte :: (Char -> (Word8,EncodeState)) -> String -> ByteString
|
||||
encodeMultibyte f str = unfoldr (\st -> case st of
|
||||
(Done,[]) -> Nothing
|
||||
(Done,x:xs) -> let (w,st) = f x in Just (w,(st,xs))
|
||||
(Put1 w1,xs) -> Just (w1,(Done,xs))
|
||||
(Put2 w1 w2,xs) -> Just (w1,(Put1 w2,xs))
|
||||
(Put3 w1 w2 w3,xs) -> Just (w1,(Put2 w2 w3,xs))) (Done,str)
|
||||
untilM :: Monad m => m Bool -> m a -> m [a]
|
||||
untilM check act = do
|
||||
end <- check
|
||||
if end
|
||||
then return []
|
||||
else (do
|
||||
x <- act
|
||||
xs <- untilM check act
|
||||
return (x:xs)
|
||||
)
|
||||
|
||||
encodeMultibyteLazy :: (Char -> (Word8,EncodeState)) -> String -> LBS.ByteString
|
||||
encodeMultibyteLazy f str = LBS.unfoldr (\ ~(st,rest) -> case st of
|
||||
Done -> case rest of
|
||||
[] -> Nothing
|
||||
x:xs -> let ~(w,st) = f x in Just (w,(st,xs))
|
||||
Put1 w1 -> Just (w1,(Done,rest))
|
||||
Put2 w1 w2 -> Just (w1,(Put1 w2,rest))
|
||||
Put3 w1 w2 w3 -> Just (w1,(Put2 w2 w3,rest))) (Done,str)
|
||||
untilM_ :: Monad m => m Bool -> m a -> m ()
|
||||
untilM_ check act = untilM check act >> return ()
|
||||
|
||||
decodeMultibyte :: ([Word8] -> (Char,[Word8])) -> ByteString -> String
|
||||
decodeMultibyte f str = decode (unpack str)
|
||||
where
|
||||
decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst)
|
||||
|
||||
decodeMultibyteLazy :: ([Word8] -> (Char,[Word8])) -> LBS.ByteString -> String
|
||||
decodeMultibyteLazy f str = decode (LBS.unpack str)
|
||||
where
|
||||
decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst)
|
||||
|
||||
encodeSinglebyte :: (Char -> Word8) -> String -> ByteString
|
||||
encodeSinglebyte f str = fst $ unfoldrN (Prelude.length str) (\st -> case st of
|
||||
[] -> Nothing
|
||||
(x:xs) -> Just (f x,xs)) str
|
||||
|
||||
encodeSinglebyteLazy :: (Char -> Word8) -> String -> LBS.ByteString
|
||||
encodeSinglebyteLazy f str = LBS.unfoldr (\st -> case st of
|
||||
[] -> Nothing
|
||||
(x:xs) -> Just (f x,xs)) str
|
||||
|
||||
decodeSinglebyte :: (Word8 -> Char) -> ByteString -> String
|
||||
decodeSinglebyte f str = map f (unpack str)
|
||||
|
||||
data EncodeState
|
||||
= Done
|
||||
| Put1 !Word8
|
||||
| Put2 !Word8 !Word8
|
||||
| Put3 !Word8 !Word8 !Word8
|
||||
deriving Show
|
||||
|
||||
-- | This exception type is thrown whenever something went wrong during the
|
||||
-- encoding-process.
|
||||
data EncodingException
|
||||
= HasNoRepresentation Char -- ^ Thrown if a specific character
|
||||
-- is not representable in an encoding.
|
||||
deriving (Eq,Show,Typeable)
|
||||
|
||||
-- | This exception type is thrown whenever something went wrong during the
|
||||
-- decoding-process.
|
||||
data DecodingException
|
||||
= IllegalCharacter Word8 -- ^ The sequence contained an illegal
|
||||
-- byte that couldn't be decoded.
|
||||
| UnexpectedEnd -- ^ more bytes were needed to allow a
|
||||
-- successfull decoding.
|
||||
| OutOfRange -- ^ the decoded value was out of the unicode range
|
||||
| IllegalRepresentation [Word8] -- ^ The character sequence encodes a
|
||||
-- character, but is illegal.
|
||||
deriving (Eq,Show,Typeable)
|
||||
|
||||
decodingArray :: FilePath -> Q Exp
|
||||
decodingArray file = do
|
||||
trans <- runIO (readTranslation file)
|
||||
createCharArray trans 0 255
|
||||
|
||||
encodingMap :: FilePath -> Q Exp
|
||||
#ifndef __HADDOCK__
|
||||
encodingMap file = do
|
||||
trans <- runIO (readTranslation file)
|
||||
return $ AppE
|
||||
(VarE 'fromList)
|
||||
(ListE [ TupE [LitE $ CharL to,LitE $ IntegerL from]
|
||||
| (from,to) <- trans])
|
||||
#endif
|
||||
|
||||
readTranslation :: FilePath -> IO [(Integer,Char)]
|
||||
readTranslation file = do
|
||||
cont <- readFile file
|
||||
return $ mapMaybe (\ln -> case ln of
|
||||
[] -> Nothing
|
||||
('#':xs) -> Nothing
|
||||
_ -> case words ln of
|
||||
(src:"#UNDEFINED":_) -> Just (read src,'\xFFFD') -- XXX: Find a better way to handle this
|
||||
(src:trg:_) -> Just (read src,chr $ read trg)
|
||||
_ -> Nothing
|
||||
) (lines cont)
|
||||
encodeWithMap :: ByteSink m => Map Char Word8 -> Char -> m ()
|
||||
encodeWithMap mp c = case Map.lookup c mp of
|
||||
Nothing -> throwException $ HasNoRepresentation c
|
||||
Just v -> pushWord8 v
|
||||
|
||||
decodeWithArray :: ByteSource m => Array Word8 (Maybe Char) -> m Char
|
||||
decodeWithArray arr = do
|
||||
w <- fetchWord8
|
||||
case arr!w of
|
||||
Nothing -> throwException $ IllegalCharacter w
|
||||
Just c -> return c
|
||||
@ -6,11 +6,15 @@ module Data.Encoding.BootString
|
||||
,punycode) where
|
||||
|
||||
import Data.Encoding.Base
|
||||
import Data.ByteString.Char8 (pack,unpack)
|
||||
import Data.List (unfoldr,partition)
|
||||
import Data.Encoding.Exception
|
||||
import Data.Encoding.ByteSource
|
||||
import Data.Encoding.ByteSink
|
||||
import Control.Throws
|
||||
import Data.Word
|
||||
import Data.List (unfoldr,partition,find)
|
||||
import Data.Char (ord,chr)
|
||||
import Data.Typeable
|
||||
import Control.Exception (throwDyn)
|
||||
import Control.Monad (when)
|
||||
|
||||
data BootString = BootString
|
||||
{base :: Int
|
||||
@ -34,27 +38,27 @@ punycode = BootString
|
||||
,init_n = 0x80
|
||||
}
|
||||
|
||||
punyValue :: Char -> Int
|
||||
punyValue :: ByteSource m => Word8 -> m Int
|
||||
punyValue c
|
||||
| n < 0x30 = norep
|
||||
| n <= 0x39 = n-0x30+26
|
||||
| n <= 0x39 = return $ n-0x30+26
|
||||
| n < 0x41 = norep
|
||||
| n <= 0x5A = n-0x41
|
||||
| n <= 0x5A = return $ n-0x41
|
||||
| n < 0x61 = norep
|
||||
| n <= 0x7A = n-0x61
|
||||
| n <= 0x7A = return $ n-0x61
|
||||
| otherwise = norep
|
||||
where
|
||||
n = ord c
|
||||
norep = throwDyn (HasNoRepresentation c)
|
||||
n = fromIntegral c
|
||||
norep = throwException (IllegalCharacter c)
|
||||
|
||||
punyChar :: Int -> Char
|
||||
punyChar :: ByteSink m => Int -> m Word8
|
||||
punyChar c
|
||||
| c < 0 = norep
|
||||
| c < 26 = chr $ 0x61+c
|
||||
| c < 36 = chr $ 0x30+c-26
|
||||
| c < 26 = return $ fromIntegral $ 0x61+c
|
||||
| c < 36 = return $ fromIntegral $ 0x30+c-26
|
||||
| otherwise = norep
|
||||
where
|
||||
norep = throwDyn OutOfRange
|
||||
norep = throwException (HasNoRepresentation (chr c))
|
||||
|
||||
getT :: BootString -> Int -> Int -> Int
|
||||
getT bs k bias
|
||||
@ -73,40 +77,46 @@ adapt bs delta numpoints firsttime = let
|
||||
$ iterate (\(d,k) -> (d `div` (base bs - tmin bs),k+(base bs))) (delta2,0)
|
||||
in rk + (((base bs - tmin bs +1) * rd) `div` (rd + skew bs))
|
||||
|
||||
decodeValue :: BootString -> Int -> Int -> Int -> Int -> [Int] -> (Int,[Int])
|
||||
decodeValue :: ByteSource m => BootString -> Int -> Int -> Int -> Int -> [Int] -> m (Int,[Int])
|
||||
decodeValue bs bias i k w (x:xs)
|
||||
| x >= base bs = throwDyn OutOfRange
|
||||
| x > (maxBound - i) `div` w = throwDyn OutOfRange
|
||||
| x < t = (ni,xs)
|
||||
| w > maxBound `div` (base bs - t) = throwDyn OutOfRange
|
||||
| x >= base bs = throwException OutOfRange
|
||||
| x > (maxBound - i) `div` w = throwException OutOfRange
|
||||
| x < t = return (ni,xs)
|
||||
| w > maxBound `div` (base bs - t) = throwException OutOfRange
|
||||
| null xs = throwException OutOfRange
|
||||
| otherwise = decodeValue bs bias ni (k+base bs) (w*(base bs - t)) xs
|
||||
where
|
||||
ni = i + x*w
|
||||
t = getT bs k bias
|
||||
|
||||
decodeValues :: BootString -> Int -> [Int] -> [(Char,Int)]
|
||||
decodeValues :: ByteSource m => BootString -> Int -> [Int] -> m [(Char,Int)]
|
||||
decodeValues bs len xs = decodeValues' bs (init_n bs) 0 (init_bias bs) len xs
|
||||
|
||||
decodeValues' :: BootString -> Int -> Int -> Int -> Int -> [Int] -> [(Char,Int)]
|
||||
decodeValues' bs n i bias len [] = []
|
||||
decodeValues' bs n i bias len xs
|
||||
| dn > maxBound - n = throwDyn OutOfRange
|
||||
| otherwise = (chr $ nn,nni):decodeValues' bs nn (nni+1)
|
||||
(adapt bs (ni-i) (len+1) (i==0)) (len+1) rst
|
||||
where
|
||||
(ni,rst) = decodeValue bs bias i (base bs) 1 xs
|
||||
(dn,nni) = ni `divMod` (len+1)
|
||||
nn = n + dn
|
||||
|
||||
insertDeltas :: [(Char,Int)] -> String -> String
|
||||
decodeValues' :: ByteSource m => BootString -> Int -> Int -> Int -> Int -> [Int] -> m [(Char,Int)]
|
||||
decodeValues' bs n i bias len [] = return []
|
||||
decodeValues' bs n i bias len xs = do
|
||||
(ni,rst) <- decodeValue bs bias i (base bs) 1 xs
|
||||
let (dn,nni) = ni `divMod` (len+1)
|
||||
let nn = n+dn
|
||||
if dn > maxBound - n
|
||||
then throwException OutOfRange
|
||||
else (do
|
||||
rest <- decodeValues' bs nn (nni+1) (adapt bs (ni-i) (len+1) (i==0)) (len+1) rst
|
||||
return $ (chr $ nn,nni):rest
|
||||
)
|
||||
|
||||
insertDeltas :: [(a,Int)] -> [a] -> [a]
|
||||
insertDeltas [] str = str
|
||||
insertDeltas ((c,p):xs) str = let
|
||||
(l,r) = splitAt p str
|
||||
in insertDeltas xs (l++[c]++r)
|
||||
|
||||
punyDecode :: String -> String -> String
|
||||
punyDecode base ext = insertDeltas (decodeValues punycode (length base) (map punyValue ext)) base
|
||||
|
||||
punyDecode :: ByteSource m => [Word8] -> [Word8] -> m String
|
||||
punyDecode base ext = do
|
||||
pvals <- mapM punyValue ext
|
||||
vals <- decodeValues punycode (length base) pvals
|
||||
return $ insertDeltas vals (map (chr.fromIntegral) base)
|
||||
|
||||
encodeValue :: BootString -> Int -> Int -> Int -> Int -> [Int]
|
||||
encodeValue bs bias delta n c = unfoldr (\(q,k,out) -> let
|
||||
t = getT bs k bias
|
||||
@ -136,15 +146,39 @@ encodeValues bs b l h bias delta n cps
|
||||
m = minimum (filter (>=n) cps)
|
||||
(ndelta,nh,nbias,outp) = encodeValues' bs b h bias (delta + (m - n)*(h + 1)) m cps
|
||||
|
||||
breakLast :: (a -> Bool) -> [a] -> Maybe ([a],[a])
|
||||
breakLast p xs = do
|
||||
(bf,af,ind) <- breakLast' 0 Nothing p xs
|
||||
return (bf,af)
|
||||
where
|
||||
breakLast' n r p [] = do
|
||||
v <- r
|
||||
return ([],[],v)
|
||||
breakLast' n r p (x:xs) = let res = if p x
|
||||
then breakLast' (n+1) (Just n) p xs
|
||||
else breakLast' (n+1) r p xs
|
||||
in do
|
||||
(bf,af,v) <- res
|
||||
return $ if n<v then (x:bf,af,v) else (bf,x:af,v)
|
||||
|
||||
|
||||
instance Encoding BootString where
|
||||
encode bs str = let
|
||||
(base,nbase) = partition (\c -> ord c < init_n bs) str
|
||||
b = length base
|
||||
res = map punyChar $
|
||||
encodeValues bs b (length str) b (init_bias bs) 0 (init_n bs) (map ord str)
|
||||
in pack $ if null base
|
||||
then res
|
||||
else base++"-"++res
|
||||
decode bs str = case break (=='-') (unpack str) of
|
||||
(base,'-':nbase) -> punyDecode base nbase
|
||||
(nbase,"") -> punyDecode "" nbase
|
||||
encodeChar _ c = error "Data.Encoding.BootString.encodeChar: Please use 'encode' for encoding BootStrings"
|
||||
decodeChar _ = error "Data.Encoding.BootString.decodeChar: Please use 'decode' for decoding BootStrings"
|
||||
encode bs str = let (base,nbase) = partition (\c -> ord c < init_n bs) str
|
||||
b = length base
|
||||
in do
|
||||
res <- mapM punyChar $ encodeValues bs b (length str) b (init_bias bs) 0 (init_n bs) (map ord str)
|
||||
when (not $ null base) $ do
|
||||
mapM_ (pushWord8.fromIntegral.ord) base
|
||||
pushWord8 (fromIntegral $ ord '-')
|
||||
mapM_ pushWord8 res
|
||||
decode bs = do
|
||||
wrds <- untilM sourceEmpty fetchWord8
|
||||
let m = fromIntegral $ ord '-'
|
||||
case breakLast (==m) wrds of
|
||||
Just ([],_) -> throwException (IllegalCharacter m)
|
||||
Just (base,_:nbase) -> case find (\w -> fromIntegral w > init_n bs) base of
|
||||
Nothing -> punyDecode base nbase
|
||||
Just ww -> throwException (IllegalCharacter ww)
|
||||
Nothing -> punyDecode [] wrds
|
||||
|
||||
195
Data/Encoding/ByteSink.hs
Normal file
195
Data/Encoding/ByteSink.hs
Normal file
@ -0,0 +1,195 @@
|
||||
{-# LANGUAGE FlexibleInstances,FlexibleContexts,MultiParamTypeClasses #-}
|
||||
module Data.Encoding.ByteSink where
|
||||
|
||||
import Data.Encoding.Exception
|
||||
|
||||
import Data.Binary.Put
|
||||
import Data.Bits
|
||||
import Data.Char
|
||||
import Data.Sequence
|
||||
import Data.Word
|
||||
import Data.Foldable (toList)
|
||||
import Control.Throws
|
||||
import Control.Exception.Extensible
|
||||
import Control.Monad.State
|
||||
import Control.Monad.Identity
|
||||
import Control.Monad.Reader
|
||||
import Foreign.Ptr (Ptr,plusPtr,minusPtr)
|
||||
import Foreign.Marshal.Alloc (mallocBytes,reallocBytes,free)
|
||||
import Foreign.Storable (poke)
|
||||
import System.IO
|
||||
import System.IO.Unsafe (unsafePerformIO)
|
||||
|
||||
import qualified Data.ByteString as BS
|
||||
import Data.ByteString.Unsafe (unsafePackCStringFinalizer)
|
||||
|
||||
class (Monad m,Throws EncodingException m) => ByteSink m where
|
||||
pushWord8 :: Word8 -> m ()
|
||||
pushWord16be :: Word16 -> m ()
|
||||
pushWord16be w = do
|
||||
pushWord8 (fromIntegral $ w `shiftR` 8)
|
||||
pushWord8 (fromIntegral $ w)
|
||||
pushWord16le :: Word16 -> m ()
|
||||
pushWord16le w = do
|
||||
pushWord8 (fromIntegral $ w)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 8)
|
||||
pushWord32be :: Word32 -> m ()
|
||||
pushWord32be w = do
|
||||
pushWord8 (fromIntegral $ w `shiftR` 24)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 16)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 8)
|
||||
pushWord8 (fromIntegral $ w)
|
||||
pushWord32le :: Word32 -> m ()
|
||||
pushWord32le w = do
|
||||
pushWord8 (fromIntegral $ w)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 8)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 16)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 24)
|
||||
pushWord64be :: Word64 -> m ()
|
||||
pushWord64be w = do
|
||||
pushWord8 (fromIntegral $ w `shiftR` 56)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 48)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 40)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 32)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 24)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 16)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 8)
|
||||
pushWord8 (fromIntegral $ w)
|
||||
pushWord64le :: Word64 -> m ()
|
||||
pushWord64le w = do
|
||||
pushWord8 (fromIntegral $ w)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 8)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 16)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 24)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 32)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 40)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 48)
|
||||
pushWord8 (fromIntegral $ w `shiftR` 56)
|
||||
|
||||
instance Throws EncodingException PutM where
|
||||
throwException = throw
|
||||
|
||||
instance ByteSink PutM where
|
||||
pushWord8 = putWord8
|
||||
pushWord16be = putWord16be
|
||||
pushWord16le = putWord16le
|
||||
pushWord32be = putWord32be
|
||||
pushWord32le = putWord32le
|
||||
pushWord64be = putWord64be
|
||||
pushWord64le = putWord64le
|
||||
|
||||
newtype PutME a = PutME (Either EncodingException (PutM (),a))
|
||||
|
||||
instance Monad PutME where
|
||||
return x = PutME $ Right (return (),x)
|
||||
(PutME x) >>= g = PutME $ do
|
||||
(m,r) <- x
|
||||
let (PutME ng) = g r
|
||||
case ng of
|
||||
Left err -> Left err
|
||||
Right (m',nr) -> Right (m>>m',nr)
|
||||
|
||||
instance Throws EncodingException PutME where
|
||||
throwException = PutME . Left
|
||||
|
||||
instance ByteSink PutME where
|
||||
pushWord8 w = PutME $ Right (putWord8 w,())
|
||||
pushWord16be w = PutME $ Right (putWord16be w,())
|
||||
pushWord16le w = PutME $ Right (putWord16le w,())
|
||||
pushWord32be w = PutME $ Right (putWord32be w,())
|
||||
pushWord32le w = PutME $ Right (putWord32le w,())
|
||||
pushWord64be w = PutME $ Right (putWord64be w,())
|
||||
pushWord64le w = PutME $ Right (putWord64le w,())
|
||||
|
||||
instance Monad (Either EncodingException) where
|
||||
return x = Right x
|
||||
Left err >>= g = Left err
|
||||
Right x >>= g = g x
|
||||
|
||||
instance Throws EncodingException (State (Seq Char)) where
|
||||
throwException = throw
|
||||
|
||||
instance ByteSink (State (Seq Char)) where
|
||||
pushWord8 x = modify (|> (chr $ fromIntegral x))
|
||||
|
||||
instance ByteSink (StateT (Seq Char) (Either EncodingException)) where
|
||||
pushWord8 x = modify (|> (chr $ fromIntegral x))
|
||||
|
||||
newtype StrictSink a = StrictS (Ptr Word8 -> Int -> Int -> IO (a,Ptr Word8,Int,Int))
|
||||
|
||||
instance Monad StrictSink where
|
||||
return x = StrictS $ \cstr pos max -> return (x,cstr,pos,max)
|
||||
(StrictS f) >>= g = StrictS (\cstr pos max -> do
|
||||
(res,ncstr,npos,nmax) <- f cstr pos max
|
||||
let StrictS g' = g res
|
||||
g' ncstr npos nmax
|
||||
)
|
||||
|
||||
instance Throws EncodingException StrictSink where
|
||||
throwException = throw
|
||||
|
||||
instance ByteSink StrictSink where
|
||||
pushWord8 x = StrictS (\cstr pos max -> do
|
||||
(ncstr,nmax) <- if pos < max
|
||||
then return (cstr,max)
|
||||
else (do
|
||||
let nmax = max + 32
|
||||
nptr <- reallocBytes cstr nmax
|
||||
return (nptr,nmax)
|
||||
)
|
||||
poke (ncstr `plusPtr` pos) x
|
||||
return ((),ncstr,pos+1,nmax)
|
||||
)
|
||||
|
||||
newtype StrictSinkE a = StrictSinkE (StrictSink (Either EncodingException a))
|
||||
|
||||
instance Monad StrictSinkE where
|
||||
return = StrictSinkE . return . Right
|
||||
(StrictSinkE s) >>= g = StrictSinkE $ do
|
||||
res <- s
|
||||
case res of
|
||||
Left err -> return $ Left err
|
||||
Right res' -> let StrictSinkE g' = g res'
|
||||
in g'
|
||||
|
||||
instance Throws EncodingException StrictSinkE where
|
||||
throwException = StrictSinkE . return . Left
|
||||
|
||||
instance ByteSink StrictSinkE where
|
||||
pushWord8 x = StrictSinkE $ pushWord8 x >>= return . Right
|
||||
|
||||
createStrictWithLen :: StrictSink a -> Int -> (a,BS.ByteString)
|
||||
createStrictWithLen (StrictS f) max = unsafePerformIO $ do
|
||||
ptr <- mallocBytes max
|
||||
(r,nptr,len,_) <- f ptr 0 max
|
||||
str <- unsafePackCStringFinalizer nptr len (free nptr)
|
||||
return (r,str)
|
||||
|
||||
createStrict :: StrictSink a -> (a,BS.ByteString)
|
||||
createStrict sink = createStrictWithLen sink 32
|
||||
|
||||
newtype StrictSinkExplicit a = StrictSinkExplicit (StrictSink (Either EncodingException a))
|
||||
|
||||
instance Monad StrictSinkExplicit where
|
||||
return = (StrictSinkExplicit).return.Right
|
||||
(StrictSinkExplicit sink) >>= f
|
||||
= StrictSinkExplicit (do
|
||||
res <- sink
|
||||
case res of
|
||||
Left err -> return $ Left err
|
||||
Right x -> let StrictSinkExplicit sink2 = f x
|
||||
in sink2)
|
||||
|
||||
instance Throws EncodingException StrictSinkExplicit where
|
||||
throwException = StrictSinkExplicit . return . Left
|
||||
|
||||
instance ByteSink StrictSinkExplicit where
|
||||
pushWord8 x = StrictSinkExplicit $ do
|
||||
pushWord8 x
|
||||
return $ Right ()
|
||||
|
||||
instance ByteSink (ReaderT Handle IO) where
|
||||
pushWord8 x = do
|
||||
h <- ask
|
||||
liftIO $ do
|
||||
hPutChar h (chr $ fromIntegral x)
|
||||
161
Data/Encoding/ByteSource.hs
Normal file
161
Data/Encoding/ByteSource.hs
Normal file
@ -0,0 +1,161 @@
|
||||
{-# LANGUAGE FlexibleInstances,FlexibleContexts,MultiParamTypeClasses #-}
|
||||
module Data.Encoding.ByteSource where
|
||||
|
||||
import Data.Encoding.Exception
|
||||
|
||||
import Data.Bits
|
||||
import Data.Binary.Get
|
||||
import Data.Char
|
||||
import Data.Word
|
||||
import Control.Monad.State
|
||||
import Control.Monad.Identity
|
||||
import Control.Monad.Reader
|
||||
import Control.Exception.Extensible
|
||||
import Control.Throws
|
||||
import qualified Data.ByteString as BS
|
||||
import qualified Data.ByteString.Lazy as LBS
|
||||
import System.IO
|
||||
|
||||
class (Monad m,Throws DecodingException m) => ByteSource m where
|
||||
sourceEmpty :: m Bool
|
||||
fetchWord8 :: m Word8
|
||||
fetchWord16be :: m Word16
|
||||
fetchWord16be = do
|
||||
w1 <- fetchWord8
|
||||
w2 <- fetchWord8
|
||||
return $ ((fromIntegral w1) `shiftL` 8)
|
||||
.|. (fromIntegral w2)
|
||||
fetchWord16le :: m Word16
|
||||
fetchWord16le = do
|
||||
w1 <- fetchWord8
|
||||
w2 <- fetchWord8
|
||||
return $ ((fromIntegral w2) `shiftL` 8)
|
||||
.|. (fromIntegral w1)
|
||||
fetchWord32be :: m Word32
|
||||
fetchWord32be = do
|
||||
w1 <- fetchWord8
|
||||
w2 <- fetchWord8
|
||||
w3 <- fetchWord8
|
||||
w4 <- fetchWord8
|
||||
return $ ((fromIntegral w1) `shiftL` 24)
|
||||
.|. ((fromIntegral w2) `shiftL` 16)
|
||||
.|. ((fromIntegral w3) `shiftL` 8)
|
||||
.|. (fromIntegral w4)
|
||||
fetchWord32le :: m Word32
|
||||
fetchWord32le = do
|
||||
w1 <- fetchWord8
|
||||
w2 <- fetchWord8
|
||||
w3 <- fetchWord8
|
||||
w4 <- fetchWord8
|
||||
return $ ((fromIntegral w4) `shiftL` 24)
|
||||
.|. ((fromIntegral w3) `shiftL` 16)
|
||||
.|. ((fromIntegral w2) `shiftL` 8)
|
||||
.|. (fromIntegral w1)
|
||||
fetchWord64be :: m Word64
|
||||
fetchWord64be = do
|
||||
w1 <- fetchWord8
|
||||
w2 <- fetchWord8
|
||||
w3 <- fetchWord8
|
||||
w4 <- fetchWord8
|
||||
w5 <- fetchWord8
|
||||
w6 <- fetchWord8
|
||||
w7 <- fetchWord8
|
||||
w8 <- fetchWord8
|
||||
return $ ((fromIntegral w1) `shiftL` 56)
|
||||
.|. ((fromIntegral w2) `shiftL` 48)
|
||||
.|. ((fromIntegral w3) `shiftL` 40)
|
||||
.|. ((fromIntegral w4) `shiftL` 32)
|
||||
.|. ((fromIntegral w5) `shiftL` 24)
|
||||
.|. ((fromIntegral w6) `shiftL` 16)
|
||||
.|. ((fromIntegral w7) `shiftL` 8)
|
||||
.|. (fromIntegral w8)
|
||||
fetchWord64le :: m Word64
|
||||
fetchWord64le = do
|
||||
w1 <- fetchWord8
|
||||
w2 <- fetchWord8
|
||||
w3 <- fetchWord8
|
||||
w4 <- fetchWord8
|
||||
w5 <- fetchWord8
|
||||
w6 <- fetchWord8
|
||||
w7 <- fetchWord8
|
||||
w8 <- fetchWord8
|
||||
return $ ((fromIntegral w8) `shiftL` 56)
|
||||
.|. ((fromIntegral w7) `shiftL` 48)
|
||||
.|. ((fromIntegral w6) `shiftL` 40)
|
||||
.|. ((fromIntegral w5) `shiftL` 32)
|
||||
.|. ((fromIntegral w4) `shiftL` 24)
|
||||
.|. ((fromIntegral w3) `shiftL` 16)
|
||||
.|. ((fromIntegral w2) `shiftL` 8)
|
||||
.|. (fromIntegral w1)
|
||||
|
||||
instance Throws DecodingException Get where
|
||||
throwException = throw
|
||||
|
||||
instance ByteSource Get where
|
||||
sourceEmpty = isEmpty
|
||||
fetchWord8 = getWord8
|
||||
fetchWord16be = getWord16be
|
||||
fetchWord16le = getWord16le
|
||||
fetchWord32be = getWord32be
|
||||
fetchWord32le = getWord32le
|
||||
fetchWord64be = getWord64be
|
||||
fetchWord64le = getWord64le
|
||||
|
||||
instance Throws DecodingException (State [Char]) where
|
||||
throwException = throw
|
||||
|
||||
instance ByteSource (State [Char]) where
|
||||
sourceEmpty = gets null
|
||||
fetchWord8 = do
|
||||
chs <- get
|
||||
case chs of
|
||||
[] -> throw UnexpectedEnd
|
||||
c:cs -> do
|
||||
put cs
|
||||
return (fromIntegral $ ord c)
|
||||
|
||||
instance Monad (Either DecodingException) where
|
||||
return = Right
|
||||
(Left err) >>= g = Left err
|
||||
(Right x) >>= g = g x
|
||||
|
||||
instance ByteSource (StateT [Char] (Either DecodingException)) where
|
||||
sourceEmpty = gets null
|
||||
fetchWord8 = do
|
||||
chs <- get
|
||||
case chs of
|
||||
[] -> throwException UnexpectedEnd --handleDecodingError UnexpectedEnd
|
||||
c:cs -> do
|
||||
put cs
|
||||
return (fromIntegral $ ord c)
|
||||
|
||||
instance Throws DecodingException (State BS.ByteString) where
|
||||
throwException = throw
|
||||
|
||||
instance ByteSource (State BS.ByteString) where
|
||||
sourceEmpty = gets BS.null
|
||||
fetchWord8 = State (\str -> case BS.uncons str of
|
||||
Nothing -> throw UnexpectedEnd
|
||||
Just (c,cs) -> (c,cs))
|
||||
|
||||
instance ByteSource (StateT BS.ByteString (Either DecodingException)) where
|
||||
sourceEmpty = gets BS.null
|
||||
fetchWord8 = StateT (\str -> case BS.uncons str of
|
||||
Nothing -> Left UnexpectedEnd
|
||||
Just ns -> Right ns)
|
||||
|
||||
instance ByteSource (StateT LBS.ByteString (Either DecodingException)) where
|
||||
sourceEmpty = gets LBS.null
|
||||
fetchWord8 = StateT (\str -> case LBS.uncons str of
|
||||
Nothing -> Left UnexpectedEnd
|
||||
Just ns -> Right ns)
|
||||
|
||||
instance ByteSource (ReaderT Handle IO) where
|
||||
sourceEmpty = do
|
||||
h <- ask
|
||||
liftIO (hIsEOF h)
|
||||
fetchWord8 = do
|
||||
h <- ask
|
||||
liftIO $ do
|
||||
ch <- hGetChar h
|
||||
return (fromIntegral $ ord ch)
|
||||
@ -1,35 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
{- | This module implements Windows Codepage number 1250 which encodes languages that use latin script.
|
||||
See <http://en.wikipedia.org/wiki/CP1250> for more information.
|
||||
-}
|
||||
module Data.Encoding.CP1250
|
||||
(CP1250(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.CP1250 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.ByteString (all)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup,all)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data CP1250 = CP1250 deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Encoding CP1250 where
|
||||
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c))
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "CP1250.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "CP1250.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "CP1250" "CP1250.TXT" )
|
||||
@ -1,35 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
{- | This module implements Windows Codepage number 1251 which encodes languages that use the cyrillic alphabet.
|
||||
See <http://en.wikipedia.org/wiki/CP1251> for more information.
|
||||
-}
|
||||
module Data.Encoding.CP1251
|
||||
(CP1251(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.CP1251 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.ByteString (all)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup,all)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data CP1251 = CP1251 deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Encoding CP1251 where
|
||||
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c))
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "CP1251.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "CP1251.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "CP1251" "CP1251.TXT" )
|
||||
@ -1,35 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
{- | This module implements Windows Codepage number 1252 which is a superset of ISO 8859-1.
|
||||
See <http://en.wikipedia.org/wiki/CP1252> for more information.
|
||||
-}
|
||||
module Data.Encoding.CP1252
|
||||
(CP1252(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.CP1252 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.ByteString (all)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup,all)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data CP1252 = CP1252 deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Encoding CP1252 where
|
||||
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c))
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "CP1252.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "CP1252.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "CP1252" "CP1252.TXT" )
|
||||
@ -1,35 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
{- | This module implements Windows Codepage number 1253 which encodes modern greek.
|
||||
See <http://en.wikipedia.org/wiki/CP1253> for more information.
|
||||
-}
|
||||
module Data.Encoding.CP1253
|
||||
(CP1253(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.CP1253 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.ByteString (all)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup,all)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data CP1253 = CP1253 deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Encoding CP1253 where
|
||||
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c))
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "CP1253.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "CP1253.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "CP1253" "CP1253.TXT" )
|
||||
@ -1,35 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
{- | This module implements Windows Codepage number 1254 which encodes the turkish language.
|
||||
See <http://en.wikipedia.org/wiki/CP1254> for more information.
|
||||
-}
|
||||
module Data.Encoding.CP1254
|
||||
(CP1254(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.CP1254 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.ByteString (all)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup,all)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data CP1254 = CP1254 deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Encoding CP1254 where
|
||||
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c))
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "CP1254.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "CP1254.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "CP1254" "CP1254.TXT" )
|
||||
@ -1,35 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
{- | This module implements Windows Codepage number 1255 which encodes the hebrew language.
|
||||
See <http://en.wikipedia.org/wiki/CP1255> for more information.
|
||||
-}
|
||||
module Data.Encoding.CP1255
|
||||
(CP1255(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.CP1255 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.ByteString (all)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup,all)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data CP1255 = CP1255 deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Encoding CP1255 where
|
||||
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c))
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "CP1255.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "CP1255.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "CP1255" "CP1255.TXT" )
|
||||
@ -1,35 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
{- | This module implements Windows Codepage number 1256 which encodes languages which use the arabic script.
|
||||
See <http://en.wikipedia.org/wiki/CP1256> for more information.
|
||||
-}
|
||||
module Data.Encoding.CP1256
|
||||
(CP1256(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.CP1256 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.ByteString (all)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup,all)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data CP1256 = CP1256 deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Encoding CP1256 where
|
||||
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c))
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "CP1256.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "CP1256.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "CP1256" "CP1256.TXT" )
|
||||
@ -1,35 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
{- | This module implements Windows Codepage number 1257 which encodes the estonian, latvian and lithuanian language.
|
||||
See <http://en.wikipedia.org/wiki/CP1257> for more information.
|
||||
-}
|
||||
module Data.Encoding.CP1257
|
||||
(CP1257(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.CP1257 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.ByteString (all)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup,all)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data CP1257 = CP1257 deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Encoding CP1257 where
|
||||
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c))
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "CP1257.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "CP1257.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "CP1257" "CP1257.TXT" )
|
||||
@ -1,35 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
{- | This module implements Windows Codepage number 1258 which encodes the vietnamese language.
|
||||
See <http://en.wikipedia.org/wiki/CP1258> for more information.
|
||||
-}
|
||||
module Data.Encoding.CP1258
|
||||
(CP1258(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.CP1258 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.ByteString (all)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup,all)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data CP1258 = CP1258 deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Encoding CP1258 where
|
||||
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c))
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
decodable _ = all (\w -> decodeArr!w /= '\xFFFD')
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "CP1258.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "CP1258.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "CP1258" "CP1258.TXT" )
|
||||
30
Data/Encoding/Exception.hs
Normal file
30
Data/Encoding/Exception.hs
Normal file
@ -0,0 +1,30 @@
|
||||
{-# LANGUAGE DeriveDataTypeable #-}
|
||||
module Data.Encoding.Exception where
|
||||
|
||||
import Control.Exception.Extensible
|
||||
import Data.Word
|
||||
import Data.Typeable
|
||||
import Control.Monad.Identity
|
||||
|
||||
-- | This exception type is thrown whenever something went wrong during the
|
||||
-- encoding-process.
|
||||
data EncodingException
|
||||
= HasNoRepresentation Char -- ^ Thrown if a specific character
|
||||
-- is not representable in an encoding.
|
||||
deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Exception EncodingException
|
||||
|
||||
-- | This exception type is thrown whenever something went wrong during the
|
||||
-- decoding-process.
|
||||
data DecodingException
|
||||
= IllegalCharacter Word8 -- ^ The sequence contained an illegal
|
||||
-- byte that couldn't be decoded.
|
||||
| UnexpectedEnd -- ^ more bytes were needed to allow a
|
||||
-- successfull decoding.
|
||||
| OutOfRange -- ^ the decoded value was out of the unicode range
|
||||
| IllegalRepresentation [Word8] -- ^ The character sequence encodes a
|
||||
-- character, but is illegal.
|
||||
deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Exception DecodingException
|
||||
@ -7,11 +7,14 @@ module Data.Encoding.GB18030
|
||||
(GB18030(..))
|
||||
where
|
||||
|
||||
import Control.Exception
|
||||
import Control.Throws
|
||||
import Data.Char (chr,ord)
|
||||
import Data.Word
|
||||
import Data.Bits
|
||||
import Data.Encoding.Base
|
||||
import Data.Encoding.ByteSource
|
||||
import Data.Encoding.ByteSink
|
||||
import Data.Encoding.Exception
|
||||
import Data.ByteString (ByteString)
|
||||
import qualified Data.ByteString as BS
|
||||
import Data.Typeable
|
||||
@ -27,111 +30,103 @@ import Data.Encoding.GB18030Data
|
||||
data GB18030 = GB18030 deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Encoding GB18030 where
|
||||
encode _ = encodeMultibyte encodeGB
|
||||
encodeLazy _ = encodeMultibyteLazy encodeGB
|
||||
decode _ = decodeMultibyte decodeGB
|
||||
decodeLazy _ = decodeMultibyteLazy decodeGB
|
||||
encodable _ ch = ch <= '\x10FFFF'
|
||||
decodable _ = checkValidity
|
||||
|
||||
data DecodingState
|
||||
= Valid
|
||||
| Invalid
|
||||
| Second
|
||||
| Third
|
||||
| Fourth
|
||||
deriving Eq
|
||||
|
||||
checkValidity :: ByteString -> Bool
|
||||
checkValidity bs = BS.foldl' (\st w -> case st of
|
||||
Invalid -> Invalid
|
||||
Valid | w<=0x80 -> Valid
|
||||
| w<=0xFE -> Second
|
||||
| otherwise -> Invalid
|
||||
Second | w< 0x30 -> Invalid
|
||||
| w<=0x39 -> Third
|
||||
| w<=0x7E -> Valid
|
||||
| w==0x7F -> Invalid
|
||||
| w<=0xFE -> Valid
|
||||
| otherwise -> Invalid
|
||||
Third | w< 0x81 -> Invalid
|
||||
| w<=0xFE -> Fourth
|
||||
| otherwise -> Invalid
|
||||
Fourth | w< 0x30 -> Invalid
|
||||
| w<=0x39 -> Valid
|
||||
| otherwise -> Invalid
|
||||
) Valid bs == Valid
|
||||
|
||||
{- How this works: The nested if-structures form an binary tree over the
|
||||
- encoding range.
|
||||
-}
|
||||
encodeGB :: Char -> (Word8,EncodeState)
|
||||
encodeGB ch = if ch<='\x4946' -- 1
|
||||
then (if ch<='\x4055' -- 2
|
||||
then (if ch<='\x2E80' -- 3
|
||||
then (if ch<='\x200F' -- 4
|
||||
then (if ch<'\x0452'
|
||||
then arr 0x0000 arr1
|
||||
else range range1)
|
||||
else (if ch<'\x2643'
|
||||
then arr 0x2010 arr2
|
||||
decodeChar _ = do
|
||||
w1 <- fetchWord8
|
||||
case () of
|
||||
_
|
||||
| w1 <= 0x80 -> return (chr $ fromIntegral w1) -- it's ascii
|
||||
| w1 <= 0xFE -> do
|
||||
w2 <- fetchWord8
|
||||
case () of
|
||||
_
|
||||
| w2 < 0x30 -> throwException (IllegalCharacter w2)
|
||||
| w2 <= 0x39 -> do
|
||||
w3 <- fetchWord8
|
||||
case () of
|
||||
_
|
||||
| w3 < 0x81 -> throwException (IllegalCharacter w3)
|
||||
| w3 <= 0xFE -> do
|
||||
w4 <- fetchWord8
|
||||
case () of
|
||||
_
|
||||
| w4 < 0x30 -> throwException (IllegalCharacter w4)
|
||||
| w4 <= 0x39 -> decodeGBFour $ linear w1 w2 w3 w4
|
||||
| otherwise -> throwException (IllegalCharacter w4)
|
||||
| otherwise -> throwException (IllegalCharacter w3)
|
||||
| w2 <= 0x7E -> return $ decodeGBTwo $ linear2 w1 w2
|
||||
| w2 == 0x7F -> throwException (IllegalCharacter w2)
|
||||
| w2 <= 0xFE -> return $ decodeGBTwo $ linear2 w1 w2
|
||||
| otherwise -> throwException (IllegalCharacter w2)
|
||||
| otherwise -> throwException (IllegalCharacter w1)
|
||||
{- How this works: The nested if-structures form an binary tree over the
|
||||
- encoding range.
|
||||
-}
|
||||
encodeChar _ ch = if ch<='\x4946' -- 1
|
||||
then (if ch<='\x4055' -- 2
|
||||
then (if ch<='\x2E80' -- 3
|
||||
then (if ch<='\x200F' -- 4
|
||||
then (if ch<'\x0452'
|
||||
then arr 0x0000 arr1
|
||||
else range range1)
|
||||
else (if ch<'\x2643'
|
||||
then arr 0x2010 arr2
|
||||
else range range2))
|
||||
else (if ch<='\x3917' -- 4
|
||||
then (if ch<'\x361B'
|
||||
then arr 0x2E81 arr3
|
||||
else range range3)
|
||||
else (if ch<'\x3CE1'
|
||||
then arr 0x3918 arr4
|
||||
else range range4)))
|
||||
then (if ch<'\x361B'
|
||||
then arr 0x2E81 arr3
|
||||
else range range3)
|
||||
else (if ch<'\x3CE1'
|
||||
then arr 0x3918 arr4
|
||||
else range range4)))
|
||||
else (if ch<='\x464B' -- 3
|
||||
then (if ch<='\x4336' -- 4
|
||||
then (if ch<'\x4160'
|
||||
then arr 0x4056 arr5
|
||||
else range range5)
|
||||
else (if ch<'\x44D7'
|
||||
then arr 0x4337 arr6
|
||||
else range range6))
|
||||
then (if ch<='\x4336' -- 4
|
||||
then (if ch<'\x4160'
|
||||
then arr 0x4056 arr5
|
||||
else range range5)
|
||||
else (if ch<'\x44D7'
|
||||
then arr 0x4337 arr6
|
||||
else range range6))
|
||||
else (if ch<'\x478E'
|
||||
then arr 0x464C arr7
|
||||
else range range7)))
|
||||
then arr 0x464C arr7
|
||||
else range range7)))
|
||||
else (if ch<='\xF92B' -- 2
|
||||
then (if ch<='\xD7FF' -- 3
|
||||
then (if ch<='\x4C76' -- 4
|
||||
then (if ch<'\x49B8'
|
||||
then arr 0x4947 arr8
|
||||
else range range8)
|
||||
else (if ch<'\x9FA6'
|
||||
then arr 0x4C77 arr9
|
||||
else range range9))
|
||||
then (if ch<='\xD7FF' -- 3
|
||||
then (if ch<='\x4C76' -- 4
|
||||
then (if ch<'\x49B8'
|
||||
then arr 0x4947 arr8
|
||||
else range range8)
|
||||
else (if ch<'\x9FA6'
|
||||
then arr 0x4C77 arr9
|
||||
else range range9))
|
||||
else (if ch<'\xE865'
|
||||
then arr 0xD800 arr10
|
||||
else range range10))
|
||||
then arr 0xD800 arr10
|
||||
else range range10))
|
||||
else (if ch<='\xFFFF' -- 3
|
||||
then (if ch<='\xFE2F' -- 4
|
||||
then (if ch<'\xFA2A'
|
||||
then arr 0xF92C arr11
|
||||
else range range11)
|
||||
else (if ch<'\xFFE6'
|
||||
then arr 0xFE30 arr12
|
||||
else range range12))
|
||||
then (if ch<='\xFE2F' -- 4
|
||||
then (if ch<'\xFA2A'
|
||||
then arr 0xF92C arr11
|
||||
else range range11)
|
||||
else (if ch<'\xFFE6'
|
||||
then arr 0xFE30 arr12
|
||||
else range range12))
|
||||
else (if ch<='\x10FFFF' -- 4
|
||||
then range range13
|
||||
else throwDyn (HasNoRepresentation ch))))
|
||||
where
|
||||
range r = let
|
||||
(w1,w2,w3,w4) = delinear (ord ch + r)
|
||||
in (w1,Put3 w2 w3 w4)
|
||||
arr off a = let
|
||||
ind = (ord ch - off)*5
|
||||
w1 = unsafeIndex a (ind+1)
|
||||
w2 = unsafeIndex a (ind+2)
|
||||
w3 = unsafeIndex a (ind+3)
|
||||
w4 = unsafeIndex a (ind+4)
|
||||
in (w1,case unsafeIndex a ind of
|
||||
1 -> Done
|
||||
2 -> Put1 w2
|
||||
3 -> Put2 w2 w3
|
||||
4 -> Put3 w2 w3 w4)
|
||||
then range range13
|
||||
else throwException (HasNoRepresentation ch))))
|
||||
where
|
||||
range r = let (w1,w2,w3,w4) = delinear (ord ch + r)
|
||||
in pushWord8 w1 >> pushWord8 w2 >> pushWord8 w3 >> pushWord8 w4
|
||||
arr off a = let ind = (ord ch - off)*5
|
||||
w1 = unsafeIndex a (ind+1)
|
||||
w2 = unsafeIndex a (ind+2)
|
||||
w3 = unsafeIndex a (ind+3)
|
||||
w4 = unsafeIndex a (ind+4)
|
||||
in do
|
||||
pushWord8 w1
|
||||
case unsafeIndex a ind of
|
||||
1 -> return ()
|
||||
2 -> pushWord8 w2
|
||||
3 -> pushWord8 w2 >> pushWord8 w3
|
||||
4 -> pushWord8 w2 >> pushWord8 w3 >> pushWord8 w4
|
||||
|
||||
linear :: Word8 -> Word8 -> Word8 -> Word8 -> Int
|
||||
linear w1 w2 w3 w4
|
||||
@ -157,32 +152,6 @@ delinear n = let
|
||||
,fromIntegral w3+0x81
|
||||
,fromIntegral w4+0x30)
|
||||
|
||||
decodeGB :: [Word8] -> (Char,[Word8])
|
||||
decodeGB (w1:rst)
|
||||
| w1 <=0x80 = (chr $ fromIntegral w1,rst) -- it's ascii
|
||||
| w1 <=0xFE = case rst of
|
||||
w2:rst2
|
||||
| w2 < 0x30 -> throwDyn (IllegalCharacter w2)
|
||||
| w2 <=0x39 -> case rst2 of
|
||||
w3:rst3
|
||||
| w3 < 0x81 -> throwDyn (IllegalCharacter w3)
|
||||
| w3 <=0xFE -> case rst3 of
|
||||
w4:rst4
|
||||
| w4 < 0x30 -> throwDyn (IllegalCharacter w4)
|
||||
| w4 <=0x39 -> let
|
||||
v = linear w1 w2 w3 w4
|
||||
in (decodeGBFour v,rst4)
|
||||
| otherwise -> throwDyn (IllegalCharacter w4)
|
||||
[] -> throwDyn UnexpectedEnd
|
||||
| otherwise -> throwDyn (IllegalCharacter w3)
|
||||
[] -> throwDyn UnexpectedEnd
|
||||
| w2 <=0x7E -> (decodeGBTwo (linear2 w1 w2),rst2)
|
||||
| w2 ==0x7F -> throwDyn (IllegalCharacter w2)
|
||||
| w2 <=0xFE -> (decodeGBTwo (linear2 w1 w2),rst2)
|
||||
| otherwise -> throwDyn (IllegalCharacter w2)
|
||||
[] -> throwDyn UnexpectedEnd
|
||||
| otherwise = throwDyn (IllegalCharacter w1)
|
||||
|
||||
decodeGBTwo :: Int -> Char
|
||||
decodeGBTwo n = let
|
||||
rn = n*2
|
||||
@ -190,7 +159,7 @@ decodeGBTwo n = let
|
||||
w2 = unsafeIndex rrarr (rn+1)
|
||||
in chr $ ((fromIntegral w1) `shiftL` 8) .|. (fromIntegral w2)
|
||||
|
||||
decodeGBFour :: Int -> Char
|
||||
decodeGBFour :: ByteSource m => Int -> m Char
|
||||
decodeGBFour v = if v<=17858 -- 1
|
||||
then (if v<=15582 -- 2
|
||||
then (if v<=11328 -- 3
|
||||
@ -241,15 +210,15 @@ decodeGBFour v = if v<=17858 -- 1
|
||||
else range range12))
|
||||
else (if v<=1237575 && v>=189000
|
||||
then range range13
|
||||
else throwDyn OutOfRange)))
|
||||
else throwException OutOfRange)))
|
||||
where
|
||||
arr off a = let
|
||||
v' = (v-off)*2
|
||||
w1 = unsafeIndex a v'
|
||||
w2 = unsafeIndex a (v'+1)
|
||||
in chr $ ((fromIntegral w1) `shiftL` 8)
|
||||
in return $ chr $ ((fromIntegral w1) `shiftL` 8)
|
||||
.|. (fromIntegral w2)
|
||||
range r = chr (v-r)
|
||||
range r = return $ chr (v-r)
|
||||
|
||||
range1,range2,range3,range4,range5,range6,range7,range8,range9,range10,range11,range12,range13 :: Int
|
||||
range1 = -286
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
{-# LANGUAGE CPP, MagicHash #-}
|
||||
{-# LANGUAGE CPP,MagicHash #-}
|
||||
module Data.Encoding.GB18030Data where
|
||||
|
||||
import Data.ByteString(ByteString)
|
||||
|
||||
@ -1,34 +1,75 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell #-}
|
||||
{- This module is used to create arrays from lists in template haskell -}
|
||||
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.Helper.Template where
|
||||
|
||||
import Data.Encoding.Base
|
||||
import Data.Char
|
||||
import Data.Word
|
||||
import Data.Array.IArray (Array,array)
|
||||
import Data.Maybe (mapMaybe)
|
||||
import Data.Map as Map (fromList,lookup)
|
||||
import Data.Array
|
||||
import Language.Haskell.TH
|
||||
|
||||
createCharArray :: [(Integer,Char)] -> Integer -> Integer -> Q Exp
|
||||
#ifndef __HADDOCK__
|
||||
createCharArray lst = createArray (map (\(x,y) -> (x,LitE $ CharL y)) lst)
|
||||
#endif
|
||||
makeISOInstance :: String -> FilePath -> Q [Dec]
|
||||
makeISOInstance name file = do
|
||||
let rname = mkName name
|
||||
trans <- runIO (readTranslation file)
|
||||
mp <- encodingMap (validTranslations trans)
|
||||
arr <- decodingArray (fillTranslations trans)
|
||||
return [ DataD [] rname [] [NormalC rname []] [''Show]
|
||||
, InstanceD [] (AppT (ConT ''Encoding) (ConT rname))
|
||||
[FunD 'encodeChar
|
||||
[Clause [WildP] (NormalB $ AppE (VarE 'encodeWithMap) (VarE $ mkName "mp"))
|
||||
[ValD (VarP $ mkName "mp") (NormalB mp) []]
|
||||
]
|
||||
,FunD 'decodeChar
|
||||
[Clause [WildP] (NormalB $ AppE (VarE 'decodeWithArray) (VarE $ mkName "arr"))
|
||||
[ValD (VarP $ mkName "arr") (NormalB arr) []]
|
||||
]
|
||||
]
|
||||
]
|
||||
|
||||
createCharArray :: [(Integer,Maybe Char)] -> Integer -> Integer -> Q Exp
|
||||
createCharArray lst = createArray (map (\(x,y) -> (x,case y of
|
||||
Nothing -> ConE 'Nothing
|
||||
Just c -> AppE (ConE 'Just) (LitE $ CharL c))
|
||||
) lst)
|
||||
|
||||
|
||||
createArray :: [(Integer,Exp)] -> Integer -> Integer -> Q Exp
|
||||
#ifndef __HADDOCK__
|
||||
createArray lst from to = return $ AppE
|
||||
(AppE
|
||||
(VarE 'array)
|
||||
(TupE [LitE $ IntegerL from,LitE $ IntegerL to]))
|
||||
(ListE [ TupE [LitE $ IntegerL x,y]
|
||||
| (x,y) <- lst ])
|
||||
#endif
|
||||
|
||||
xmlArray :: [(Char,[Word8])] -> Integer -> Integer -> Q Exp
|
||||
#ifndef __HADDOCK__
|
||||
xmlArray lst l u = do
|
||||
let trans = map (\(ch,bin) ->
|
||||
(toInteger $ ord ch
|
||||
,TupE [LitE $ IntegerL (toInteger $ length bin),TupE $ map (\b -> LitE $ IntegerL (fromIntegral b)) bin ++ replicate (4-length bin) (LitE $ IntegerL 0)]
|
||||
)) (filter (\(c,_) -> ord c <= fromInteger u && ord c >= fromInteger l) lst)
|
||||
createArray trans l u
|
||||
#endif
|
||||
decodingArray :: [(Integer,Maybe Char)] -> Q Exp
|
||||
decodingArray trans = createCharArray trans 0 255
|
||||
|
||||
encodingMap :: [(Integer,Char)] -> Q Exp
|
||||
encodingMap trans = return $ AppE
|
||||
(VarE 'fromList)
|
||||
(ListE [ TupE [LitE $ CharL to,LitE $ IntegerL from]
|
||||
| (from,to) <- trans])
|
||||
|
||||
readTranslation :: FilePath -> IO [(Integer,Maybe Char)]
|
||||
readTranslation file = do
|
||||
cont <- readFile file
|
||||
return $ mapMaybe (\ln -> case ln of
|
||||
[] -> Nothing
|
||||
('#':xs) -> Nothing
|
||||
_ -> case words ln of
|
||||
(src:"#UNDEFINED":_) -> Just (read src,Nothing) -- XXX: Find a better way to handle this
|
||||
(src:trg:_) -> Just (read src,Just $ chr $ read trg)
|
||||
_ -> Nothing
|
||||
) (lines cont)
|
||||
|
||||
fillTranslations :: [(Integer,Maybe Char)] -> [(Integer,Maybe Char)]
|
||||
fillTranslations = fillTranslations' (-1)
|
||||
where
|
||||
fillTranslations' n ((n',c):cs) = (map (\i -> (i,Nothing)) [n+1..n'-1])++((n',c):fillTranslations' n' cs)
|
||||
fillTranslations' n [] = map (\i -> (i,Nothing)) [n+1..255]
|
||||
|
||||
validTranslations :: [(Integer,Maybe Char)] -> [(Integer,Char)]
|
||||
validTranslations = mapMaybe (\(n,mc) -> case mc of
|
||||
Nothing -> Nothing
|
||||
Just c -> Just (n,c))
|
||||
@ -1,26 +1,18 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
{- | Implements ISO\/IEC 8859-1 alias latin-1 encoding. See
|
||||
<http://en.wikipedia.org/wiki/ISO/IEC_8859-1> for further information.
|
||||
-}
|
||||
module Data.Encoding.ISO88591
|
||||
(ISO88591(..)
|
||||
) where
|
||||
module Data.Encoding.ISO88591 where
|
||||
|
||||
import Control.Throws
|
||||
import Data.Encoding.Base
|
||||
import Data.Char(ord,chr)
|
||||
import Data.Word
|
||||
import Control.Exception
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Exception
|
||||
import Data.Encoding.ByteSource
|
||||
import Data.Encoding.ByteSink
|
||||
import Data.Char (ord,chr)
|
||||
|
||||
data ISO88591 = ISO88591 deriving (Eq,Show,Typeable)
|
||||
|
||||
enc :: Char -> Word8
|
||||
enc c = if ord c < 256
|
||||
then fromIntegral $ ord c
|
||||
else throwDyn (HasNoRepresentation c)
|
||||
data ISO88591 = ISO88591 deriving (Show)
|
||||
|
||||
instance Encoding ISO88591 where
|
||||
encode _ = encodeSinglebyte enc
|
||||
encodeLazy _ = encodeSinglebyteLazy enc
|
||||
encodable _ c = ord c < 256
|
||||
decode _ = decodeSinglebyte (chr.fromIntegral)
|
||||
encodeChar _ c
|
||||
| c > '\255' = throwException (HasNoRepresentation c)
|
||||
| otherwise = pushWord8 (fromIntegral $ ord c)
|
||||
decodeChar _ = do
|
||||
w <- fetchWord8
|
||||
return (chr $ fromIntegral w)
|
||||
@ -1,34 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
module Data.Encoding.ISO885910
|
||||
(ISO885910(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.ISO885910 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data ISO885910 = ISO885910 deriving (Eq,Show,Typeable)
|
||||
|
||||
enc :: Char -> Word8
|
||||
enc c = case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c)
|
||||
|
||||
instance Encoding ISO885910 where
|
||||
encode _ = encodeSinglebyte enc
|
||||
encodeLazy _ = encodeSinglebyteLazy enc
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "8859-10.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "8859-10.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "ISO885910" "8859-10.TXT" )
|
||||
@ -1,34 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
module Data.Encoding.ISO885911
|
||||
(ISO885911(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.ISO885911 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data ISO885911 = ISO885911 deriving (Eq,Show,Typeable)
|
||||
|
||||
enc :: Char -> Word8
|
||||
enc c = case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c)
|
||||
|
||||
instance Encoding ISO885911 where
|
||||
encode _ = encodeSinglebyte enc
|
||||
encodeLazy _ = encodeSinglebyteLazy enc
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "8859-11.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "8859-11.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "ISO885911" "8859-11.TXT" )
|
||||
@ -1,34 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
module Data.Encoding.ISO885913
|
||||
(ISO885913(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.ISO885913 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data ISO885913 = ISO885913 deriving (Eq,Show,Typeable)
|
||||
|
||||
enc :: Char -> Word8
|
||||
enc c = case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c)
|
||||
|
||||
instance Encoding ISO885913 where
|
||||
encode _ = encodeSinglebyte enc
|
||||
encodeLazy _ = encodeSinglebyteLazy enc
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "8859-13.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "8859-13.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "ISO885913" "8859-13.TXT" )
|
||||
@ -1,34 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
module Data.Encoding.ISO885914
|
||||
(ISO885914(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.ISO885914 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data ISO885914 = ISO885914 deriving (Eq,Show,Typeable)
|
||||
|
||||
enc :: Char -> Word8
|
||||
enc c = case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c)
|
||||
|
||||
instance Encoding ISO885914 where
|
||||
encode _ = encodeSinglebyte enc
|
||||
encodeLazy _ = encodeSinglebyteLazy enc
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "8859-14.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "8859-14.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "ISO885914" "8859-14.TXT" )
|
||||
@ -1,34 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
module Data.Encoding.ISO885915
|
||||
(ISO885915(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.ISO885915 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data ISO885915 = ISO885915 deriving (Eq,Show,Typeable)
|
||||
|
||||
enc :: Char -> Word8
|
||||
enc c = case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c)
|
||||
|
||||
instance Encoding ISO885915 where
|
||||
encode _ = encodeSinglebyte enc
|
||||
encodeLazy _ = encodeSinglebyteLazy enc
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "8859-15.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "8859-15.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "ISO885915" "8859-15.TXT" )
|
||||
@ -1,34 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
module Data.Encoding.ISO885916
|
||||
(ISO885916(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.ISO885916 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data ISO885916 = ISO885916 deriving (Eq,Show,Typeable)
|
||||
|
||||
enc :: Char -> Word8
|
||||
enc c = case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c)
|
||||
|
||||
instance Encoding ISO885916 where
|
||||
encode _ = encodeSinglebyte enc
|
||||
encodeLazy _ = encodeSinglebyteLazy enc
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "8859-16.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "8859-16.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "ISO885916" "8859-16.TXT" )
|
||||
@ -1,38 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
{- | Implements ISO\/IEC 8859-2 alias latin-2 encoding. See
|
||||
<http://en.wikipedia.org/wiki/ISO/IEC_8859-2> for further informations.
|
||||
-}
|
||||
module Data.Encoding.ISO88592
|
||||
(ISO88592(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.ISO88592 where
|
||||
|
||||
import Data.Array
|
||||
import Data.Map hiding ((!))
|
||||
import Data.Word
|
||||
import Data.Encoding.Base
|
||||
import Data.ByteString hiding (length,map)
|
||||
import Prelude hiding (lookup,all)
|
||||
import Control.Exception
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data ISO88592 = ISO88592 deriving (Eq,Show,Typeable)
|
||||
|
||||
enc :: Char -> Word8
|
||||
enc c = case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c)
|
||||
|
||||
instance Encoding ISO88592 where
|
||||
encode _ = encodeSinglebyte enc
|
||||
encodeLazy _ = encodeSinglebyteLazy enc
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (\w -> decodeArr!w)
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "8859-2.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "8859-2.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "ISO88592" "8859-2.TXT" )
|
||||
@ -1,36 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
{- | Implements ISO 8859-3 encoding, alias latin-3, alias south european
|
||||
-}
|
||||
module Data.Encoding.ISO88593
|
||||
(ISO88593(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.ISO88593 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data ISO88593 = ISO88593 deriving (Eq,Show,Typeable)
|
||||
|
||||
enc :: Char -> Word8
|
||||
enc c = case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c)
|
||||
|
||||
instance Encoding ISO88593 where
|
||||
encode _ = encodeSinglebyte enc
|
||||
encodeLazy _ = encodeSinglebyteLazy enc
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "8859-3.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "8859-3.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "ISO88593" "8859-3.TXT" )
|
||||
@ -1,34 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
module Data.Encoding.ISO88594
|
||||
(ISO88594(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.ISO88594 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data ISO88594 = ISO88594 deriving (Eq,Show,Typeable)
|
||||
|
||||
enc :: Char -> Word8
|
||||
enc c = case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c)
|
||||
|
||||
instance Encoding ISO88594 where
|
||||
encode _ = encodeSinglebyte enc
|
||||
encodeLazy _ = encodeSinglebyteLazy enc
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "8859-4.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "8859-4.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "ISO88594" "8859-4.TXT" )
|
||||
@ -1,30 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
module Data.Encoding.ISO88595
|
||||
(ISO88595(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.ISO88595 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data ISO88595 = ISO88595 deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Encoding ISO88595 where
|
||||
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c))
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "8859-5.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "8859-5.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "ISO88595" "8859-5.TXT" )
|
||||
@ -1,30 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
module Data.Encoding.ISO88596
|
||||
(ISO88596(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.ISO88596 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data ISO88596 = ISO88596 deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Encoding ISO88596 where
|
||||
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c))
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "8859-6.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "8859-6.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "ISO88596" "8859-6.TXT" )
|
||||
@ -1,30 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
module Data.Encoding.ISO88597
|
||||
(ISO88597(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.ISO88597 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data ISO88597 = ISO88597 deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Encoding ISO88597 where
|
||||
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c))
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "8859-7.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "8859-7.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "ISO88597" "8859-7.TXT" )
|
||||
@ -1,30 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
module Data.Encoding.ISO88598
|
||||
(ISO88598(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.ISO88598 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data ISO88598 = ISO88598 deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Encoding ISO88598 where
|
||||
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c))
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "8859-8.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "8859-8.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "ISO88598" "8859-8.TXT" )
|
||||
@ -1,30 +1,6 @@
|
||||
{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
|
||||
module Data.Encoding.ISO88599
|
||||
(ISO88599(..)) where
|
||||
{-# LANGUAGE TemplateHaskell #-}
|
||||
module Data.Encoding.ISO88599 where
|
||||
|
||||
import Data.Array ((!),Array)
|
||||
import Data.Word (Word8)
|
||||
import Data.Map (Map,lookup,member)
|
||||
import Data.Encoding.Base
|
||||
import Prelude hiding (lookup)
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Typeable
|
||||
import Data.Encoding.Helper.Template (makeISOInstance)
|
||||
|
||||
data ISO88599 = ISO88599 deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Encoding ISO88599 where
|
||||
encode _ = encodeSinglebyte (\c -> case lookup c encodeMap of
|
||||
Just v -> v
|
||||
Nothing -> throwDyn (HasNoRepresentation c))
|
||||
encodable _ c = member c encodeMap
|
||||
decode _ = decodeSinglebyte (decodeArr!)
|
||||
|
||||
decodeArr :: Array Word8 Char
|
||||
#ifndef __HADDOCK__
|
||||
decodeArr = $(decodingArray "8859-9.TXT")
|
||||
#endif
|
||||
|
||||
encodeMap :: Map Char Word8
|
||||
#ifndef __HADDOCK__
|
||||
encodeMap = $(encodingMap "8859-9.TXT")
|
||||
#endif
|
||||
$( makeISOInstance "ISO88599" "8859-9.TXT" )
|
||||
@ -5,16 +5,18 @@
|
||||
module Data.Encoding.KOI8R
|
||||
(KOI8R(..)) where
|
||||
|
||||
import Control.Exception (throwDyn)
|
||||
import Control.Throws
|
||||
import Data.Array.Unboxed
|
||||
import Data.Char (ord,chr)
|
||||
import qualified Data.ByteString.Lazy as Lazy
|
||||
import Data.Map hiding (map,(!))
|
||||
import Data.Word
|
||||
import Prelude hiding (lookup)
|
||||
import Data.Typeable
|
||||
|
||||
import Data.Encoding.Base
|
||||
import Data.Encoding.ByteSource
|
||||
import Data.Encoding.ByteSink
|
||||
import Data.Encoding.Exception
|
||||
|
||||
data KOI8R = KOI8R deriving (Eq,Show,Typeable)
|
||||
|
||||
@ -44,22 +46,14 @@ koi8rList =
|
||||
,'\x042c','\x042b','\x0417','\x0428','\x042d','\x0429','\x0427','\x042a'
|
||||
]
|
||||
|
||||
koi8rDecode :: Word8 -> Char
|
||||
koi8rDecode ch
|
||||
| ch < 128 = chr $ fromIntegral ch
|
||||
| otherwise = koi8rArr!ch
|
||||
|
||||
koi8rEncode :: Char -> Word8
|
||||
koi8rEncode ch
|
||||
| ch < '\128' = fromIntegral $ ord ch
|
||||
| otherwise = case lookup ch koi8rMap of
|
||||
Just w -> w
|
||||
Nothing -> throwDyn (HasNoRepresentation ch)
|
||||
|
||||
instance Encoding KOI8R where
|
||||
encode _ = encodeSinglebyte koi8rEncode
|
||||
encodeLazy _ = encodeSinglebyteLazy koi8rEncode
|
||||
encodable _ c = (c < '\128') || (member c koi8rMap)
|
||||
decode _ = decodeSinglebyte koi8rDecode
|
||||
decodeLazy _ str = concatMap (decodeSinglebyte koi8rDecode) (Lazy.toChunks str)
|
||||
decodable _ = const True
|
||||
decodeChar _ = do
|
||||
w <- fetchWord8
|
||||
if w < 128
|
||||
then return $ chr $ fromIntegral w
|
||||
else return $ koi8rArr!w
|
||||
encodeChar _ ch
|
||||
| ch < '\128' = pushWord8 $ fromIntegral $ ord ch
|
||||
| otherwise = case lookup ch koi8rMap of
|
||||
Just w -> pushWord8 w
|
||||
Nothing -> throwException (HasNoRepresentation ch)
|
||||
@ -5,13 +5,16 @@
|
||||
module Data.Encoding.KOI8U
|
||||
(KOI8U(..)) where
|
||||
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Encoding.Base
|
||||
import Data.Encoding.ByteSource
|
||||
import Data.Encoding.ByteSink
|
||||
import Data.Encoding.Exception
|
||||
|
||||
import Control.Throws
|
||||
import Data.Word
|
||||
import Data.Array.Unboxed
|
||||
import Data.Encoding.Base
|
||||
import Data.Char (chr,ord)
|
||||
import Data.Map (Map,fromList,lookup,member)
|
||||
import qualified Data.ByteString.Lazy as Lazy
|
||||
import Prelude hiding (lookup)
|
||||
import Data.Typeable
|
||||
|
||||
@ -43,22 +46,14 @@ koi8uList =
|
||||
,'\x042c','\x042b','\x0417','\x0428','\x042d','\x0429','\x0427','\x042a'
|
||||
]
|
||||
|
||||
koi8uDecode :: Word8 -> Char
|
||||
koi8uDecode ch
|
||||
| ch < 128 = chr $ fromIntegral ch
|
||||
| otherwise = koi8uArr!ch
|
||||
|
||||
koi8uEncode :: Char -> Word8
|
||||
koi8uEncode ch
|
||||
| ch < '\128' = fromIntegral $ ord ch
|
||||
| otherwise = case lookup ch koi8uMap of
|
||||
Just w -> w
|
||||
Nothing -> throwDyn (HasNoRepresentation ch)
|
||||
|
||||
instance Encoding KOI8U where
|
||||
encode _ = encodeSinglebyte koi8uEncode
|
||||
encodeLazy _ = encodeSinglebyteLazy koi8uEncode
|
||||
encodable _ c = (c < '\128') || (member c koi8uMap)
|
||||
decode _ = decodeSinglebyte koi8uDecode
|
||||
decodeLazy _ str = concatMap (decodeSinglebyte koi8uDecode) (Lazy.toChunks str)
|
||||
decodable _ = const True
|
||||
decodeChar _ = do
|
||||
w <- fetchWord8
|
||||
if w < 128
|
||||
then return $ chr $ fromIntegral w
|
||||
else return $ koi8uArr!w
|
||||
encodeChar _ ch
|
||||
| ch < '\128' = pushWord8 $ fromIntegral $ ord ch
|
||||
| otherwise = case lookup ch koi8uMap of
|
||||
Just w -> pushWord8 w
|
||||
Nothing -> throwException (HasNoRepresentation ch)
|
||||
@ -3,150 +3,79 @@
|
||||
See <http://en.wikipedia.org/wiki/UTF-16> for more information.
|
||||
-}
|
||||
module Data.Encoding.UTF16
|
||||
(UTF16(..)
|
||||
) where
|
||||
(UTF16(..)
|
||||
) where
|
||||
|
||||
import Data.Encoding.Base
|
||||
import Data.Char(ord,chr)
|
||||
import Data.Encoding.ByteSink
|
||||
import Data.Encoding.ByteSource
|
||||
import Data.Encoding.Exception
|
||||
|
||||
import Control.Throws
|
||||
import Data.Bits
|
||||
import Data.Int
|
||||
import Data.Word
|
||||
import Data.ByteString as BS
|
||||
import qualified Data.ByteString.Lazy as LBS
|
||||
import Prelude hiding (length)
|
||||
import Control.Exception
|
||||
import Data.Dynamic (toDyn)
|
||||
import Data.Char
|
||||
import Data.Typeable
|
||||
import Data.Word
|
||||
|
||||
data UTF16
|
||||
= UTF16 -- ^ Decodes big and little endian, encodes big endian.
|
||||
| UTF16BE -- ^ Big endian decoding and encoding, fails if the string isn\'t actually big endian.
|
||||
| UTF16LE -- ^ Little endian decoding and encoding.
|
||||
deriving (Eq,Show,Typeable)
|
||||
= UTF16 -- ^ Decodes big and little endian, encodes big endian.
|
||||
| UTF16BE -- ^ Big endian decoding and encoding, fails if the string isn\'t actually big endian.
|
||||
| UTF16LE -- ^ Little endian decoding and encoding.
|
||||
deriving (Eq,Show,Typeable)
|
||||
|
||||
utf16enc :: Bool -> (EncodeState,String) -> Maybe (Word8,(EncodeState,String))
|
||||
utf16enc _ (Done,[]) = Nothing
|
||||
utf16enc True (Done,x:xs)
|
||||
| n<=0x0000FFFF = Just
|
||||
(fromIntegral $ n `shiftR` 8
|
||||
,(Put1 (fromIntegral $ n),xs))
|
||||
| n<=0x0010FFFF = Just
|
||||
(fromIntegral $ 0xD8 .|. (n' `shiftR` 18)
|
||||
,(Put3 (fromIntegral $ (n' `shiftR` 10))
|
||||
(fromIntegral $
|
||||
0xDC .|. ((n' `shiftR` 8) .&. 0x03))
|
||||
(fromIntegral n'),xs))
|
||||
| otherwise = throwDyn $ HasNoRepresentation x
|
||||
where
|
||||
n = ord x
|
||||
n' = n - 0x10000
|
||||
utf16enc False (Done,x:xs)
|
||||
| n<=0x0000FFFF = Just
|
||||
(fromIntegral $ n
|
||||
,(Put1 (fromIntegral $ n `shiftR` 8),xs))
|
||||
| n<=0x0010FFFF = Just
|
||||
(fromIntegral n'
|
||||
,(Put3 (fromIntegral $
|
||||
0xDC .|. ((n' `shiftR` 8) .&. 0x03))
|
||||
(fromIntegral $ (n' `shiftR` 10))
|
||||
(fromIntegral $ 0xD8 .|. (n' `shiftR` 18)),xs))
|
||||
| otherwise = throwDyn $ HasNoRepresentation x
|
||||
where
|
||||
n = ord x
|
||||
n' = n - 0x10000
|
||||
utf16enc _ (Put3 w1 w2 w3,xs) = Just (w1,(Put2 w2 w3,xs))
|
||||
utf16enc _ (Put2 w1 w2,xs) = Just (w1,(Put1 w2,xs))
|
||||
utf16enc _ (Put1 w1,xs) = Just (w1,(Done,xs))
|
||||
readBOM :: ByteSource m => m (Either Char UTF16)
|
||||
readBOM = do
|
||||
ch <- decodeChar UTF16
|
||||
case ch of
|
||||
'\xFEFF' -> return (Right UTF16BE)
|
||||
'\xFFFE' -> return (Right UTF16LE)
|
||||
_ -> return (Left ch)
|
||||
|
||||
decodeUTF16 :: ByteSource m => (m Word16) -> m Char
|
||||
decodeUTF16 fetch = do
|
||||
w1 <- fetch
|
||||
if w1 < 0xD800 || w1 > 0xDFFF
|
||||
then return (chr $ fromIntegral w1)
|
||||
else (if w1 > 0xDBFF
|
||||
then throwException (IllegalCharacter (fromIntegral (w1 `shiftR` 8)))
|
||||
else (do
|
||||
w2 <- fetch
|
||||
if w2 < 0xDC00 || w2 > 0xDFFF
|
||||
then throwException (IllegalCharacter (fromIntegral (w2 `shiftR` 8)))
|
||||
else let v = ((fromIntegral (w1 .&. 0x3FF)) `shiftL` 10)
|
||||
.|. (fromIntegral (w2 .&. 0x3FF))
|
||||
in return $ chr (v+0x10000)
|
||||
)
|
||||
)
|
||||
|
||||
{-# SPECIALIZE utf16dec :: Bool -> Word8 -> Word8 -> Word8 -> Word8 -> (Char,Int) #-}
|
||||
{-# SPECIALIZE utf16dec :: Bool -> Word8 -> Word8 -> Word8 -> Word8 -> (Char,Int64) #-}
|
||||
utf16dec :: Num a => Bool -> Word8 -> Word8 -> Word8 -> Word8 -> (Char,a)
|
||||
utf16dec be s1 s2 s3 s4
|
||||
| w1< 0xD8 || w1> 0xDF
|
||||
= (chr $ ((fromIntegral w1) `shiftL` 8) .|. (fromIntegral w2),2)
|
||||
| w1> 0xDB = throwDyn $ IllegalCharacter w1
|
||||
| w3< 0xDC || w3>0xDF = throwDyn $ IllegalCharacter w3
|
||||
| otherwise = (chr $ (((fromIntegral w1 .&. 0x03) `shiftL` 18)
|
||||
.|. ((fromIntegral w2) `shiftL` 10)
|
||||
.|. ((fromIntegral w3 .&. 0x03) `shiftL` 8)
|
||||
.|. (fromIntegral w4)) + 0x10000,4)
|
||||
where
|
||||
(w1,w2,w3,w4) = if be then (s1,s2,s3,s4) else (s2,s1,s4,s3)
|
||||
encodeUTF16 :: ByteSink m => (Word16 -> m ()) -> Char -> m ()
|
||||
encodeUTF16 push ch
|
||||
| val<=0xDFFF && val>=0xD800 = throwException (HasNoRepresentation ch)
|
||||
| val<=0x0000FFFF = push $ fromIntegral val
|
||||
| val<=0x0010FFFF = let v = val - 0x10000
|
||||
w1 = (fromIntegral (v `shiftR` 10)) .|. 0xD800
|
||||
w2 = ((fromIntegral v) .&. 0x3FF) .|. 0xDC00
|
||||
in push w1 >> push w2
|
||||
| otherwise = throwException (HasNoRepresentation ch)
|
||||
where
|
||||
val = ord ch
|
||||
|
||||
instance Encoding UTF16 where
|
||||
encode enc str = unfoldr (utf16enc (enc/=UTF16LE)) (case enc of
|
||||
UTF16 -> Put2 0xFE 0xFF
|
||||
_ -> Done,str)
|
||||
encodeLazy enc str = LBS.unfoldr (utf16enc (enc/=UTF16LE)) (case enc of
|
||||
UTF16 -> Put2 0xFE 0xFF
|
||||
_ -> Done,str)
|
||||
encodable _ c = ord c <= 0x0010FFFF
|
||||
decode bo str = case findByteOrder str of
|
||||
Nothing -> decode' (bo/=UTF16LE) 0
|
||||
Just big -> decode' big 2
|
||||
where
|
||||
l = BS.length str
|
||||
decode' be i = if i>=l
|
||||
then []
|
||||
else c:decode' be (i+took)
|
||||
where
|
||||
(c,took) = mapException (\ex -> case ex of
|
||||
ErrorCall _ -> DynException (toDyn UnexpectedEnd)
|
||||
_ -> ex) (utf16dec be s1 s2 s3 s4)
|
||||
s1 = index str i
|
||||
s2 = index str (i+1)
|
||||
s3 = index str (i+2)
|
||||
s4 = index str (i+3)
|
||||
decodeLazy bo str = case findByteOrderLazy str of
|
||||
Nothing -> decode' (bo/=UTF16LE) 0
|
||||
Just big -> decode' big 2
|
||||
where
|
||||
l = LBS.length str
|
||||
decode' be i = if i>=l
|
||||
then []
|
||||
else c:decode' be (i+took)
|
||||
where
|
||||
(c,took) = mapException (\ex -> case ex of
|
||||
ErrorCall _ -> DynException (toDyn UnexpectedEnd)
|
||||
_ -> ex) (utf16dec be s1 s2 s3 s4)
|
||||
s1 = LBS.index str i
|
||||
s2 = LBS.index str (i+1)
|
||||
s3 = LBS.index str (i+2)
|
||||
s4 = LBS.index str (i+3)
|
||||
decodable bo str = case findByteOrder str of
|
||||
Nothing -> check' (bo/=UTF16LE) (length str) 0
|
||||
Just big -> check' big (length str) 2
|
||||
where
|
||||
check' be m i
|
||||
| m == i = True
|
||||
| m == i+1 = False
|
||||
| w1< 0xD8 || w1> 0xDF = check' be m (i+2)
|
||||
| w1> 0xDB = False
|
||||
| m <= i+3 = False
|
||||
| w3< 0xDC || w3>0xDF = False
|
||||
| otherwise = check' be m (i+4)
|
||||
where
|
||||
(w1,w3) = if be then (s1,s3) else (s2,s4)
|
||||
s1 = index str i
|
||||
s2 = index str (i+1)
|
||||
s3 = index str (i+2)
|
||||
s4 = index str (i+3)
|
||||
encodeChar UTF16LE = encodeUTF16 pushWord16le
|
||||
encodeChar _ = encodeUTF16 pushWord16be
|
||||
decodeChar UTF16LE = decodeUTF16 fetchWord16le
|
||||
decodeChar _ = decodeUTF16 fetchWord16be
|
||||
|
||||
findByteOrder :: ByteString -> Maybe Bool
|
||||
findByteOrder str
|
||||
| length str < 2 = Nothing
|
||||
| w1 == 0xFE && w2 == 0xFF = Just True
|
||||
| w1 == 0xFF && w2 == 0xFE = Just False
|
||||
| otherwise = Nothing
|
||||
where
|
||||
w1 = index str 0
|
||||
w2 = index str 1
|
||||
encode UTF16 str = do
|
||||
encodeChar UTF16 '\xFEFF'
|
||||
mapM_ (encodeChar UTF16) str
|
||||
encode enc str = mapM_ (encodeChar enc) str
|
||||
|
||||
|
||||
findByteOrderLazy :: LBS.ByteString -> Maybe Bool
|
||||
findByteOrderLazy str = case LBS.unpack (LBS.take 2 str) of
|
||||
[w1,w2]
|
||||
| w1 == 0xFE && w2 == 0xFF -> Just True
|
||||
| w1 == 0xFF && w2 == 0xFE -> Just False
|
||||
| otherwise -> Nothing
|
||||
_ -> Nothing
|
||||
decode UTF16 = do
|
||||
res <- readBOM
|
||||
case res of
|
||||
Left c -> do
|
||||
cs <- untilM sourceEmpty (decodeChar UTF16BE)
|
||||
return (c:cs)
|
||||
Right bom -> decode bom
|
||||
decode enc = untilM sourceEmpty (decodeChar enc)
|
||||
|
||||
@ -3,17 +3,17 @@
|
||||
See <http://en.wikipedia.org/wiki/UTF-32> for more information.
|
||||
-}
|
||||
module Data.Encoding.UTF32
|
||||
(UTF32(..))
|
||||
where
|
||||
(UTF32(..))
|
||||
where
|
||||
|
||||
import Data.Bits
|
||||
import Data.Char (ord,chr)
|
||||
import Data.Encoding.Base
|
||||
import Data.Word
|
||||
import Control.Exception (throwDyn)
|
||||
import Data.Encoding.ByteSink
|
||||
import Data.Encoding.ByteSource
|
||||
import Data.Encoding.Exception
|
||||
|
||||
import Data.Char
|
||||
import Data.Typeable
|
||||
import qualified Data.ByteString as BS
|
||||
import qualified Data.ByteString.Lazy as LBS
|
||||
|
||||
|
||||
data UTF32
|
||||
= UTF32 -- ^ Detects big or little endian through the use of the BOM (Byte Order Mask) character. Defaults to big endian if not present.
|
||||
@ -21,70 +21,26 @@ data UTF32
|
||||
| UTF32LE -- ^ Encodes and decodes using the little endian encoding.
|
||||
deriving (Eq,Show,Typeable)
|
||||
|
||||
bom :: Char
|
||||
bom = '\xFEFF'
|
||||
|
||||
instance Encoding UTF32 where
|
||||
encode UTF32 str = encodeMultibyte encodeUTF32be (bom:str)
|
||||
encode UTF32LE str = encodeMultibyte encodeUTF32le str
|
||||
encode UTF32BE str = encodeMultibyte encodeUTF32be str
|
||||
encodeLazy UTF32 str = encodeMultibyteLazy encodeUTF32be (bom:str)
|
||||
encodeLazy UTF32LE str = encodeMultibyteLazy encodeUTF32le str
|
||||
encodeLazy UTF32BE str = encodeMultibyteLazy encodeUTF32be str
|
||||
encodable _ c = ord c < 0x0010FFFF
|
||||
decode UTF32 str = let
|
||||
(start,rest) = BS.splitAt 4 str
|
||||
in case BS.unpack start of
|
||||
[0x00,0x00,0xFE,0xFF] -> decode UTF32BE rest
|
||||
[0xFE,0xFF,0x00,0x00] -> decode UTF32LE rest
|
||||
_ -> decode UTF32BE str
|
||||
decode UTF32LE str = decodeMultibyte decodeUTF32le str
|
||||
decode UTF32BE str = decodeMultibyte decodeUTF32be str
|
||||
decodeLazy UTF32 str = let
|
||||
(start,rest) = LBS.splitAt 4 str
|
||||
in case LBS.unpack start of
|
||||
[0x00,0x00,0xFE,0xFF] -> decodeLazy UTF32BE rest
|
||||
[0xFE,0xFF,0x00,0x00] -> decodeLazy UTF32LE rest
|
||||
_ -> decodeLazy UTF32BE str
|
||||
decodeLazy UTF32LE str = decodeMultibyteLazy decodeUTF32le str
|
||||
decodeLazy UTF32BE str = decodeMultibyteLazy decodeUTF32be str
|
||||
encodeChar UTF32LE ch = pushWord32le (fromIntegral $ ord ch)
|
||||
encodeChar _ ch = pushWord32be (fromIntegral $ ord ch)
|
||||
decodeChar UTF32LE = do
|
||||
wrd <- fetchWord32le
|
||||
return $ chr $ fromIntegral wrd
|
||||
decodeChar _ = do
|
||||
wrd <- fetchWord32be
|
||||
return $ chr $ fromIntegral wrd
|
||||
encode UTF32 str = do
|
||||
encodeChar UTF32 '\xFEFF'
|
||||
mapM_ (encodeChar UTF32) str
|
||||
encode enc str = mapM_ (encodeChar enc) str
|
||||
|
||||
encodeUTF32be :: Char -> (Word8,EncodeState)
|
||||
encodeUTF32be ch = let
|
||||
w = ord ch
|
||||
w1 = fromIntegral $ w `shiftR` 24
|
||||
w2 = fromIntegral $ w `shiftR` 16
|
||||
w3 = fromIntegral $ w `shiftR` 8
|
||||
w4 = fromIntegral $ w
|
||||
in (w1,Put3 w2 w3 w4)
|
||||
|
||||
encodeUTF32le :: Char -> (Word8,EncodeState)
|
||||
encodeUTF32le ch = let
|
||||
w = ord ch
|
||||
w1 = fromIntegral $ w `shiftR` 24
|
||||
w2 = fromIntegral $ w `shiftR` 16
|
||||
w3 = fromIntegral $ w `shiftR` 8
|
||||
w4 = fromIntegral $ w
|
||||
in (w4,Put3 w3 w2 w1)
|
||||
|
||||
decodeUTF32be :: [Word8] -> (Char,[Word8])
|
||||
decodeUTF32be (w1:w2:w3:w4:rest) = let
|
||||
v = (fromIntegral w1 `shiftL` 24) .|.
|
||||
(fromIntegral w2 `shiftL` 16) .|.
|
||||
(fromIntegral w3 `shiftL` 8) .|.
|
||||
(fromIntegral w4)
|
||||
in if v < 0x0010FFFF
|
||||
then (chr v,rest)
|
||||
else throwDyn (IllegalRepresentation [w1,w2,w3,w4])
|
||||
decodeUTF32be _ = throwDyn UnexpectedEnd
|
||||
|
||||
decodeUTF32le :: [Word8] -> (Char,[Word8])
|
||||
decodeUTF32le (w1:w2:w3:w4:rest) = let
|
||||
v = (fromIntegral w4 `shiftL` 24) .|.
|
||||
(fromIntegral w3 `shiftL` 16) .|.
|
||||
(fromIntegral w2 `shiftL` 8) .|.
|
||||
(fromIntegral w1)
|
||||
in if v < 0x0010FFFF
|
||||
then (chr v,rest)
|
||||
else throwDyn (IllegalRepresentation [w1,w2,w3,w4])
|
||||
decodeUTF32le _ = throwDyn UnexpectedEnd
|
||||
decode UTF32 = do
|
||||
ch <- fetchWord32be
|
||||
case ch of
|
||||
0x0000FEFF -> untilM sourceEmpty (decodeChar UTF32BE)
|
||||
0xFFFE0000 -> untilM sourceEmpty (decodeChar UTF32LE)
|
||||
_ -> do
|
||||
rest <- untilM sourceEmpty (decodeChar UTF32)
|
||||
return ((chr $ fromIntegral ch):rest)
|
||||
decode enc = untilM sourceEmpty (decodeChar enc)
|
||||
|
||||
@ -2,156 +2,124 @@
|
||||
{- | This module implements UTF-8 encoding and decoding as in RFC 3629.
|
||||
See <http://en.wikipedia.org/wiki/UTF-8> for more information.
|
||||
-}
|
||||
module Data.Encoding.UTF8
|
||||
(UTF8(..)) where
|
||||
module Data.Encoding.UTF8 where
|
||||
|
||||
import Control.Throws
|
||||
import Data.Char
|
||||
import Data.Bits
|
||||
import Data.Char (ord,chr)
|
||||
|
||||
import Data.Encoding.Base
|
||||
import Data.ByteString
|
||||
import Data.Word
|
||||
import Prelude hiding (length)
|
||||
import Control.Exception
|
||||
import Data.Encoding.ByteSource
|
||||
import Data.Encoding.ByteSink
|
||||
import Data.Encoding.Exception
|
||||
|
||||
import Data.Typeable
|
||||
|
||||
data UTF8
|
||||
= UTF8 -- ^ Very forgiving decoding mechanism, accepts everything that it can make any sense of.
|
||||
| UTF8Strict -- ^ More strict decoding, doesn\'t accept sequences that have a too long representation and checks bits that aren\'t used in the decoding.
|
||||
deriving (Eq,Show,Typeable)
|
||||
|
||||
encodeUTF8 :: Char -> (Word8,EncodeState)
|
||||
encodeUTF8 x
|
||||
| n<=0x0000007F = (v,Done)
|
||||
| n<=0x000007FF = (fromIntegral $ 0xC0 .|. (n `shiftR` 6)
|
||||
,Put1 (0x80 .|. (v .&. 0x3F)))
|
||||
| n<=0x0000FFFF = (fromIntegral $ 0xE0 .|. (n `shiftR` 12)
|
||||
,Put2 (fromIntegral $
|
||||
0x80 .|. ((n `shiftR` 6) .&. 0x3F))
|
||||
(fromIntegral $
|
||||
0x80 .|. (n .&. 0x3F)))
|
||||
| n<=0x0010FFFF = (fromIntegral $ 0xF0 .|. (n `shiftR` 18)
|
||||
,Put3 (fromIntegral $
|
||||
0x80 .|. ((n `shiftR` 12) .&. 0x3F))
|
||||
(fromIntegral $
|
||||
0x80 .|. ((n `shiftR` 6) .&. 0x3F))
|
||||
(fromIntegral $
|
||||
0x80 .|. (n .&. 0x3F)))
|
||||
| otherwise = throwDyn (HasNoRepresentation x)
|
||||
where
|
||||
n = ord x
|
||||
v = fromIntegral $ ord x
|
||||
|
||||
decodeUTF8 :: [Word8] -> (Char,[Word8])
|
||||
decodeUTF8 ~(w1:rest1)
|
||||
| w1<=0x7F = (chr $ fromIntegral w1,rest1)
|
||||
| w1<=0xBF = throwDyn (IllegalCharacter w1)
|
||||
| w1<=0xDF = case rest1 of
|
||||
(w2:rest2) -> (chr $ ((fromIntegral $ w1 .&. 0x1F) `shiftL` 6)
|
||||
.|. (fromIntegral (w2 .&. 0x3F)),rest2)
|
||||
_ -> throwDyn UnexpectedEnd
|
||||
| w1<=0xEF = case rest1 of
|
||||
(w2:w3:rest3) -> (chr $ ((fromIntegral $ w1 .&. 0x0F) `shiftL` 12)
|
||||
.|. ((fromIntegral $ w2 .&. 0x3F) `shiftL` 6)
|
||||
.|. (fromIntegral $ w3 .&. 0x3F),rest3)
|
||||
_ -> throwDyn UnexpectedEnd
|
||||
| w1<=0xF7 = case rest1 of
|
||||
(w2:w3:w4:rest4) -> (chr $ ((fromIntegral $ w1 .&. 0x07) `shiftL` 18)
|
||||
.|. ((fromIntegral $ w2 .&. 0x3F) `shiftL` 12)
|
||||
.|. ((fromIntegral $ w3 .&. 0x3F) `shiftL` 6)
|
||||
.|. (fromIntegral $ w4 .&. 0x3F),rest4)
|
||||
_ -> throwDyn UnexpectedEnd
|
||||
| otherwise = throwDyn (IllegalCharacter w1)
|
||||
|
||||
decodeUTF8Strict :: [Word8] -> (Char,[Word8])
|
||||
decodeUTF8Strict ~(w1:rest1)
|
||||
| w1<=0x7F = (chr $ fromIntegral w1,rest1)
|
||||
| w1<=0xBF = throwDyn (IllegalCharacter w1)
|
||||
| w1<=0xDF = case rest1 of
|
||||
(w2:rest2)
|
||||
| invalidExtend w2 -> throwDyn (IllegalCharacter w2)
|
||||
| otherwise -> let
|
||||
v1 = w1 .&. 0x1F
|
||||
in if v1 <= 1
|
||||
then throwDyn (IllegalRepresentation [w1,w2])
|
||||
else (chr $ ((fromIntegral v1) `shiftL` 6)
|
||||
.|. (fromIntegral (w2 .&. 0x3F)),rest2)
|
||||
_ -> throwDyn UnexpectedEnd
|
||||
| w1<=0xEF = case rest1 of
|
||||
(w2:w3:rest3)
|
||||
| invalidExtend w2 -> throwDyn (IllegalCharacter w2)
|
||||
| invalidExtend w3 -> throwDyn (IllegalCharacter w3)
|
||||
| otherwise -> let
|
||||
v1 = w1 .&. 0x0F
|
||||
v2 = w2 .&. 0x3F
|
||||
in if v1 == 0 && v2 < 0x20
|
||||
then throwDyn (IllegalRepresentation [w1,w2,w3])
|
||||
else (chr $ ((fromIntegral v1) `shiftL` 12)
|
||||
.|. ((fromIntegral v2) `shiftL` 6)
|
||||
.|. (fromIntegral $ w3 .&. 0x3F),rest3)
|
||||
_ -> throwDyn UnexpectedEnd
|
||||
| w1<=0xF7 = case rest1 of
|
||||
(w2:w3:w4:rest4)
|
||||
| invalidExtend w2 -> throwDyn (IllegalCharacter w2)
|
||||
| invalidExtend w3 -> throwDyn (IllegalCharacter w3)
|
||||
| invalidExtend w4 -> throwDyn (IllegalCharacter w4)
|
||||
| otherwise -> let
|
||||
v1 = w1 .&. 0x07
|
||||
v2 = w2 .&. 0x3F
|
||||
in if v1 == 0 && v2 < 0x10
|
||||
then throwDyn (IllegalRepresentation [w1,w2,w3,w4])
|
||||
else (chr $ ((fromIntegral $ w1 .&. 0x07) `shiftL` 18)
|
||||
.|. ((fromIntegral $ w2 .&. 0x3F) `shiftL` 12)
|
||||
.|. ((fromIntegral $ w3 .&. 0x3F) `shiftL` 6)
|
||||
.|. (fromIntegral $ w4 .&. 0x3F),rest4)
|
||||
_ -> throwDyn UnexpectedEnd
|
||||
| otherwise = throwDyn (IllegalCharacter w1)
|
||||
where
|
||||
invalidExtend wrd = wrd .&. 0xC0 /= 0x80
|
||||
|
||||
data UTF8AnalyzeState
|
||||
= Skip !Int
|
||||
| CheckAndSkip !Word8 !Int
|
||||
| Ok
|
||||
| Failed
|
||||
deriving Eq
|
||||
data UTF8 = UTF8 -- ^ Very forgiving decoding mechanism, accepts everything that it can make any sense of.
|
||||
| UTF8Strict -- ^ More strict decoding, doesn\'t accept sequences that have a too long representation and checks bits that aren\'t used in the decoding
|
||||
deriving (Eq,Show,Typeable)
|
||||
|
||||
instance Encoding UTF8 where
|
||||
encode _ = encodeMultibyte encodeUTF8
|
||||
encodeLazy _ = encodeMultibyteLazy encodeUTF8
|
||||
encodable _ c = ord c <= 0x0010FFFF
|
||||
decode UTF8 = decodeMultibyte decodeUTF8
|
||||
decode UTF8Strict = decodeMultibyte decodeUTF8Strict
|
||||
decodeLazy UTF8 = decodeMultibyteLazy decodeUTF8
|
||||
decodeLazy UTF8Strict = decodeMultibyteLazy decodeUTF8Strict
|
||||
decodable UTF8 str = (foldl' (\st w -> case st of
|
||||
Ok | w<=0x7F -> Ok
|
||||
| w<=0xBF -> Failed
|
||||
| w<=0xDF -> Skip 0
|
||||
| w<=0xEF -> Skip 1
|
||||
| w<=0xF7 -> Skip 2
|
||||
| otherwise -> Failed
|
||||
Failed -> Failed
|
||||
Skip n -> if w .&. 0xC0 == 0x80
|
||||
then (if n == 0 then Ok else Skip (n-1))
|
||||
else Failed) Ok str) == Ok
|
||||
decodable UTF8Strict str = (foldl' (\st w -> case st of
|
||||
Ok | w<=0x7F -> Ok
|
||||
| w<=0xBF -> Failed
|
||||
| w<=0xDF -> if w .&. 0x1F <= 1
|
||||
then Failed
|
||||
else Skip 0
|
||||
| w<=0xEF -> if w .&. 0x0F == 0
|
||||
then CheckAndSkip 0x20 1
|
||||
else Skip 1
|
||||
| w<=0xF7 -> if w .&. 0x07 == 0
|
||||
then CheckAndSkip 0x10 2
|
||||
else Skip 2
|
||||
| otherwise -> Failed
|
||||
Failed -> Failed
|
||||
Skip n -> if w .&. 0xC0 == 0x80
|
||||
then (if n == 0 then Ok else Skip (n-1))
|
||||
else Failed
|
||||
CheckAndSkip chk n -> if w .&. 0xC0 == 0x80 && w .&. 0x3F >= chk
|
||||
then (if n == 0 then Ok else Skip (n-1))
|
||||
else Failed
|
||||
) Ok str) == Ok
|
||||
encodeChar _ c
|
||||
| n <= 0x0000007F = p8 n
|
||||
| n <= 0x000007FF = do
|
||||
p8 $ 0xC0 .|. (n `shiftR` 6)
|
||||
p8 $ 0x80 .|. (n .&. 0x3F)
|
||||
| n <= 0x0000FFFF = do
|
||||
p8 $ 0xE0 .|. (n `shiftR` 12)
|
||||
p8 $ 0x80 .|. ((n `shiftR` 6) .&. 0x3F)
|
||||
p8 $ 0x80 .|. (n .&. 0x3F)
|
||||
| n <= 0x0010FFFF = do
|
||||
p8 $ 0xF0 .|. (n `shiftR` 18)
|
||||
p8 $ 0x80 .|. ((n `shiftR` 12) .&. 0x3F)
|
||||
p8 $ 0x80 .|. ((n `shiftR` 6) .&. 0x3F)
|
||||
p8 $ 0x80 .|. (n .&. 0x3F)
|
||||
| otherwise = throwException (HasNoRepresentation c)
|
||||
where
|
||||
n = ord c
|
||||
p8 = pushWord8.fromIntegral
|
||||
decodeChar UTF8 = do
|
||||
w1 <- fetchWord8
|
||||
case () of
|
||||
_
|
||||
| w1 <= 0x7F -> return $ chr $ fromIntegral w1
|
||||
| w1 <= 0xBF -> throwException (IllegalCharacter w1)
|
||||
| w1 <= 0xDF -> do
|
||||
w2 <- fetchWord8
|
||||
return $ chr $
|
||||
((fromIntegral $ w1 .&. 0x1F) `shiftL` 6)
|
||||
.|. (fromIntegral $ w2 .&. 0x3F)
|
||||
|
||||
| w1 <= 0xEF -> do
|
||||
w2 <- fetchWord8
|
||||
w3 <- fetchWord8
|
||||
let v1 = w1 .&. 0x0F
|
||||
v2 = w2 .&. 0x3F
|
||||
v3 = w3 .&. 0x3F
|
||||
return $ chr $
|
||||
((fromIntegral v1) `shiftL` 12)
|
||||
.|. ((fromIntegral v2) `shiftL` 6)
|
||||
.|. (fromIntegral v3)
|
||||
| w1 <= 0xF7 -> do
|
||||
w2 <- fetchWord8
|
||||
w3 <- fetchWord8
|
||||
w4 <- fetchWord8
|
||||
let v1 = w1 .&. 0x07
|
||||
v2 = w2 .&. 0x3F
|
||||
v3 = w3 .&. 0x3F
|
||||
v4 = w4 .&. 0x3F
|
||||
return $ chr $
|
||||
((fromIntegral v1) `shiftL` 18)
|
||||
.|. ((fromIntegral v2) `shiftL` 12)
|
||||
.|. ((fromIntegral v3) `shiftL` 6)
|
||||
.|. (fromIntegral v4)
|
||||
| otherwise -> throwException (IllegalCharacter w1)
|
||||
decodeChar UTF8Strict = do
|
||||
w1 <- fetchWord8
|
||||
case () of
|
||||
_
|
||||
| w1 <= 0x7F -> return $ chr $ fromIntegral w1
|
||||
| w1 <= 0xBF -> throwException (IllegalCharacter w1)
|
||||
| w1 <= 0xDF -> do
|
||||
w2 <- fetchExtend8
|
||||
let v1 = w1 .&. 0x1F
|
||||
if v1 <= 1
|
||||
then throwException (IllegalRepresentation [w1,w2])
|
||||
else return $ chr $
|
||||
((fromIntegral v1) `shiftL` 6)
|
||||
.|. (fromIntegral $ w2 .&. 0x3F)
|
||||
| w1 <= 0xEF -> do
|
||||
w2 <- fetchExtend8
|
||||
w3 <- fetchExtend8
|
||||
let v1 = w1 .&. 0x0F
|
||||
v2 = w2 .&. 0x3F
|
||||
v3 = w3 .&. 0x3F
|
||||
if v1 == 0 && v2 < 0x20
|
||||
then throwException (IllegalRepresentation [w1,w2,w3])
|
||||
else return $ chr $
|
||||
((fromIntegral v1) `shiftL` 12)
|
||||
.|. ((fromIntegral v2) `shiftL` 6)
|
||||
.|. (fromIntegral v3)
|
||||
| w1 <= 0xF7 -> do
|
||||
w2 <- fetchExtend8
|
||||
w3 <- fetchExtend8
|
||||
w4 <- fetchExtend8
|
||||
let v1 = w1 .&. 0x07
|
||||
v2 = w2 .&. 0x3F
|
||||
v3 = w3 .&. 0x3F
|
||||
v4 = w4 .&. 0x3F
|
||||
if v1 == 0 && v2 < 0x10
|
||||
then throwException (IllegalRepresentation [w1,w2,w3,w4])
|
||||
else return $ chr $
|
||||
((fromIntegral v1) `shiftL` 18)
|
||||
.|. ((fromIntegral v2) `shiftL` 12)
|
||||
.|. ((fromIntegral v3) `shiftL` 6)
|
||||
.|. (fromIntegral v4)
|
||||
| otherwise -> throwException (IllegalCharacter w1)
|
||||
where
|
||||
invalidExtend wrd = wrd .&. 0xC0 /= 0x80
|
||||
fetchExtend8 = do
|
||||
w <- fetchWord8
|
||||
if invalidExtend w
|
||||
then throwException (IllegalCharacter w)
|
||||
else return w
|
||||
|
||||
@ -1,25 +1,80 @@
|
||||
{-# LANGUAGE ImplicitParams,ForeignFunctionInterface #-}
|
||||
{- | This module provides a replacement for the normal (unicode unaware) IO functions of haskell.
|
||||
By using implicit parameters, it can be used almost as a drop-in replacement.
|
||||
-}
|
||||
module System.IO.Encoding
|
||||
(getSystemEncoding
|
||||
,hPutStr
|
||||
,hGetContents) where
|
||||
(getSystemEncoding
|
||||
,getContents
|
||||
,hPutStr
|
||||
,hPutStrLn
|
||||
,hGetContents
|
||||
,readFile
|
||||
,writeFile
|
||||
,appendFile
|
||||
,hGetChar
|
||||
,hGetLine
|
||||
,hPutChar
|
||||
,interact
|
||||
,print) where
|
||||
|
||||
import Foreign.C.String
|
||||
|
||||
import Data.Encoding
|
||||
import System.IO hiding (hPutStr,hGetContents)
|
||||
import qualified Data.ByteString.Lazy as BS
|
||||
import System.IO (Handle,stdout,stdin)
|
||||
import Prelude hiding (print,getContents,readFile,writeFile,appendFile,interact)
|
||||
import qualified Data.ByteString.Lazy as LBS
|
||||
import qualified Data.ByteString as BS
|
||||
import Control.Monad.Reader (runReaderT)
|
||||
|
||||
-- | Like the normal 'System.IO.hGetContents', but decodes the input using an
|
||||
-- encoding.
|
||||
hGetContents :: Encoding e => e -> Handle -> IO String
|
||||
hGetContents enc h = do
|
||||
str <- BS.hGetContents h
|
||||
return $ decodeLazy enc str
|
||||
hGetContents :: (Encoding e,?enc :: e) => Handle -> IO String
|
||||
hGetContents h = do
|
||||
str <- LBS.hGetContents h
|
||||
return $ decodeLazyByteString ?enc str
|
||||
|
||||
getContents :: (Encoding e,?enc :: e) => IO String
|
||||
getContents = do
|
||||
str <- LBS.getContents
|
||||
return $ decodeLazyByteString ?enc str
|
||||
|
||||
-- | Like the normal 'System.IO.hPutStr', but encodes the output using an
|
||||
-- encoding.
|
||||
hPutStr :: Encoding e => e -> Handle -> String -> IO ()
|
||||
hPutStr enc h str = BS.hPut h (encodeLazy enc str)
|
||||
hPutStr :: (Encoding e,?enc :: e) => Handle -> String -> IO ()
|
||||
hPutStr h str = LBS.hPut h (encodeLazyByteString ?enc str)
|
||||
|
||||
hPutStrLn :: (Encoding e,?enc :: e) => Handle -> String -> IO ()
|
||||
hPutStrLn h str = do
|
||||
LBS.hPut h (encodeLazyByteString ?enc str)
|
||||
LBS.hPut h (encodeLazyByteString ?enc "\n")
|
||||
|
||||
print :: (Encoding e,Show a,?enc :: e) => a -> IO ()
|
||||
print x = hPutStrLn stdout (show x)
|
||||
|
||||
readFile :: (Encoding e,?enc :: e) => FilePath -> IO String
|
||||
readFile fn = LBS.readFile fn >>= return.(decodeLazyByteString ?enc)
|
||||
|
||||
writeFile :: (Encoding e,?enc :: e) => FilePath -> String -> IO ()
|
||||
writeFile fn str = LBS.writeFile fn $ encodeLazyByteString ?enc str
|
||||
|
||||
appendFile :: (Encoding e,?enc :: e) => FilePath -> String -> IO ()
|
||||
appendFile fn str = LBS.appendFile fn $ encodeLazyByteString ?enc str
|
||||
|
||||
hGetChar :: (Encoding e,?enc :: e) => Handle -> IO Char
|
||||
hGetChar h = runReaderT (decodeChar ?enc) h
|
||||
|
||||
hGetLine :: (Encoding e,?enc :: e) => Handle -> IO String
|
||||
hGetLine h = do
|
||||
line <- BS.hGetLine h
|
||||
return $ decodeStrictByteString ?enc line
|
||||
|
||||
hPutChar :: (Encoding e,?enc :: e) => Handle -> Char -> IO ()
|
||||
hPutChar h c = runReaderT (encodeChar ?enc c) h
|
||||
|
||||
interact :: (Encoding e,?enc :: e) => (String -> String) -> IO ()
|
||||
interact f = do
|
||||
line <- hGetLine stdin
|
||||
hPutStrLn stdout (f line)
|
||||
|
||||
foreign import ccall "system_encoding.h get_system_encoding"
|
||||
get_system_encoding :: IO CString
|
||||
@ -29,4 +84,4 @@ getSystemEncoding :: IO DynEncoding
|
||||
getSystemEncoding = do
|
||||
enc <- get_system_encoding
|
||||
str <- peekCString enc
|
||||
return $ encodingFromString str
|
||||
return $ encodingFromString str
|
||||
109
Test/Tester.hs
109
Test/Tester.hs
@ -4,9 +4,10 @@ module Test.Tester where
|
||||
import Data.Encoding
|
||||
import Test.HUnit
|
||||
import Data.Word
|
||||
import Data.Char
|
||||
import qualified Data.ByteString as BS
|
||||
import qualified Data.ByteString.Lazy as LBS
|
||||
import Control.Exception (catchDyn,evaluate)
|
||||
import Test.QuickCheck hiding (Testable)
|
||||
|
||||
data EncodingTest
|
||||
= forall enc. (Encoding enc,Show enc) =>
|
||||
@ -17,52 +18,60 @@ data EncodingTest
|
||||
EncodingError enc String EncodingException
|
||||
|
||||
instance Testable EncodingTest where
|
||||
test (EncodingTest enc src trg) = TestList
|
||||
[TestLabel (show enc ++ " encodable")
|
||||
(TestCase $ (all (encodable enc) src) @=? True)
|
||||
,TestLabel (show enc ++ " encoding (strict)")
|
||||
(TestCase $ bstr @=? (encode enc src))
|
||||
,TestLabel (show enc ++ " encoding (lazy)")
|
||||
(TestCase $ lbstr @=? (encodeLazy enc src))
|
||||
,TestLabel (show enc ++ " decodable")
|
||||
(TestCase $ (decodable enc bstr) @=? True)
|
||||
,TestLabel (show enc ++ " decoding (strict)")
|
||||
(TestCase $ src @=? (decode enc bstr))
|
||||
,TestLabel (show enc ++ " decoding (lazy)")
|
||||
(TestCase $ src @=? (decodeLazy enc lbstr))
|
||||
]
|
||||
where
|
||||
bstr = BS.pack trg
|
||||
lbstr = LBS.pack trg
|
||||
test (DecodingError enc trg what) = TestList
|
||||
[TestLabel (show what++" not decodable in "++show enc) $
|
||||
TestCase $ assert $ not $ decodable enc (BS.pack trg)
|
||||
,TestLabel (show enc ++ " decoding error (strict)") $ TestCase $
|
||||
catchDyn (do
|
||||
mapM_ evaluate (decode enc (BS.pack trg))
|
||||
assertFailure "No exception thrown"
|
||||
)
|
||||
(\exc -> exc @=? what)
|
||||
,TestLabel (show enc ++ " decoding error (lazy)") $ TestCase $
|
||||
catchDyn (do
|
||||
mapM_ evaluate (decodeLazy enc (LBS.pack trg))
|
||||
assertFailure "No exception thrown"
|
||||
)
|
||||
(\exc -> exc @=? what)
|
||||
]
|
||||
test (EncodingError enc src what) = TestList
|
||||
[TestLabel (show src ++ " not encodable in " ++ show enc) $
|
||||
TestCase $ assert $ not $ all (encodable enc) src
|
||||
,TestLabel (show enc ++ " encoding error (strict)") $ TestCase $
|
||||
catchDyn (do
|
||||
evaluate (encode enc src)
|
||||
assertFailure "No exception thrown"
|
||||
)
|
||||
(\exc -> exc @=? what)
|
||||
,TestLabel (show enc ++ " encoding error (lazy)") $ TestCase $
|
||||
catchDyn (do
|
||||
evaluate (encodeLazy enc src)
|
||||
assertFailure "No exception thrown"
|
||||
)
|
||||
(\exc -> exc @=? what)
|
||||
]
|
||||
test (EncodingTest enc src trg)
|
||||
= TestList
|
||||
[TestLabel (show enc ++ " encoding")
|
||||
(TestCase $ encodeStrictByteStringExplicit enc src
|
||||
@?= Right (BS.pack trg))
|
||||
,TestLabel (show enc ++ " decoding")
|
||||
(TestCase $ decodeStrictByteStringExplicit enc (BS.pack trg)
|
||||
@=? Right src)
|
||||
]
|
||||
test (DecodingError enc src ex)
|
||||
= TestLabel (show enc ++ " decoding error")
|
||||
(TestCase $ decodeStrictByteStringExplicit enc (BS.pack src) @=? Left ex)
|
||||
|
||||
|
||||
charGen :: Gen Char
|
||||
charGen = let
|
||||
ascii = choose (0x00,0x7F) >>= return.chr
|
||||
oneByte = choose (0x80,0xFF) >>= return.chr
|
||||
twoByte = choose (0x0100,0xFFFF) >>= return.chr
|
||||
threeByte = choose (0x010000,0x10FFFF) >>= return.chr
|
||||
in frequency [(40,ascii),(30,oneByte),(20,twoByte),(10,threeByte)]
|
||||
|
||||
instance Arbitrary Char where
|
||||
arbitrary = charGen
|
||||
coarbitrary x = id
|
||||
|
||||
instance Arbitrary Word8 where
|
||||
arbitrary = choose (0x00,0xFF::Int) >>= return.fromIntegral
|
||||
coarbitrary x = id
|
||||
|
||||
quickCheckEncoding :: Encoding enc => enc -> IO ()
|
||||
quickCheckEncoding e = do
|
||||
quickCheck (encodingIdentity e)
|
||||
quickCheck (decodingIdentity e)
|
||||
|
||||
encodingIdentity :: Encoding enc => enc -> String -> Property
|
||||
encodingIdentity e str
|
||||
= trivial (null str)
|
||||
$ case encoded of
|
||||
Left err -> trivial True True
|
||||
Right res -> case decodeStrictByteStringExplicit e res of
|
||||
Left err -> property False
|
||||
Right res' -> property (str==res')
|
||||
where
|
||||
encoded = encodeStrictByteStringExplicit e str
|
||||
|
||||
decodingIdentity :: Encoding enc => enc -> [Word8] -> Property
|
||||
decodingIdentity e wrd
|
||||
= trivial (null wrd)
|
||||
$ case decoded of
|
||||
Left err -> trivial True True
|
||||
Right res -> case encodeStrictByteStringExplicit e res of
|
||||
Left err -> property False
|
||||
Right res' -> property (bstr==res')
|
||||
where
|
||||
bstr = BS.pack wrd
|
||||
decoded = decodeStrictByteStringExplicit e bstr
|
||||
259
Test/Tests.hs
259
Test/Tests.hs
@ -1,119 +1,158 @@
|
||||
module Test.Tests where
|
||||
|
||||
import Test.Tester
|
||||
import Data.Encoding
|
||||
import Data.Encoding.ASCII
|
||||
import Data.Encoding.UTF8
|
||||
import Data.Encoding.UTF16
|
||||
import Data.Encoding.UTF32
|
||||
import Data.Encoding.ISO88592
|
||||
import Data.Encoding.ISO88593
|
||||
import Data.Encoding.ISO88594
|
||||
import Data.Encoding.ISO88595
|
||||
import Data.Encoding.ISO88596
|
||||
import Data.Encoding.ISO88597
|
||||
import Data.Encoding.ISO88598
|
||||
import Data.Encoding.ISO88599
|
||||
import Data.Encoding.ISO885910
|
||||
import Data.Encoding.ISO885911
|
||||
import Data.Encoding.ISO885913
|
||||
import Data.Encoding.ISO885914
|
||||
import Data.Encoding.BootString
|
||||
import Test.Tester
|
||||
import Test.HUnit
|
||||
import Test.QuickCheck hiding (test)
|
||||
import Data.Char (ord)
|
||||
|
||||
asciiTests :: Test
|
||||
asciiTests = TestList $ map test $
|
||||
[EncodingTest ASCII
|
||||
"Hello, world!"
|
||||
[0x48,0x65,0x6C,0x6C,0x6F,0x2C,0x20,0x77,0x6F,0x72,0x6C,0x64,0x21]
|
||||
,EncodingError ASCII
|
||||
"\x0041\x2262\x0391\x002E"
|
||||
(HasNoRepresentation '\x2262')
|
||||
]
|
||||
identityTests :: IO ()
|
||||
identityTests = do
|
||||
putStrLn "for UTF8..."
|
||||
quickCheckEncoding UTF8Strict
|
||||
putStrLn "for UTF16..."
|
||||
quickCheckEncoding UTF16BE
|
||||
quickCheckEncoding UTF16LE
|
||||
putStrLn "for UTF32..."
|
||||
quickCheck $ encodingIdentity UTF32
|
||||
putStrLn "for ISO88592..."
|
||||
quickCheckEncoding ISO88592
|
||||
putStrLn "for ISO88593..."
|
||||
quickCheckEncoding ISO88593
|
||||
putStrLn "for ISO88594..."
|
||||
quickCheckEncoding ISO88594
|
||||
putStrLn "for ISO88595..."
|
||||
quickCheckEncoding ISO88595
|
||||
putStrLn "for ISO88596..."
|
||||
quickCheckEncoding ISO88596
|
||||
putStrLn "for ISO88597..."
|
||||
quickCheckEncoding ISO88597
|
||||
putStrLn "for ISO88598..."
|
||||
quickCheckEncoding ISO88598
|
||||
putStrLn "for ISO88599..."
|
||||
quickCheckEncoding ISO88599
|
||||
putStrLn "for ISO885910..."
|
||||
quickCheckEncoding ISO885910
|
||||
putStrLn "for ISO885911..."
|
||||
quickCheckEncoding ISO885911
|
||||
putStrLn "for ISO885913..."
|
||||
quickCheckEncoding ISO885913
|
||||
putStrLn "for ISO885914..."
|
||||
quickCheckEncoding ISO885914
|
||||
putStrLn "for Punycode..."
|
||||
quickCheck $ encodingIdentity punycode
|
||||
|
||||
utf8Tests :: Test
|
||||
utf8Tests = TestList $ map test $
|
||||
-- Simple encoding tests
|
||||
concat [[EncodingTest enc "\x0041\x2262\x0391\x002E"
|
||||
[0x41,0xE2,0x89,0xA2,0xCE,0x91,0x2E]
|
||||
,EncodingTest enc "\xD55C\xAD6D\xC5B4"
|
||||
[0xED,0x95,0x9C,0xEA,0xB5,0xAD,0xEC,0x96,0xB4]
|
||||
,EncodingTest enc "\x65E5\x672C\x8A9E"
|
||||
[0xE6,0x97,0xA5,0xE6,0x9C,0xAC,0xE8,0xAA,0x9E]
|
||||
,EncodingTest enc "\x233B4"
|
||||
[0xF0,0xA3,0x8E,0xB4]
|
||||
,EncodingTest enc ""
|
||||
[]
|
||||
-- First possible sequence of a certain length
|
||||
,EncodingTest enc "\x0000"
|
||||
[0x00]
|
||||
,EncodingTest enc "\x0080"
|
||||
[0xC2,0x80]
|
||||
,EncodingTest enc "\x0800"
|
||||
[0xE0,0xA0,0x80]
|
||||
,EncodingTest enc "\x10000"
|
||||
[0xF0,0x90,0x80,0x80]
|
||||
-- Last possible sequence of a certain length
|
||||
,EncodingTest enc "\x007F"
|
||||
[0x7F]
|
||||
,EncodingTest enc "\x07FF"
|
||||
[0xDF,0xBF]
|
||||
,EncodingTest enc "\xFFFF"
|
||||
[0xEF,0xBF,0xBF]
|
||||
-- Other boundaries
|
||||
,EncodingTest enc "\xD7FF"
|
||||
[0xED,0x9F,0xBF]
|
||||
,EncodingTest enc "\xE000"
|
||||
[0xEE,0x80,0x80]
|
||||
,EncodingTest enc "\xFFFD"
|
||||
[0xEF,0xBF,0xBD]
|
||||
-- Illegal starting characters
|
||||
,DecodingError enc
|
||||
[0x65,0x55,0x85]
|
||||
(IllegalCharacter 0x85)
|
||||
-- Unexpected end
|
||||
,DecodingError enc
|
||||
[0x41,0xE2,0x89,0xA2,0xCE]
|
||||
UnexpectedEnd
|
||||
,DecodingError enc
|
||||
[0x41,0xE2,0x89]
|
||||
UnexpectedEnd
|
||||
,DecodingError enc
|
||||
[0x41,0xE2]
|
||||
UnexpectedEnd]
|
||||
| enc <- [UTF8,UTF8Strict]
|
||||
]++
|
||||
[DecodingError UTF8 [0xFE] (IllegalCharacter 0xFE)
|
||||
,DecodingError UTF8 [0xFF] (IllegalCharacter 0xFF)
|
||||
-- Overlong representations of '/'
|
||||
,DecodingError UTF8Strict [0xC0,0xAF]
|
||||
(IllegalRepresentation [0xC0,0xAF])
|
||||
,DecodingError UTF8Strict [0xE0,0x80,0xAF]
|
||||
(IllegalRepresentation [0xE0,0x80,0xAF])
|
||||
,DecodingError UTF8Strict [0xF0,0x80,0x80,0xAF]
|
||||
(IllegalRepresentation [0xF0,0x80,0x80,0xAF])
|
||||
-- Maximum overlong sequences
|
||||
,DecodingError UTF8Strict [0xC1,0xBF]
|
||||
(IllegalRepresentation [0xC1,0xBF])
|
||||
,DecodingError UTF8Strict [0xE0,0x9F,0xBF]
|
||||
(IllegalRepresentation [0xE0,0x9F,0xBF])
|
||||
,DecodingError UTF8Strict [0xF0,0x8F,0xBF,0xBF]
|
||||
(IllegalRepresentation [0xF0,0x8F,0xBF,0xBF])
|
||||
-- Overlong represenations of '\NUL'
|
||||
,DecodingError UTF8Strict [0xC0,0x80]
|
||||
(IllegalRepresentation [0xC0,0x80])
|
||||
,DecodingError UTF8Strict [0xE0,0x80,0x80]
|
||||
(IllegalRepresentation [0xE0,0x80,0x80])
|
||||
,DecodingError UTF8Strict [0xF0,0x80,0x80,0x80]
|
||||
(IllegalRepresentation [0xF0,0x80,0x80,0x80])
|
||||
-- Invalid extends
|
||||
-- 2 of 2
|
||||
,DecodingError UTF8Strict [0xCC,0x1C,0xE0]
|
||||
(IllegalCharacter 0x1C)
|
||||
-- 2 of 3
|
||||
,DecodingError UTF8Strict [0xE3,0x6C,0xB3]
|
||||
(IllegalCharacter 0x6C)
|
||||
-- 3 of 3
|
||||
,DecodingError UTF8Strict [0xE3,0xB4,0x6D]
|
||||
(IllegalCharacter 0x6D)
|
||||
-- 2 of 4
|
||||
,DecodingError UTF8Strict [0xF2,0x6C,0xB3,0xB3]
|
||||
(IllegalCharacter 0x6C)
|
||||
-- 3 of 4
|
||||
,DecodingError UTF8Strict [0xF2,0xB3,0x6C,0xB3]
|
||||
(IllegalCharacter 0x6C)
|
||||
-- 4 of 4
|
||||
,DecodingError UTF8Strict [0xF2,0xB3,0xB3,0x6C]
|
||||
(IllegalCharacter 0x6C)
|
||||
]
|
||||
utf8Tests = TestList $ map test $ concat
|
||||
[[EncodingTest enc "\x0041\x2262\x0391\x002E"
|
||||
[0x41,0xE2,0x89,0xA2,0xCE,0x91,0x2E]
|
||||
,EncodingTest enc "\xD55C\xAD6D\xC5B4"
|
||||
[0xED,0x95,0x9C,0xEA,0xB5,0xAD,0xEC,0x96,0xB4]
|
||||
,EncodingTest enc "\x65E5\x672C\x8A9E"
|
||||
[0xE6,0x97,0xA5,0xE6,0x9C,0xAC,0xE8,0xAA,0x9E]
|
||||
,EncodingTest enc "\x233B4"
|
||||
[0xF0,0xA3,0x8E,0xB4]
|
||||
,EncodingTest enc ""
|
||||
[]
|
||||
-- First possible sequence of a certain length
|
||||
,EncodingTest enc "\x0000"
|
||||
[0x00]
|
||||
,EncodingTest enc "\x0080"
|
||||
[0xC2,0x80]
|
||||
,EncodingTest enc "\x0800"
|
||||
[0xE0,0xA0,0x80]
|
||||
,EncodingTest enc "\x10000"
|
||||
[0xF0,0x90,0x80,0x80]
|
||||
-- Last possible sequence of a certain length
|
||||
,EncodingTest enc "\x007F"
|
||||
[0x7F]
|
||||
,EncodingTest enc "\x07FF"
|
||||
[0xDF,0xBF]
|
||||
,EncodingTest enc "\xFFFF"
|
||||
[0xEF,0xBF,0xBF]
|
||||
-- Other boundaries
|
||||
,EncodingTest enc "\xD7FF"
|
||||
[0xED,0x9F,0xBF]
|
||||
,EncodingTest enc "\xE000"
|
||||
[0xEE,0x80,0x80]
|
||||
,EncodingTest enc "\xFFFD"
|
||||
[0xEF,0xBF,0xBD]
|
||||
-- Illegal starting characters
|
||||
,DecodingError enc
|
||||
[0x65,0x55,0x85]
|
||||
(IllegalCharacter 0x85)
|
||||
-- Unexpected end
|
||||
,DecodingError enc
|
||||
[0x41,0xE2,0x89,0xA2,0xCE]
|
||||
UnexpectedEnd
|
||||
,DecodingError enc
|
||||
[0x41,0xE2,0x89]
|
||||
UnexpectedEnd
|
||||
,DecodingError enc
|
||||
[0x41,0xE2]
|
||||
UnexpectedEnd
|
||||
]
|
||||
| enc <- [UTF8,UTF8Strict]
|
||||
]++
|
||||
[DecodingError UTF8 [0xFE] (IllegalCharacter 0xFE)
|
||||
,DecodingError UTF8 [0xFF] (IllegalCharacter 0xFF)
|
||||
-- Overlong representations of '/'
|
||||
,DecodingError UTF8Strict [0xC0,0xAF]
|
||||
(IllegalRepresentation [0xC0,0xAF])
|
||||
,DecodingError UTF8Strict [0xE0,0x80,0xAF]
|
||||
(IllegalRepresentation [0xE0,0x80,0xAF])
|
||||
,DecodingError UTF8Strict [0xF0,0x80,0x80,0xAF]
|
||||
(IllegalRepresentation [0xF0,0x80,0x80,0xAF])
|
||||
-- Maximum overlong sequences
|
||||
,DecodingError UTF8Strict [0xC1,0xBF]
|
||||
(IllegalRepresentation [0xC1,0xBF])
|
||||
,DecodingError UTF8Strict [0xE0,0x9F,0xBF]
|
||||
(IllegalRepresentation [0xE0,0x9F,0xBF])
|
||||
,DecodingError UTF8Strict [0xF0,0x8F,0xBF,0xBF]
|
||||
(IllegalRepresentation [0xF0,0x8F,0xBF,0xBF])
|
||||
-- Overlong represenations of '\NUL'
|
||||
,DecodingError UTF8Strict [0xC0,0x80]
|
||||
(IllegalRepresentation [0xC0,0x80])
|
||||
,DecodingError UTF8Strict [0xE0,0x80,0x80]
|
||||
(IllegalRepresentation [0xE0,0x80,0x80])
|
||||
,DecodingError UTF8Strict [0xF0,0x80,0x80,0x80]
|
||||
(IllegalRepresentation [0xF0,0x80,0x80,0x80])
|
||||
-- Invalid extends
|
||||
-- 2 of 2
|
||||
,DecodingError UTF8Strict [0xCC,0x1C,0xE0]
|
||||
(IllegalCharacter 0x1C)
|
||||
-- 2 of 3
|
||||
,DecodingError UTF8Strict [0xE3,0x6C,0xB3]
|
||||
(IllegalCharacter 0x6C)
|
||||
-- 3 of 3
|
||||
,DecodingError UTF8Strict [0xE3,0xB4,0x6D]
|
||||
(IllegalCharacter 0x6D)
|
||||
-- 2 of 4
|
||||
,DecodingError UTF8Strict [0xF2,0x6C,0xB3,0xB3]
|
||||
(IllegalCharacter 0x6C)
|
||||
-- 3 of 4
|
||||
,DecodingError UTF8Strict [0xF2,0xB3,0x6C,0xB3]
|
||||
(IllegalCharacter 0x6C)
|
||||
-- 4 of 4
|
||||
,DecodingError UTF8Strict [0xF2,0xB3,0xB3,0x6C]
|
||||
(IllegalCharacter 0x6C)
|
||||
]
|
||||
|
||||
utf16Tests :: Test
|
||||
utf16Tests = TestList $ map test $
|
||||
@ -129,9 +168,9 @@ utf16Tests = TestList $ map test $
|
||||
[0x02,0xE8]
|
||||
,DecodingError UTF16LE [0x65,0xDC]
|
||||
(IllegalCharacter 0xDC)
|
||||
,DecodingError UTF16BE [0xDC]
|
||||
,DecodingError UTF16BE [0xDC,0x33]
|
||||
(IllegalCharacter 0xDC)
|
||||
,DecodingError UTF16BE [0xD9,0x78,0xDA]
|
||||
,DecodingError UTF16BE [0xD9,0x78,0xDA,0x33]
|
||||
(IllegalCharacter 0xDA)
|
||||
,DecodingError UTF16BE [0xD9,0x78,0xDA,0x66]
|
||||
(IllegalCharacter 0xDA)
|
||||
@ -183,5 +222,9 @@ punycodeTests = TestList $ map test $
|
||||
-- Vietnamese
|
||||
,punyTest "TạisaohọkhôngthểchỉnóitiếngViệt"
|
||||
"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"
|
||||
{-,punyTest "foo\xC2\xAD\xCD\x8F\xE1\xA0\x86\xE1\xA0\x8B\
|
||||
\bar\xE2\x80\x8B\xE2\x81\xA0\
|
||||
\baz\xEF\xB8\x80\xEF\xB8\x88\xEF\xB8\x8F\xEF\xBB\xBF"
|
||||
"foobarbaz"-}
|
||||
]
|
||||
where punyTest str outp = EncodingTest punycode str (map (fromIntegral.ord) outp)
|
||||
where punyTest str outp = EncodingTest punycode str (map (fromIntegral.ord) outp)
|
||||
@ -1,63 +1,37 @@
|
||||
Name: encoding
|
||||
Version: 0.4.1
|
||||
Version: 0.5.0
|
||||
Author: Henning Günther
|
||||
Maintainer: h.guenther@tu-bs.de
|
||||
License: BSD3
|
||||
Synopsis: A library for various character encodings
|
||||
Stability: alpha
|
||||
Description:
|
||||
Haskell has excellect handling of unicode, the Char type covers all unicode chars. Unfortunatly, there's no possibility to read or write something to the outer world in an encoding other than ascii due to the lack of support for encodings. This library should help with that.
|
||||
Category: Codec
|
||||
Homepage: http://code.haskell.org/encoding/
|
||||
Cabal-Version: >=1.2
|
||||
Build-Type: Simple
|
||||
Extra-Source-Files:
|
||||
8859-2.TXT
|
||||
8859-3.TXT
|
||||
8859-4.TXT
|
||||
8859-5.TXT
|
||||
8859-6.TXT
|
||||
8859-7.TXT
|
||||
8859-8.TXT
|
||||
8859-9.TXT
|
||||
8859-10.TXT
|
||||
8859-11.TXT
|
||||
8859-13.TXT
|
||||
8859-14.TXT
|
||||
8859-15.TXT
|
||||
8859-16.TXT
|
||||
CP1250.TXT
|
||||
CP1251.TXT
|
||||
CP1252.TXT
|
||||
CP1253.TXT
|
||||
CP1254.TXT
|
||||
CP1255.TXT
|
||||
CP1256.TXT
|
||||
CP1257.TXT
|
||||
CP1258.TXT
|
||||
gb-18030-2000.xml
|
||||
system_encoding.h
|
||||
create_gb18030_data.sh
|
||||
NEWS
|
||||
|
||||
Flag splitBase
|
||||
description: Choose the new smaller, split-up base package.
|
||||
|
||||
Library
|
||||
if flag(splitBase)
|
||||
Build-Depends: bytestring, base >= 3, template-haskell, containers, array, regex-compat
|
||||
Build-Depends: bytestring, base >= 3, binary, mtl, containers, extensible-exceptions, array, template-haskell, regex-compat
|
||||
else
|
||||
Build-Depends: base < 3, template-haskell, regex-compat
|
||||
Extensions: TemplateHaskell,CPP,ExistentialQuantification,ForeignFunctionInterface
|
||||
C-Sources: system_encoding.c
|
||||
Include-Dirs: .
|
||||
Install-Includes: system_encoding.h
|
||||
Build-Depends: base < 3, binary, extensible-exceptions, template-haskell
|
||||
|
||||
Exposed-Modules:
|
||||
Data.Encoding
|
||||
Data.Encoding.ByteSource
|
||||
Data.Encoding.ByteSink
|
||||
Data.Encoding.Exception
|
||||
Control.Throws
|
||||
Data.Encoding.ASCII
|
||||
Data.Encoding.UTF8
|
||||
Data.Encoding.UTF16
|
||||
Data.Encoding.UTF32
|
||||
Data.Encoding.KOI8R
|
||||
Data.Encoding.KOI8U
|
||||
Data.Encoding.ISO88591
|
||||
Data.Encoding.ISO88592
|
||||
Data.Encoding.ISO88593
|
||||
@ -73,7 +47,6 @@ Library
|
||||
Data.Encoding.ISO885914
|
||||
Data.Encoding.ISO885915
|
||||
Data.Encoding.ISO885916
|
||||
Data.Encoding.BootString
|
||||
Data.Encoding.CP1250
|
||||
Data.Encoding.CP1251
|
||||
Data.Encoding.CP1252
|
||||
@ -83,12 +56,7 @@ Library
|
||||
Data.Encoding.CP1256
|
||||
Data.Encoding.CP1257
|
||||
Data.Encoding.CP1258
|
||||
Data.Encoding.KOI8R
|
||||
Data.Encoding.KOI8U
|
||||
Data.Encoding.GB18030
|
||||
System.IO.Encoding
|
||||
Other-Modules:
|
||||
Data.Encoding.Base
|
||||
Data.Encoding.GB18030Data
|
||||
Data.Encoding.Helper.Template
|
||||
|
||||
Data.Encoding.BootString
|
||||
System.IO.Encoding
|
||||
Loading…
Reference in New Issue
Block a user