encoding/Data/Encoding/Base.hs
Henning Guenther dc47984bf6 Implemented GB18030 encoding
This is a bit of a hack, because the static lookup data this encoding requires brings template haskell to it's knees. So I've got a program that generates a haskell module file from the XML mapping.

darcs-hash:20070823034759-a4fee-883359c8951d4376fc2d783cd7be352d6c5b2111
2007-08-22 20:47:59 -07:00

150 lines
4.9 KiB
Haskell

{-# OPTIONS -fglasgow-exts #-}
module Data.Encoding.Base
(Encoding(..)
,EncodeState(..)
,encodeMultibyte
,encodeMultibyteLazy
,decodeMultibyte
,decodeMultibyteLazy
,encodeSinglebyte
,encodeSinglebyteLazy
,decodeSinglebyte
,EncodingException(..)
,DecodingException(..)
,decodingArray
,encodingMap)
where
import Data.Array(array)
import Data.ByteString (ByteString,unfoldrN,unfoldr,length,index,unpack)
import qualified Data.ByteString.Lazy as LBS
import Data.Encoding.Helper.Template
import Data.ByteString.Base(unsafeIndex)
import Data.Map (Map,fromList,lookup)
import Data.Char(chr)
import Data.Maybe(mapMaybe)
import Data.Typeable
import Data.Word
import Prelude hiding (lookup,length)
import qualified Prelude
import Control.Exception
import Data.Dynamic(toDyn)
import Language.Haskell.TH
{- | Represents an encoding, supporting various methods of de- and encoding.
Minimal complete definition: encode, decode
-}
class Encoding enc where
-- | Encode a 'String' into a strict 'ByteString'. Throws the
-- 'HasNoRepresentation'-Exception if it encounters an unrepresentable
-- character.
encode :: enc -> String -> ByteString
-- | Encode a 'String' into a lazy 'Data.ByteString.Lazy.ByteString'.
encodeLazy :: enc -> String -> LBS.ByteString
encodeLazy e str = LBS.fromChunks [encode e str]
-- | Whether or not the given 'Char' is representable in this encoding. Default: 'True'.
encodable :: enc -> Char -> Bool
encodable _ _ = True
-- | Decode a strict 'ByteString' into a 'String'. If the string is not
-- decodable, a 'DecodingException' is thrown.
decode :: enc -> ByteString -> String
decodeLazy :: enc -> LBS.ByteString -> String
decodeLazy e str = concatMap (decode e) (LBS.toChunks str)
-- | Whether or no a given 'ByteString' is decodable. Default: 'True'.
decodable :: enc -> ByteString -> Bool
decodable _ _ = True
encodeMultibyte :: (Char -> (Word8,EncodeState)) -> String -> ByteString
encodeMultibyte f str = unfoldr (\st -> case st of
(Done,[]) -> Nothing
(Done,x:xs) -> let (w,st) = f x in Just (w,(st,xs))
(Put1 w1,xs) -> Just (w1,(Done,xs))
(Put2 w1 w2,xs) -> Just (w1,(Put1 w2,xs))
(Put3 w1 w2 w3,xs) -> Just (w1,(Put2 w2 w3,xs))) (Done,str)
encodeMultibyteLazy :: (Char -> (Word8,EncodeState)) -> String -> LBS.ByteString
encodeMultibyteLazy f str = LBS.unfoldr (\ ~(st,rest) -> case st of
Done -> case rest of
[] -> Nothing
x:xs -> let ~(w,st) = f x in Just (w,(st,xs))
Put1 w1 -> Just (w1,(Done,rest))
Put2 w1 w2 -> Just (w1,(Put1 w2,rest))
Put3 w1 w2 w3 -> Just (w1,(Put2 w2 w3,rest))) (Done,str)
decodeMultibyte :: ([Word8] -> (Char,[Word8])) -> ByteString -> String
decodeMultibyte f str = decode (unpack str)
where
decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst)
decodeMultibyteLazy :: ([Word8] -> (Char,[Word8])) -> LBS.ByteString -> String
decodeMultibyteLazy f str = decode (LBS.unpack str)
where
decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst)
encodeSinglebyte :: (Char -> Word8) -> String -> ByteString
encodeSinglebyte f str = fst $ unfoldrN (Prelude.length str) (\st -> case st of
[] -> Nothing
(x:xs) -> Just (f x,xs)) str
encodeSinglebyteLazy :: (Char -> Word8) -> String -> LBS.ByteString
encodeSinglebyteLazy f str = LBS.unfoldr (\st -> case st of
[] -> Nothing
(x:xs) -> Just (f x,xs)) str
decodeSinglebyte :: (Word8 -> Char) -> ByteString -> String
decodeSinglebyte f str = map f (unpack str)
data EncodeState
= Done
| Put1 !Word8
| Put2 !Word8 !Word8
| Put3 !Word8 !Word8 !Word8
deriving Show
-- | This exception type is thrown whenever something went wrong during the
-- encoding-process.
data EncodingException
= HasNoRepresentation Char -- ^ Thrown if a specific character
-- is not representable in an encoding.
deriving (Show,Typeable)
-- | This exception type is thrown whenever something went wrong during the
-- decoding-process.
data DecodingException
= IllegalCharacter Word8 -- ^ The sequence contained an illegal
-- byte that couldn't be decoded.
| UnexpectedEnd -- ^ more bytes were needed to allow a
-- successfull decoding.
| OutOfRange -- ^ the decoded value was out of the unicode range
deriving (Show,Typeable)
decodingArray :: FilePath -> Q Exp
decodingArray file = do
trans <- runIO (readTranslation file)
createCharArray trans 0 255
encodingMap :: FilePath -> Q Exp
#ifndef __HADDOCK__
encodingMap file = do
trans <- runIO (readTranslation file)
return $ AppE
(VarE 'fromList)
(ListE [ TupE [LitE $ CharL to,LitE $ IntegerL from]
| (from,to) <- trans])
#endif
readTranslation :: FilePath -> IO [(Integer,Char)]
readTranslation file = do
cont <- readFile file
return $ mapMaybe (\ln -> case ln of
[] -> Nothing
('#':xs) -> Nothing
_ -> case words ln of
(src:"#UNDEFINED":_) -> Just (read src,'\xFFFD') -- XXX: Find a better way to handle this
(src:trg:_) -> Just (read src,chr $ read trg)
_ -> Nothing
) (lines cont)