155 lines
5.0 KiB
Haskell
155 lines
5.0 KiB
Haskell
{-# OPTIONS -fglasgow-exts #-}
|
|
|
|
module Data.Encoding.Base
|
|
(Encoding(..)
|
|
,EncodeState(..)
|
|
,encodeMultibyte
|
|
,encodeMultibyteLazy
|
|
,decodeMultibyte
|
|
,decodeMultibyteLazy
|
|
,encodeSinglebyte
|
|
,encodeSinglebyteLazy
|
|
,decodeSinglebyte
|
|
,EncodingException(..)
|
|
,DecodingException(..)
|
|
,decodingArray
|
|
,encodingMap)
|
|
where
|
|
|
|
import Data.Array(array)
|
|
import Data.ByteString (ByteString,unfoldrN,unfoldr,length,index,unpack)
|
|
import qualified Data.ByteString.Lazy as LBS
|
|
import Data.ByteString.Base(unsafeIndex)
|
|
import Data.Map (Map,fromList,lookup)
|
|
import Data.Char(chr)
|
|
import Data.Maybe(mapMaybe)
|
|
import Data.Typeable
|
|
import Data.Word
|
|
import Prelude hiding (lookup,length)
|
|
import qualified Prelude
|
|
import Control.Exception
|
|
import Data.Dynamic(toDyn)
|
|
|
|
import Language.Haskell.TH
|
|
|
|
{- | Represents an encoding, supporting various methods of de- and encoding.
|
|
Minimal complete definition: encode, decode
|
|
-}
|
|
class Encoding enc where
|
|
-- | Encode a 'String' into a strict 'ByteString'. Throws the
|
|
-- 'HasNoRepresentation'-Exception if it encounters an unrepresentable
|
|
-- character.
|
|
encode :: enc -> String -> ByteString
|
|
-- | Encode a 'String' into a lazy 'Data.ByteString.Lazy.ByteString'.
|
|
encodeLazy :: enc -> String -> LBS.ByteString
|
|
encodeLazy e str = LBS.fromChunks [encode e str]
|
|
-- | Whether or not the given 'Char' is representable in this encoding. Default: 'True'.
|
|
encodable :: enc -> Char -> Bool
|
|
encodable _ _ = True
|
|
-- | Decode a strict 'ByteString' into a 'String'. If the string is not
|
|
-- decodable, a 'DecodingException' is thrown.
|
|
decode :: enc -> ByteString -> String
|
|
decodeLazy :: enc -> LBS.ByteString -> String
|
|
decodeLazy e str = concatMap (decode e) (LBS.toChunks str)
|
|
-- | Whether or no a given 'ByteString' is decodable. Default: 'True'.
|
|
decodable :: enc -> ByteString -> Bool
|
|
decodable _ _ = True
|
|
|
|
encodeMultibyte :: (Char -> (Word8,EncodeState)) -> String -> ByteString
|
|
encodeMultibyte f str = unfoldr (\st -> case st of
|
|
(Done,[]) -> Nothing
|
|
(Done,x:xs) -> let (w,st) = f x in Just (w,(st,xs))
|
|
(Put1 w1,xs) -> Just (w1,(Done,xs))
|
|
(Put2 w1 w2,xs) -> Just (w1,(Put1 w2,xs))
|
|
(Put3 w1 w2 w3,xs) -> Just (w1,(Put2 w2 w3,xs))) (Done,str)
|
|
|
|
encodeMultibyteLazy :: (Char -> (Word8,EncodeState)) -> String -> LBS.ByteString
|
|
encodeMultibyteLazy f str = LBS.unfoldr (\ ~(st,rest) -> case st of
|
|
Done -> case rest of
|
|
[] -> Nothing
|
|
x:xs -> let ~(w,st) = f x in Just (w,(st,xs))
|
|
Put1 w1 -> Just (w1,(Done,rest))
|
|
Put2 w1 w2 -> Just (w1,(Put1 w2,rest))
|
|
Put3 w1 w2 w3 -> Just (w1,(Put2 w2 w3,rest))) (Done,str)
|
|
|
|
decodeMultibyte :: ([Word8] -> (Char,[Word8])) -> ByteString -> String
|
|
decodeMultibyte f str = decode (unpack str)
|
|
where
|
|
decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst)
|
|
|
|
decodeMultibyteLazy :: ([Word8] -> (Char,[Word8])) -> LBS.ByteString -> String
|
|
decodeMultibyteLazy f str = decode (LBS.unpack str)
|
|
where
|
|
decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst)
|
|
|
|
encodeSinglebyte :: (Char -> Word8) -> String -> ByteString
|
|
encodeSinglebyte f str = fst $ unfoldrN (Prelude.length str) (\st -> case st of
|
|
[] -> Nothing
|
|
(x:xs) -> Just (f x,xs)) str
|
|
|
|
encodeSinglebyteLazy :: (Char -> Word8) -> String -> LBS.ByteString
|
|
encodeSinglebyteLazy f str = LBS.unfoldr (\st -> case st of
|
|
[] -> Nothing
|
|
(x:xs) -> Just (f x,xs)) str
|
|
|
|
decodeSinglebyte :: (Word8 -> Char) -> ByteString -> String
|
|
decodeSinglebyte f str = map f (unpack str)
|
|
|
|
data EncodeState
|
|
= Done
|
|
| Put1 !Word8
|
|
| Put2 !Word8 !Word8
|
|
| Put3 !Word8 !Word8 !Word8
|
|
|
|
-- | This exception type is thrown whenever something went wrong during the
|
|
-- encoding-process.
|
|
data EncodingException
|
|
= HasNoRepresentation Char -- ^ Thrown if a specific character
|
|
-- is not representable in an encoding.
|
|
deriving (Show,Typeable)
|
|
|
|
-- | This exception type is thrown whenever something went wrong during the
|
|
-- decoding-process.
|
|
data DecodingException
|
|
= IllegalCharacter Word8 -- ^ The sequence contained an illegal
|
|
-- byte that couldn't be decoded.
|
|
| UnexpectedEnd -- ^ more bytes were needed to allow a
|
|
-- successfull decoding.
|
|
deriving (Show,Typeable)
|
|
|
|
decodingArray :: FilePath -> Q Exp
|
|
-- Haddock hates template haskell...
|
|
#ifndef __HADDOCK__
|
|
decodingArray file = do
|
|
trans <- runIO (readTranslation file)
|
|
return $ AppE
|
|
(AppE
|
|
(VarE 'array)
|
|
(TupE [LitE $ IntegerL 0,LitE $ IntegerL 255]))
|
|
(ListE [ TupE [LitE $ IntegerL from,LitE $ CharL to]
|
|
| (from,to) <- trans ])
|
|
#endif
|
|
|
|
encodingMap :: FilePath -> Q Exp
|
|
#ifndef __HADDOCK__
|
|
encodingMap file = do
|
|
trans <- runIO (readTranslation file)
|
|
return $ AppE
|
|
(VarE 'fromList)
|
|
(ListE [ TupE [LitE $ CharL to,LitE $ IntegerL from]
|
|
| (from,to) <- trans])
|
|
#endif
|
|
|
|
readTranslation :: FilePath -> IO [(Integer,Char)]
|
|
readTranslation file = do
|
|
cont <- readFile file
|
|
return $ mapMaybe (\ln -> case ln of
|
|
[] -> Nothing
|
|
('#':xs) -> Nothing
|
|
_ -> case words ln of
|
|
(src:"#UNDEFINED":_) -> Just (read src,'\xFFFD') -- XXX: Find a better way to handle this
|
|
(src:trg:_) -> Just (read src,chr $ read trg)
|
|
_ -> Nothing
|
|
) (lines cont)
|
|
|