91 lines
3.1 KiB
Haskell
91 lines
3.1 KiB
Haskell
{-# LANGUAGE DeriveDataTypeable #-}
|
|
{- | This module implements UTF-32 encoding and decoding.
|
|
See <http://en.wikipedia.org/wiki/UTF-32> for more information.
|
|
-}
|
|
module Data.Encoding.UTF32
|
|
(UTF32(..))
|
|
where
|
|
|
|
import Data.Bits
|
|
import Data.Char (ord,chr)
|
|
import Data.Encoding.Base
|
|
import Data.Word
|
|
import Control.Exception (throwDyn)
|
|
import Data.Typeable
|
|
import qualified Data.ByteString as BS
|
|
import qualified Data.ByteString.Lazy as LBS
|
|
|
|
data UTF32
|
|
= UTF32 -- ^ Detects big or little endian through the use of the BOM (Byte Order Mask) character. Defaults to big endian if not present.
|
|
| UTF32BE -- ^ Encodes and decodes using the big endian encoding.
|
|
| UTF32LE -- ^ Encodes and decodes using the little endian encoding.
|
|
deriving (Eq,Show,Typeable)
|
|
|
|
bom :: Char
|
|
bom = '\xFEFF'
|
|
|
|
instance Encoding UTF32 where
|
|
encode UTF32 str = encodeMultibyte encodeUTF32be (bom:str)
|
|
encode UTF32LE str = encodeMultibyte encodeUTF32le str
|
|
encode UTF32BE str = encodeMultibyte encodeUTF32be str
|
|
encodeLazy UTF32 str = encodeMultibyteLazy encodeUTF32be (bom:str)
|
|
encodeLazy UTF32LE str = encodeMultibyteLazy encodeUTF32le str
|
|
encodeLazy UTF32BE str = encodeMultibyteLazy encodeUTF32be str
|
|
encodable _ c = ord c < 0x0010FFFF
|
|
decode UTF32 str = let
|
|
(start,rest) = BS.splitAt 4 str
|
|
in case BS.unpack start of
|
|
[0x00,0x00,0xFE,0xFF] -> decode UTF32BE rest
|
|
[0xFE,0xFF,0x00,0x00] -> decode UTF32LE rest
|
|
_ -> decode UTF32BE str
|
|
decode UTF32LE str = decodeMultibyte decodeUTF32le str
|
|
decode UTF32BE str = decodeMultibyte decodeUTF32be str
|
|
decodeLazy UTF32 str = let
|
|
(start,rest) = LBS.splitAt 4 str
|
|
in case LBS.unpack start of
|
|
[0x00,0x00,0xFE,0xFF] -> decodeLazy UTF32BE rest
|
|
[0xFE,0xFF,0x00,0x00] -> decodeLazy UTF32LE rest
|
|
_ -> decodeLazy UTF32BE str
|
|
decodeLazy UTF32LE str = decodeMultibyteLazy decodeUTF32le str
|
|
decodeLazy UTF32BE str = decodeMultibyteLazy decodeUTF32be str
|
|
|
|
encodeUTF32be :: Char -> (Word8,EncodeState)
|
|
encodeUTF32be ch = let
|
|
w = ord ch
|
|
w1 = fromIntegral $ w `shiftR` 24
|
|
w2 = fromIntegral $ w `shiftR` 16
|
|
w3 = fromIntegral $ w `shiftR` 8
|
|
w4 = fromIntegral $ w
|
|
in (w1,Put3 w2 w3 w4)
|
|
|
|
encodeUTF32le :: Char -> (Word8,EncodeState)
|
|
encodeUTF32le ch = let
|
|
w = ord ch
|
|
w1 = fromIntegral $ w `shiftR` 24
|
|
w2 = fromIntegral $ w `shiftR` 16
|
|
w3 = fromIntegral $ w `shiftR` 8
|
|
w4 = fromIntegral $ w
|
|
in (w4,Put3 w3 w2 w1)
|
|
|
|
decodeUTF32be :: [Word8] -> (Char,[Word8])
|
|
decodeUTF32be (w1:w2:w3:w4:rest) = let
|
|
v = (fromIntegral w1 `shiftL` 24) .|.
|
|
(fromIntegral w2 `shiftL` 16) .|.
|
|
(fromIntegral w3 `shiftL` 8) .|.
|
|
(fromIntegral w4)
|
|
in if v < 0x0010FFFF
|
|
then (chr v,rest)
|
|
else throwDyn (IllegalRepresentation [w1,w2,w3,w4])
|
|
decodeUTF32be _ = throwDyn UnexpectedEnd
|
|
|
|
decodeUTF32le :: [Word8] -> (Char,[Word8])
|
|
decodeUTF32le (w1:w2:w3:w4:rest) = let
|
|
v = (fromIntegral w4 `shiftL` 24) .|.
|
|
(fromIntegral w3 `shiftL` 16) .|.
|
|
(fromIntegral w2 `shiftL` 8) .|.
|
|
(fromIntegral w1)
|
|
in if v < 0x0010FFFF
|
|
then (chr v,rest)
|
|
else throwDyn (IllegalRepresentation [w1,w2,w3,w4])
|
|
decodeUTF32le _ = throwDyn UnexpectedEnd
|