81 lines
2.4 KiB
Haskell
81 lines
2.4 KiB
Haskell
{- | This module implements UTF-8 encoding and decoding as in RFC 3629.
|
|
-}
|
|
module Data.Encoding.UTF8
|
|
(UTF8(..)) where
|
|
|
|
import Data.Bits
|
|
import Data.Char (ord,chr)
|
|
import Data.Encoding.Base
|
|
import Data.ByteString
|
|
import Data.Word
|
|
import Prelude hiding (length)
|
|
import Control.Exception
|
|
|
|
data UTF8 = UTF8 deriving Show
|
|
|
|
encodeUTF8 :: Char -> (Word8,EncodeState)
|
|
encodeUTF8 x
|
|
| n<=0x0000007F = (v,Done)
|
|
| n<=0x000007FF = (fromIntegral $ 0xC0 .|. (n `shiftR` 6)
|
|
,Put1 (0x80 .|. (v .&. 0x3F)))
|
|
| n<=0x0000FFFF = (fromIntegral $ 0xE0 .|. (n `shiftR` 12)
|
|
,Put2 (fromIntegral $
|
|
0x80 .|. ((n `shiftR` 6) .&. 0x3F))
|
|
(fromIntegral $
|
|
0x80 .|. (n .&. 0x3F)))
|
|
| n<=0x0010FFFF = (fromIntegral $ 0xF0 .|. (n `shiftR` 18)
|
|
,Put3 (fromIntegral $
|
|
0x80 .|. ((n `shiftR` 12) .&. 0x3F))
|
|
(fromIntegral $
|
|
0x80 .|. ((n `shiftR` 6) .&. 0x3F))
|
|
(fromIntegral $
|
|
0x80 .|. (n .&. 0x3F)))
|
|
| otherwise = throwDyn (HasNoRepresentation x)
|
|
where
|
|
n = ord x
|
|
v = fromIntegral $ ord x
|
|
|
|
decodeUTF8 :: [Word8] -> (Char,[Word8])
|
|
decodeUTF8 ~(w1:rest1)
|
|
| w1<=0x7F = (chr $ fromIntegral w1,rest1)
|
|
| w1<=0xBF = throwDyn (IllegalCharacter w1)
|
|
| w1<=0xDF = case rest1 of
|
|
(w2:rest2) -> (chr $ ((fromIntegral $ w1 .&. 0x1F) `shiftL` 6)
|
|
.|. (fromIntegral (w2 .&. 0x3F)),rest2)
|
|
_ -> throwDyn UnexpectedEnd
|
|
| w1<=0xEF = case rest1 of
|
|
(w2:w3:rest3) -> (chr $ ((fromIntegral $ w1 .&. 0x0F) `shiftL` 12)
|
|
.|. ((fromIntegral $ w2 .&. 0x3F) `shiftL` 6)
|
|
.|. (fromIntegral $ w3 .&. 0x3F),rest3)
|
|
_ -> throwDyn UnexpectedEnd
|
|
| w1<=0xF7 = case rest1 of
|
|
(w2:w3:w4:rest4) -> (chr $ ((fromIntegral $ w1 .&. 0x07) `shiftL` 18)
|
|
.|. ((fromIntegral $ w2 .&. 0x3F) `shiftL` 12)
|
|
.|. ((fromIntegral $ w3 .&. 0x3F) `shiftL` 6)
|
|
.|. (fromIntegral $ w4 .&. 0x3F),rest4)
|
|
| otherwise = throwDyn (IllegalCharacter w1)
|
|
|
|
data UTF8AnalyzeState
|
|
= Skip !Int
|
|
| Ok
|
|
| Failed
|
|
deriving Eq
|
|
|
|
instance Encoding UTF8 where
|
|
encode _ = encodeMultibyte encodeUTF8
|
|
encodeLazy _ = encodeMultibyteLazy encodeUTF8
|
|
encodable _ c = ord c <= 0x0010FFFF
|
|
decode _ = decodeMultibyte decodeUTF8
|
|
decodeLazy _ = decodeMultibyteLazy decodeUTF8
|
|
decodable _ str = (foldl' (\st w -> case st of
|
|
Ok | w<=0x7F -> Ok
|
|
| w<=0xBF -> Failed
|
|
| w<=0xDF -> Skip 0
|
|
| w<=0xEF -> Skip 1
|
|
| w<=0xF7 -> Skip 2
|
|
| otherwise -> Failed
|
|
Failed -> Failed
|
|
Skip n -> if w .&. 0xC0 == 0x80
|
|
then (if n == 0 then Ok else Skip (n-1))
|
|
else Failed) Ok str) == Ok
|