238 lines
7.2 KiB
Haskell
238 lines
7.2 KiB
Haskell
{-# LANGUAGE CPP,DeriveDataTypeable #-}
|
|
{- | GB18030 is a chinese character encoding that is mandatory in china (if you
|
|
- don\'t implement it, you\'re not allowed to sell your software there).
|
|
-}
|
|
|
|
module Data.Encoding.GB18030
|
|
(GB18030(..))
|
|
where
|
|
|
|
import Control.Throws
|
|
import Data.Char (chr,ord)
|
|
import Data.Word
|
|
import Data.Bits
|
|
import Data.Encoding.Base
|
|
import Data.Encoding.ByteSource
|
|
import Data.Encoding.ByteSink
|
|
import Data.Encoding.Exception
|
|
import Data.ByteString (ByteString)
|
|
import qualified Data.ByteString as BS
|
|
import Data.Typeable
|
|
|
|
#if __GLASGOW_HASKELL__>=608
|
|
import Data.ByteString.Unsafe (unsafeIndex)
|
|
#else
|
|
import Data.ByteString.Base (unsafeIndex)
|
|
#endif
|
|
|
|
import Data.Encoding.GB18030Data
|
|
|
|
data GB18030 = GB18030 deriving (Eq,Show,Typeable)
|
|
|
|
instance Encoding GB18030 where
|
|
decodeChar _ = do
|
|
w1 <- fetchWord8
|
|
case () of
|
|
_
|
|
| w1 <= 0x80 -> return (chr $ fromIntegral w1) -- it's ascii
|
|
| w1 <= 0xFE -> do
|
|
w2 <- fetchWord8
|
|
case () of
|
|
_
|
|
| w2 < 0x30 -> throwException (IllegalCharacter w2)
|
|
| w2 <= 0x39 -> do
|
|
w3 <- fetchWord8
|
|
case () of
|
|
_
|
|
| w3 < 0x81 -> throwException (IllegalCharacter w3)
|
|
| w3 <= 0xFE -> do
|
|
w4 <- fetchWord8
|
|
case () of
|
|
_
|
|
| w4 < 0x30 -> throwException (IllegalCharacter w4)
|
|
| w4 <= 0x39 -> decodeGBFour $ linear w1 w2 w3 w4
|
|
| otherwise -> throwException (IllegalCharacter w4)
|
|
| otherwise -> throwException (IllegalCharacter w3)
|
|
| w2 <= 0x7E -> return $ decodeGBTwo $ linear2 w1 w2
|
|
| w2 == 0x7F -> throwException (IllegalCharacter w2)
|
|
| w2 <= 0xFE -> return $ decodeGBTwo $ linear2 w1 w2
|
|
| otherwise -> throwException (IllegalCharacter w2)
|
|
| otherwise -> throwException (IllegalCharacter w1)
|
|
{- How this works: The nested if-structures form an binary tree over the
|
|
- encoding range.
|
|
-}
|
|
encodeChar _ ch = if ch<='\x4946' -- 1
|
|
then (if ch<='\x4055' -- 2
|
|
then (if ch<='\x2E80' -- 3
|
|
then (if ch<='\x200F' -- 4
|
|
then (if ch<'\x0452'
|
|
then arr 0x0000 arr1
|
|
else range range1)
|
|
else (if ch<'\x2643'
|
|
then arr 0x2010 arr2
|
|
else range range2))
|
|
else (if ch<='\x3917' -- 4
|
|
then (if ch<'\x361B'
|
|
then arr 0x2E81 arr3
|
|
else range range3)
|
|
else (if ch<'\x3CE1'
|
|
then arr 0x3918 arr4
|
|
else range range4)))
|
|
else (if ch<='\x464B' -- 3
|
|
then (if ch<='\x4336' -- 4
|
|
then (if ch<'\x4160'
|
|
then arr 0x4056 arr5
|
|
else range range5)
|
|
else (if ch<'\x44D7'
|
|
then arr 0x4337 arr6
|
|
else range range6))
|
|
else (if ch<'\x478E'
|
|
then arr 0x464C arr7
|
|
else range range7)))
|
|
else (if ch<='\xF92B' -- 2
|
|
then (if ch<='\xD7FF' -- 3
|
|
then (if ch<='\x4C76' -- 4
|
|
then (if ch<'\x49B8'
|
|
then arr 0x4947 arr8
|
|
else range range8)
|
|
else (if ch<'\x9FA6'
|
|
then arr 0x4C77 arr9
|
|
else range range9))
|
|
else (if ch<'\xE865'
|
|
then arr 0xD800 arr10
|
|
else range range10))
|
|
else (if ch<='\xFFFF' -- 3
|
|
then (if ch<='\xFE2F' -- 4
|
|
then (if ch<'\xFA2A'
|
|
then arr 0xF92C arr11
|
|
else range range11)
|
|
else (if ch<'\xFFE6'
|
|
then arr 0xFE30 arr12
|
|
else range range12))
|
|
else (if ch<='\x10FFFF' -- 4
|
|
then range range13
|
|
else throwException (HasNoRepresentation ch))))
|
|
where
|
|
range r = let (w1,w2,w3,w4) = delinear (ord ch + r)
|
|
in pushWord8 w1 >> pushWord8 w2 >> pushWord8 w3 >> pushWord8 w4
|
|
arr off a = let ind = (ord ch - off)*5
|
|
w1 = unsafeIndex a (ind+1)
|
|
w2 = unsafeIndex a (ind+2)
|
|
w3 = unsafeIndex a (ind+3)
|
|
w4 = unsafeIndex a (ind+4)
|
|
in do
|
|
pushWord8 w1
|
|
case unsafeIndex a ind of
|
|
1 -> return ()
|
|
2 -> pushWord8 w2
|
|
3 -> pushWord8 w2 >> pushWord8 w3
|
|
4 -> pushWord8 w2 >> pushWord8 w3 >> pushWord8 w4
|
|
encodeable _ c = c <= '\x10FFFF'
|
|
|
|
linear :: Word8 -> Word8 -> Word8 -> Word8 -> Int
|
|
linear w1 w2 w3 w4
|
|
= (fromIntegral (w4-0x30))
|
|
+ (fromIntegral (w3-0x81))*10
|
|
+ (fromIntegral (w2-0x30))*1260
|
|
+ (fromIntegral (w1-0x81))*12600
|
|
|
|
linear2 :: Word8 -> Word8 -> Int
|
|
linear2 w1 w2 = (fromIntegral (w2 - (if w2<=0x7E
|
|
then 0x40
|
|
else 0x41)))
|
|
+ (fromIntegral (w1-0x81))*190
|
|
|
|
delinear :: Int -> (Word8,Word8,Word8,Word8)
|
|
delinear n = let
|
|
(w1,n1) = n `divMod` 12600
|
|
(w2,n2) = n1 `divMod` 1260
|
|
(w3,n3) = n2 `divMod` 10
|
|
w4 = n3
|
|
in (fromIntegral w1+0x81
|
|
,fromIntegral w2+0x30
|
|
,fromIntegral w3+0x81
|
|
,fromIntegral w4+0x30)
|
|
|
|
decodeGBTwo :: Int -> Char
|
|
decodeGBTwo n = let
|
|
rn = n*2
|
|
w1 = unsafeIndex rrarr rn
|
|
w2 = unsafeIndex rrarr (rn+1)
|
|
in chr $ ((fromIntegral w1) `shiftL` 8) .|. (fromIntegral w2)
|
|
|
|
decodeGBFour :: ByteSource m => Int -> m Char
|
|
decodeGBFour v = if v<=17858 -- 1
|
|
then (if v<=15582 -- 2
|
|
then (if v<=11328 -- 3
|
|
then (if v<=7921 -- 4
|
|
then (if v<820
|
|
then arr 0 rarr1
|
|
else range range1)
|
|
else (if v<9219
|
|
then arr 7922 rarr2
|
|
else range range2))
|
|
else (if v<=13737 -- 4
|
|
then (if v<12973
|
|
then arr 11329 rarr3
|
|
else range range3)
|
|
else (if v<14698
|
|
then arr 13738 rarr4
|
|
else range range4)))
|
|
else (if v<=17101 -- 3
|
|
then (if v<=16317 -- 4
|
|
then (if v<15847
|
|
then arr 15583 rarr5
|
|
else range range5)
|
|
else (if v<16729
|
|
then arr 16318 rarr6
|
|
else range range6))
|
|
else (if v<17418
|
|
then arr 17102 rarr7
|
|
else range range7)))
|
|
else (if v<=37844 -- 2
|
|
then (if v<=33468 -- 3
|
|
then (if v<=18663 -- 4
|
|
then (if v<17961
|
|
then arr 17859 rarr8
|
|
else range range8)
|
|
else (if v<19043
|
|
then arr 18664 rarr9
|
|
else range range9))
|
|
else (if v<33550
|
|
then arr 33469 rarr10
|
|
else range range10))
|
|
else (if v<=39419 -- 3
|
|
then (if v<=39107 -- 4
|
|
then (if v<38078
|
|
then arr 37845 rarr11
|
|
else range range11)
|
|
else (if v<39394
|
|
then arr 39108 rarr12
|
|
else range range12))
|
|
else (if v<=1237575 && v>=189000
|
|
then range range13
|
|
else throwException OutOfRange)))
|
|
where
|
|
arr off a = let
|
|
v' = (v-off)*2
|
|
w1 = unsafeIndex a v'
|
|
w2 = unsafeIndex a (v'+1)
|
|
in return $ chr $ ((fromIntegral w1) `shiftL` 8)
|
|
.|. (fromIntegral w2)
|
|
range r = return $ chr (v-r)
|
|
|
|
range1,range2,range3,range4,range5,range6,range7,range8,range9,range10,range11,range12,range13 :: Int
|
|
range1 = -286
|
|
range2 = -576
|
|
range3 = -878
|
|
range4 = -887
|
|
range5 = -889
|
|
range6 = -894
|
|
range7 = -900
|
|
range8 = -911
|
|
range9 = -21827
|
|
range10 = -25943
|
|
range11 = -25964
|
|
range12 = -26116
|
|
range13 = 123464
|