encoding/Data/Encoding/GB18030.hs
Henning Guenther eeee054f1e Rewrite to support more sources and changing the encoding dynamically
Now it's possible to change the character encoding while de-/encoding.
Also, it's possible to use any data structure as a source or target of the de-/encoding process.

darcs-hash:20090221203100-a4fee-6da31f2e37c30a3f5cd5f10af71984209488bb0b
2009-02-21 12:31:00 -08:00

237 lines
7.2 KiB
Haskell

{-# LANGUAGE CPP,DeriveDataTypeable #-}
{- | GB18030 is a chinese character encoding that is mandatory in china (if you
- don\'t implement it, you\'re not allowed to sell your software there).
-}
module Data.Encoding.GB18030
(GB18030(..))
where
import Control.Throws
import Data.Char (chr,ord)
import Data.Word
import Data.Bits
import Data.Encoding.Base
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Encoding.Exception
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import Data.Typeable
#if __GLASGOW_HASKELL__>=608
import Data.ByteString.Unsafe (unsafeIndex)
#else
import Data.ByteString.Base (unsafeIndex)
#endif
import Data.Encoding.GB18030Data
data GB18030 = GB18030 deriving (Eq,Show,Typeable)
instance Encoding GB18030 where
decodeChar _ = do
w1 <- fetchWord8
case () of
_
| w1 <= 0x80 -> return (chr $ fromIntegral w1) -- it's ascii
| w1 <= 0xFE -> do
w2 <- fetchWord8
case () of
_
| w2 < 0x30 -> throwException (IllegalCharacter w2)
| w2 <= 0x39 -> do
w3 <- fetchWord8
case () of
_
| w3 < 0x81 -> throwException (IllegalCharacter w3)
| w3 <= 0xFE -> do
w4 <- fetchWord8
case () of
_
| w4 < 0x30 -> throwException (IllegalCharacter w4)
| w4 <= 0x39 -> decodeGBFour $ linear w1 w2 w3 w4
| otherwise -> throwException (IllegalCharacter w4)
| otherwise -> throwException (IllegalCharacter w3)
| w2 <= 0x7E -> return $ decodeGBTwo $ linear2 w1 w2
| w2 == 0x7F -> throwException (IllegalCharacter w2)
| w2 <= 0xFE -> return $ decodeGBTwo $ linear2 w1 w2
| otherwise -> throwException (IllegalCharacter w2)
| otherwise -> throwException (IllegalCharacter w1)
{- How this works: The nested if-structures form an binary tree over the
- encoding range.
-}
encodeChar _ ch = if ch<='\x4946' -- 1
then (if ch<='\x4055' -- 2
then (if ch<='\x2E80' -- 3
then (if ch<='\x200F' -- 4
then (if ch<'\x0452'
then arr 0x0000 arr1
else range range1)
else (if ch<'\x2643'
then arr 0x2010 arr2
else range range2))
else (if ch<='\x3917' -- 4
then (if ch<'\x361B'
then arr 0x2E81 arr3
else range range3)
else (if ch<'\x3CE1'
then arr 0x3918 arr4
else range range4)))
else (if ch<='\x464B' -- 3
then (if ch<='\x4336' -- 4
then (if ch<'\x4160'
then arr 0x4056 arr5
else range range5)
else (if ch<'\x44D7'
then arr 0x4337 arr6
else range range6))
else (if ch<'\x478E'
then arr 0x464C arr7
else range range7)))
else (if ch<='\xF92B' -- 2
then (if ch<='\xD7FF' -- 3
then (if ch<='\x4C76' -- 4
then (if ch<'\x49B8'
then arr 0x4947 arr8
else range range8)
else (if ch<'\x9FA6'
then arr 0x4C77 arr9
else range range9))
else (if ch<'\xE865'
then arr 0xD800 arr10
else range range10))
else (if ch<='\xFFFF' -- 3
then (if ch<='\xFE2F' -- 4
then (if ch<'\xFA2A'
then arr 0xF92C arr11
else range range11)
else (if ch<'\xFFE6'
then arr 0xFE30 arr12
else range range12))
else (if ch<='\x10FFFF' -- 4
then range range13
else throwException (HasNoRepresentation ch))))
where
range r = let (w1,w2,w3,w4) = delinear (ord ch + r)
in pushWord8 w1 >> pushWord8 w2 >> pushWord8 w3 >> pushWord8 w4
arr off a = let ind = (ord ch - off)*5
w1 = unsafeIndex a (ind+1)
w2 = unsafeIndex a (ind+2)
w3 = unsafeIndex a (ind+3)
w4 = unsafeIndex a (ind+4)
in do
pushWord8 w1
case unsafeIndex a ind of
1 -> return ()
2 -> pushWord8 w2
3 -> pushWord8 w2 >> pushWord8 w3
4 -> pushWord8 w2 >> pushWord8 w3 >> pushWord8 w4
linear :: Word8 -> Word8 -> Word8 -> Word8 -> Int
linear w1 w2 w3 w4
= (fromIntegral (w4-0x30))
+ (fromIntegral (w3-0x81))*10
+ (fromIntegral (w2-0x30))*1260
+ (fromIntegral (w1-0x81))*12600
linear2 :: Word8 -> Word8 -> Int
linear2 w1 w2 = (fromIntegral (w2 - (if w2<=0x7E
then 0x40
else 0x41)))
+ (fromIntegral (w1-0x81))*190
delinear :: Int -> (Word8,Word8,Word8,Word8)
delinear n = let
(w1,n1) = n `divMod` 12600
(w2,n2) = n1 `divMod` 1260
(w3,n3) = n2 `divMod` 10
w4 = n3
in (fromIntegral w1+0x81
,fromIntegral w2+0x30
,fromIntegral w3+0x81
,fromIntegral w4+0x30)
decodeGBTwo :: Int -> Char
decodeGBTwo n = let
rn = n*2
w1 = unsafeIndex rrarr rn
w2 = unsafeIndex rrarr (rn+1)
in chr $ ((fromIntegral w1) `shiftL` 8) .|. (fromIntegral w2)
decodeGBFour :: ByteSource m => Int -> m Char
decodeGBFour v = if v<=17858 -- 1
then (if v<=15582 -- 2
then (if v<=11328 -- 3
then (if v<=7921 -- 4
then (if v<820
then arr 0 rarr1
else range range1)
else (if v<9219
then arr 7922 rarr2
else range range2))
else (if v<=13737 -- 4
then (if v<12973
then arr 11329 rarr3
else range range3)
else (if v<14698
then arr 13738 rarr4
else range range4)))
else (if v<=17101 -- 3
then (if v<=16317 -- 4
then (if v<15847
then arr 15583 rarr5
else range range5)
else (if v<16729
then arr 16318 rarr6
else range range6))
else (if v<17418
then arr 17102 rarr7
else range range7)))
else (if v<=37844 -- 2
then (if v<=33468 -- 3
then (if v<=18663 -- 4
then (if v<17961
then arr 17859 rarr8
else range range8)
else (if v<19043
then arr 18664 rarr9
else range range9))
else (if v<33550
then arr 33469 rarr10
else range range10))
else (if v<=39419 -- 3
then (if v<=39107 -- 4
then (if v<38078
then arr 37845 rarr11
else range range11)
else (if v<39394
then arr 39108 rarr12
else range range12))
else (if v<=1237575 && v>=189000
then range range13
else throwException OutOfRange)))
where
arr off a = let
v' = (v-off)*2
w1 = unsafeIndex a v'
w2 = unsafeIndex a (v'+1)
in return $ chr $ ((fromIntegral w1) `shiftL` 8)
.|. (fromIntegral w2)
range r = return $ chr (v-r)
range1,range2,range3,range4,range5,range6,range7,range8,range9,range10,range11,range12,range13 :: Int
range1 = -286
range2 = -576
range3 = -878
range4 = -887
range5 = -889
range6 = -894
range7 = -900
range8 = -911
range9 = -21827
range10 = -25943
range11 = -25964
range12 = -26116
range13 = 123464