Implemented GB18030 encoding

This is a bit of a hack, because the static lookup data this encoding requires brings template haskell to it's knees. So I've got a program that generates a haskell module file from the XML mapping.

darcs-hash:20070823034759-a4fee-883359c8951d4376fc2d783cd7be352d6c5b2111
This commit is contained in:
Henning Guenther 2007-08-22 20:47:59 -07:00
parent f069504db0
commit dc47984bf6
9 changed files with 31415 additions and 9 deletions

View File

@ -37,6 +37,7 @@ import Data.Encoding.CP1256
import Data.Encoding.CP1257 import Data.Encoding.CP1257
import Data.Encoding.CP1258 import Data.Encoding.CP1258
import Data.Encoding.KOI8R import Data.Encoding.KOI8R
import Data.Encoding.GB18030
-- | An untyped encoding. Used in 'System.IO.Encoding.getSystemEncoding'. -- | An untyped encoding. Used in 'System.IO.Encoding.getSystemEncoding'.
data DynEncoding = forall t. Encoding t => DynEncoding t data DynEncoding = forall t. Encoding t => DynEncoding t
@ -80,4 +81,5 @@ encodingFromString "CP1255" = DynEncoding CP1255
encodingFromString "CP1256" = DynEncoding CP1256 encodingFromString "CP1256" = DynEncoding CP1256
encodingFromString "CP1257" = DynEncoding CP1257 encodingFromString "CP1257" = DynEncoding CP1257
encodingFromString "CP1258" = DynEncoding CP1258 encodingFromString "CP1258" = DynEncoding CP1258
encodingFromString "GB18030" = DynEncoding GB18030
encodingFromString str = error $ "Unknown encoding: "++show str encodingFromString str = error $ "Unknown encoding: "++show str

View File

@ -19,6 +19,7 @@ module Data.Encoding.Base
import Data.Array(array) import Data.Array(array)
import Data.ByteString (ByteString,unfoldrN,unfoldr,length,index,unpack) import Data.ByteString (ByteString,unfoldrN,unfoldr,length,index,unpack)
import qualified Data.ByteString.Lazy as LBS import qualified Data.ByteString.Lazy as LBS
import Data.Encoding.Helper.Template
import Data.ByteString.Base(unsafeIndex) import Data.ByteString.Base(unsafeIndex)
import Data.Map (Map,fromList,lookup) import Data.Map (Map,fromList,lookup)
import Data.Char(chr) import Data.Char(chr)
@ -100,6 +101,7 @@ data EncodeState
| Put1 !Word8 | Put1 !Word8
| Put2 !Word8 !Word8 | Put2 !Word8 !Word8
| Put3 !Word8 !Word8 !Word8 | Put3 !Word8 !Word8 !Word8
deriving Show
-- | This exception type is thrown whenever something went wrong during the -- | This exception type is thrown whenever something went wrong during the
-- encoding-process. -- encoding-process.
@ -115,20 +117,13 @@ data DecodingException
-- byte that couldn't be decoded. -- byte that couldn't be decoded.
| UnexpectedEnd -- ^ more bytes were needed to allow a | UnexpectedEnd -- ^ more bytes were needed to allow a
-- successfull decoding. -- successfull decoding.
| OutOfRange -- ^ the decoded value was out of the unicode range
deriving (Show,Typeable) deriving (Show,Typeable)
decodingArray :: FilePath -> Q Exp decodingArray :: FilePath -> Q Exp
-- Haddock hates template haskell...
#ifndef __HADDOCK__
decodingArray file = do decodingArray file = do
trans <- runIO (readTranslation file) trans <- runIO (readTranslation file)
return $ AppE createCharArray trans 0 255
(AppE
(VarE 'array)
(TupE [LitE $ IntegerL 0,LitE $ IntegerL 255]))
(ListE [ TupE [LitE $ IntegerL from,LitE $ CharL to]
| (from,to) <- trans ])
#endif
encodingMap :: FilePath -> Q Exp encodingMap :: FilePath -> Q Exp
#ifndef __HADDOCK__ #ifndef __HADDOCK__

259
Data/Encoding/GB18030.hs Normal file
View File

@ -0,0 +1,259 @@
{- | GB18030 is a chinese character encoding that is mandatory in china (if you
- don\'t implement it, you\'re not allowed to sell your software there).
-}
module Data.Encoding.GB18030
(GB18030(..))
where
import Control.Exception
import Data.Char
import Data.Word
import Data.Bits
import Data.Encoding.Base
import qualified Data.ByteString as BS
import Data.ByteString.Base (ByteString,c2w,w2c,unsafeIndex)
import Data.Encoding.GB18030Data
data GB18030 = GB18030
instance Encoding GB18030 where
encode _ = encodeMultibyte encodeGB
encodeLazy _ = encodeMultibyteLazy encodeGB
decode _ = decodeMultibyte decodeGB
decodeLazy _ = decodeMultibyteLazy decodeGB
encodable _ ch = ch <= '\x10FFFF'
decodable _ = checkValidity
data DecodingState
= Valid
| Invalid
| Second
| Third
| Fourth
deriving Eq
checkValidity :: ByteString -> Bool
checkValidity bs = BS.foldl' (\st w -> case st of
Invalid -> Invalid
Valid | w<=0x80 -> Valid
| w<=0xFE -> Second
| otherwise -> Invalid
Second | w< 0x30 -> Invalid
| w<=0x39 -> Third
| w<=0x7E -> Valid
| w==0x7F -> Invalid
| w<=0xFE -> Valid
| otherwise -> Invalid
Third | w< 0x81 -> Invalid
| w<=0xFE -> Fourth
| otherwise -> Invalid
Fourth | w< 0x30 -> Invalid
| w<=0x39 -> Valid
| otherwise -> Invalid
) Valid bs == Valid
{- How this works: The nested if-structures form an binary tree over the
- encoding range.
-}
encodeGB :: Char -> (Word8,EncodeState)
encodeGB ch = if ch<='\x4946' -- 1
then (if ch<='\x4055' -- 2
then (if ch<='\x2E80' -- 3
then (if ch<='\x200F' -- 4
then (if ch<'\x0452'
then arr 0x0000 arr1
else range range1)
else (if ch<'\x2643'
then arr 0x2010 arr2
else range range2))
else (if ch<='\x3917' -- 4
then (if ch<'\x361B'
then arr 0x2E81 arr3
else range range3)
else (if ch<'\x3CE1'
then arr 0x3918 arr4
else range range4)))
else (if ch<='\x464B' -- 3
then (if ch<='\x4336' -- 4
then (if ch<'\x4160'
then arr 0x4056 arr5
else range range5)
else (if ch<'\x44D7'
then arr 0x4337 arr6
else range range6))
else (if ch<'\x478E'
then arr 0x464C arr7
else range range7)))
else (if ch<='\xF92B' -- 2
then (if ch<='\xD7FF' -- 3
then (if ch<='\x4C76' -- 4
then (if ch<'\x49B8'
then arr 0x4947 arr8
else range range8)
else (if ch<'\x9FA6'
then arr 0x4C77 arr9
else range range9))
else (if ch<'\xE865'
then arr 0xD800 arr10
else range range10))
else (if ch<='\xFFFF' -- 3
then (if ch<='\xFE2F' -- 4
then (if ch<'\xFA2A'
then arr 0xF92C arr11
else range range11)
else (if ch<'\xFFE6'
then arr 0xFE30 arr12
else range range12))
else (if ch<='\x10FFFF' -- 4
then range range13
else throwDyn (HasNoRepresentation ch))))
where
range r = let
(w1,w2,w3,w4) = delinear (ord ch + r)
in (w1,Put3 w2 w3 w4)
arr off a = let
ind = (ord ch - off)*5
w1 = unsafeIndex a (ind+1)
w2 = unsafeIndex a (ind+2)
w3 = unsafeIndex a (ind+3)
w4 = unsafeIndex a (ind+4)
in (w1,case unsafeIndex a ind of
1 -> Done
2 -> Put1 w2
3 -> Put2 w2 w3
4 -> Put3 w2 w3 w4)
linear :: Word8 -> Word8 -> Word8 -> Word8 -> Int
linear w1 w2 w3 w4
= (fromIntegral (w4-0x30))
+ (fromIntegral (w3-0x81))*10
+ (fromIntegral (w2-0x30))*1260
+ (fromIntegral (w1-0x81))*12600
linear2 :: Word8 -> Word8 -> Int
linear2 w1 w2 = (fromIntegral (w2 - (if w2<=0x7E
then 0x40
else 0x41)))
+ (fromIntegral (w1-0x81))*190
delinear :: Int -> (Word8,Word8,Word8,Word8)
delinear n = let
(w1,n1) = n `divMod` 12600
(w2,n2) = n1 `divMod` 1260
(w3,n3) = n2 `divMod` 10
w4 = n3
in (fromIntegral w1+0x81
,fromIntegral w2+0x30
,fromIntegral w3+0x81
,fromIntegral w4+0x30)
decodeGB :: [Word8] -> (Char,[Word8])
decodeGB (w1:rst)
| w1 <=0x80 = (w2c w1,rst) -- it's ascii
| w1 <=0xFE = case rst of
w2:rst2
| w2 < 0x30 -> throwDyn (IllegalCharacter w2)
| w2 <=0x39 -> case rst2 of
w3:rst3
| w3 < 0x81 -> throwDyn (IllegalCharacter w3)
| w3 <=0xFE -> case rst3 of
w4:rst4
| w4 < 0x30 -> throwDyn (IllegalCharacter w4)
| w4 <=0x39 -> let
v = linear w1 w2 w3 w4
in (decodeGBFour v,rst4)
| otherwise -> throwDyn (IllegalCharacter w4)
[] -> throwDyn UnexpectedEnd
| otherwise -> throwDyn (IllegalCharacter w3)
[] -> throwDyn UnexpectedEnd
| w2 <=0x7E -> (decodeGBTwo (linear2 w1 w2),rst2)
| w2 ==0x7F -> throwDyn (IllegalCharacter w2)
| w2 <=0xFE -> (decodeGBTwo (linear2 w1 w2),rst2)
| otherwise -> throwDyn (IllegalCharacter w2)
[] -> throwDyn UnexpectedEnd
| otherwise = throwDyn (IllegalCharacter w1)
decodeGBTwo :: Int -> Char
decodeGBTwo n = let
rn = n*2
w1 = unsafeIndex rrarr rn
w2 = unsafeIndex rrarr (rn+1)
in chr $ ((fromIntegral w1) `shiftL` 8) .|. (fromIntegral w2)
decodeGBFour :: Int -> Char
decodeGBFour v = if v<=17858 -- 1
then (if v<=15582 -- 2
then (if v<=11328 -- 3
then (if v<=7921 -- 4
then (if v<820
then arr 0 rarr1
else range range1)
else (if v<9219
then arr 7922 rarr2
else range range2))
else (if v<=13737 -- 4
then (if v<12973
then arr 11329 rarr3
else range range3)
else (if v<14698
then arr 13738 rarr4
else range range4)))
else (if v<=17101 -- 3
then (if v<=16317 -- 4
then (if v<15847
then arr 15583 rarr5
else range range5)
else (if v<16729
then arr 16318 rarr6
else range range6))
else (if v<17418
then arr 17102 rarr7
else range range7)))
else (if v<=37844 -- 2
then (if v<=33468 -- 3
then (if v<=18663 -- 4
then (if v<17961
then arr 17859 rarr8
else range range8)
else (if v<19043
then arr 18664 rarr9
else range range9))
else (if v<33550
then arr 33469 rarr10
else range range10))
else (if v<=39419 -- 3
then (if v<=39107 -- 4
then (if v<38078
then arr 37845 rarr11
else range range11)
else (if v<39394
then arr 39108 rarr12
else range range12))
else (if v<=1237575 && v>=189000
then range range13
else throwDyn OutOfRange)))
where
arr off a = let
v' = (v-off)*2
w1 = unsafeIndex a v'
w2 = unsafeIndex a (v'+1)
in chr $ ((fromIntegral w1) `shiftL` 8)
.|. (fromIntegral w2)
range r = chr (v-r)
range1,range2,range3,range4,range5,range6,range7,range8,range9,range10,range11,range12,range13 :: Int
range1 = -286
range2 = -576
range3 = -878
range4 = -887
range5 = -889
range6 = -894
range7 = -900
range8 = -911
range9 = -21827
range10 = -25943
range11 = -25964
range12 = -26116
range13 = 123464

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,107 @@
{- This module is used to create a haskell file with static address-arrays in it -}
module Data.Encoding.Helper.Data where
import Data.Char (ord)
import Data.Word (Word8)
import Data.Ord (comparing)
import Data.Bits (shiftR)
import Data.List (sortBy,genericLength)
import Data.Encoding.Helper.XML
linear :: Word8 -> Word8 -> Word8 -> Word8 -> Int
linear w1 w2 w3 w4
= (fromIntegral (w4-0x30))
+ (fromIntegral (w3-0x81))*10
+ (fromIntegral (w2-0x30))*1260
+ (fromIntegral (w1-0x81))*12600
linear2 :: Word8 -> Word8 -> Int
linear2 w1 w2 = (fromIntegral (w2 - (if w2<=0x7E
then 0x40
else 0x41)))
+ (fromIntegral (w1-0x81))*190
createStandardModule :: IO ()
createStandardModule = do
let fn = "gb-18030-2000.xml"
str <- readFile fn
writeFile "Data/Encoding/GB18030Data.hs" $ createModuleFromFile fn str standardRanges standardRRanges
standardRanges =
[('\x0000','\x0451')
,('\x2010','\x2642')
,('\x2E81','\x361A')
,('\x3918','\x3CE0')
,('\x4056','\x415F')
,('\x4337','\x44D6')
,('\x464C','\x478D')
,('\x4947','\x49B7')
,('\x4C77','\x9FA5')
,('\xD800','\xE864')
,('\xF92C','\xFA29')
,('\xFE30','\xFFE5')
]
standardRRanges =
[( 0, 819)
,( 7922, 9218)
,(11329,12972)
,(13738,14697)
,(15583,15846)
,(16318,16728)
,(17102,17417)
,(17859,17960)
,(18664,19042)
,(33469,33549)
,(37845,38077)
,(39108,39393)
,(39420,188999)]
createModuleFromFile :: String -> String -> [(Char,Char)] -> [(Int,Int)] -> String
createModuleFromFile name str = createModule (readDecodeTable name str)
createModule :: [(Char,[Word8])] -> [(Char,Char)] -> [(Int,Int)] -> String
createModule mp ranges rranges = unlines $
["module Data.Encoding.GB18030Data where","","import Data.ByteString.Base"]
++ (createAddrVars "arr" (map (uncurry $ createAddr mp) ranges))
++ (createAddrVars "rarr" (map (uncurry $ createRAddr4 mp) rranges))
++ (createAddrVar "rrarr" (createRAddr2 mp))
createAddrVars :: String -> [[Word8]] -> [String]
createAddrVars base conts = concatMap (\(n,cont) ->
createAddrVar (base++show n) cont) (zip [1..] conts)
createAddrVar :: String -> [Word8] -> [String]
createAddrVar name cont =
[""
,name++" :: ByteString"
,name++" = unsafePackAddress "++show (length cont)++" \""++addr cont++"\"#"
]
createAddr :: [(Char,[Word8])] -> Char -> Char -> [Word8]
createAddr mp f t = let
lst = sortBy (comparing fst) [el | el@(ch,_) <- mp, ch>=f, ch<=t]
in concatMap (\(ch,seq) -> let
l = length seq
in [fromIntegral l]++seq++(replicate (4-l) 0)) lst
createRAddr2 :: [(Char,[Word8])] -> [Word8]
createRAddr2 mp = let
lst = sortBy (comparing snd)
[ (ch,v) | (ch,[w1,w2]) <- mp,let v = linear2 w1 w2]
in concatMap (\(ch,_) -> let i = ord ch
in [fromIntegral (i `shiftR` 8)
,fromIntegral i]) lst
createRAddr4 :: [(Char,[Word8])] -> Int -> Int -> [Word8]
createRAddr4 mp f t = let
lst = sortBy (comparing snd)
[ (ch,v) | (ch,[w1,w2,w3,w4]) <- mp,
let v = linear w1 w2 w3 w4, v>=f, v<=t ]
in concatMap (\(ch,_) -> let i = ord ch
in [fromIntegral (i `shiftR` 8)
,fromIntegral i]) lst
addr :: [Word8] -> String
addr = concatMap (\w -> "\\"++show w)

View File

@ -0,0 +1,41 @@
module Data.Encoding.Helper.XML where
import Data.Char
import Data.List (find)
import Data.Word
import Data.Maybe (mapMaybe)
import Text.XML.HaXml.Parse
import Text.XML.HaXml.Types
import Text.XML.HaXml.Verbatim
import Numeric
readDecodeTable :: String -> String -> [(Char,[Word8])]
readDecodeTable name str = let
Document _ _ (Elem root_name _ conts) _ = xmlParse name str
in concat $ mapMaybe findAssignments conts
findAssignments :: Content i -> Maybe [(Char,[Word8])]
findAssignments (CElem (Elem "assignments" _ conts) _)
= Just $ mapMaybe findAssignment conts
findAssignments _ = Nothing
findAssignment :: Content i -> Maybe (Char,[Word8])
findAssignment (CElem (Elem "a" attrs _) _) = do
u <- lookup "u" attrs
b <- lookup "b" attrs
return (chr $ readHexInt (showAttValue u),parseBinary b)
findAssignment _ = Nothing
parseBinary :: AttValue -> [Word8]
parseBinary val = map (fromIntegral.readHexInt) (words (showAttValue val))
showAttValue :: AttValue -> String
showAttValue (AttValue lst) = concat $ map (\el -> case el of
Left str -> str
Right ref -> verbatim ref) lst
readHexInt :: String -> Int
readHexInt str = case find (\x -> snd x == "") (readHex str) of
Nothing -> error "Not a hex"
Just (x,_) -> x

3
create_gb18030_data.sh Executable file
View File

@ -0,0 +1,3 @@
#!/bin/bash
ghc Data.Encoding.Helper.Data -e "createStandardModule"

View File

@ -46,9 +46,11 @@ Exposed-Modules:
Data.Encoding.CP1257 Data.Encoding.CP1257
Data.Encoding.CP1258 Data.Encoding.CP1258
Data.Encoding.KOI8R Data.Encoding.KOI8R
Data.Encoding.GB18030
System.IO.Encoding System.IO.Encoding
Other-Modules: Other-Modules:
Data.Encoding.Base Data.Encoding.Base
Data.Encoding.GB18030Data
Extra-Source-Files: Extra-Source-Files:
8859-2.TXT 8859-2.TXT
8859-3.TXT 8859-3.TXT

30916
gb-18030-2000.xml Normal file

File diff suppressed because it is too large Load Diff