Implemented GB18030 encoding
This is a bit of a hack, because the static lookup data this encoding requires brings template haskell to it's knees. So I've got a program that generates a haskell module file from the XML mapping. darcs-hash:20070823034759-a4fee-883359c8951d4376fc2d783cd7be352d6c5b2111
This commit is contained in:
parent
f069504db0
commit
dc47984bf6
@ -37,6 +37,7 @@ import Data.Encoding.CP1256
|
||||
import Data.Encoding.CP1257
|
||||
import Data.Encoding.CP1258
|
||||
import Data.Encoding.KOI8R
|
||||
import Data.Encoding.GB18030
|
||||
|
||||
-- | An untyped encoding. Used in 'System.IO.Encoding.getSystemEncoding'.
|
||||
data DynEncoding = forall t. Encoding t => DynEncoding t
|
||||
@ -80,4 +81,5 @@ encodingFromString "CP1255" = DynEncoding CP1255
|
||||
encodingFromString "CP1256" = DynEncoding CP1256
|
||||
encodingFromString "CP1257" = DynEncoding CP1257
|
||||
encodingFromString "CP1258" = DynEncoding CP1258
|
||||
encodingFromString "GB18030" = DynEncoding GB18030
|
||||
encodingFromString str = error $ "Unknown encoding: "++show str
|
||||
|
||||
@ -19,6 +19,7 @@ module Data.Encoding.Base
|
||||
import Data.Array(array)
|
||||
import Data.ByteString (ByteString,unfoldrN,unfoldr,length,index,unpack)
|
||||
import qualified Data.ByteString.Lazy as LBS
|
||||
import Data.Encoding.Helper.Template
|
||||
import Data.ByteString.Base(unsafeIndex)
|
||||
import Data.Map (Map,fromList,lookup)
|
||||
import Data.Char(chr)
|
||||
@ -100,6 +101,7 @@ data EncodeState
|
||||
| Put1 !Word8
|
||||
| Put2 !Word8 !Word8
|
||||
| Put3 !Word8 !Word8 !Word8
|
||||
deriving Show
|
||||
|
||||
-- | This exception type is thrown whenever something went wrong during the
|
||||
-- encoding-process.
|
||||
@ -115,20 +117,13 @@ data DecodingException
|
||||
-- byte that couldn't be decoded.
|
||||
| UnexpectedEnd -- ^ more bytes were needed to allow a
|
||||
-- successfull decoding.
|
||||
| OutOfRange -- ^ the decoded value was out of the unicode range
|
||||
deriving (Show,Typeable)
|
||||
|
||||
decodingArray :: FilePath -> Q Exp
|
||||
-- Haddock hates template haskell...
|
||||
#ifndef __HADDOCK__
|
||||
decodingArray file = do
|
||||
trans <- runIO (readTranslation file)
|
||||
return $ AppE
|
||||
(AppE
|
||||
(VarE 'array)
|
||||
(TupE [LitE $ IntegerL 0,LitE $ IntegerL 255]))
|
||||
(ListE [ TupE [LitE $ IntegerL from,LitE $ CharL to]
|
||||
| (from,to) <- trans ])
|
||||
#endif
|
||||
createCharArray trans 0 255
|
||||
|
||||
encodingMap :: FilePath -> Q Exp
|
||||
#ifndef __HADDOCK__
|
||||
|
||||
259
Data/Encoding/GB18030.hs
Normal file
259
Data/Encoding/GB18030.hs
Normal file
@ -0,0 +1,259 @@
|
||||
{- | GB18030 is a chinese character encoding that is mandatory in china (if you
|
||||
- don\'t implement it, you\'re not allowed to sell your software there).
|
||||
-}
|
||||
|
||||
module Data.Encoding.GB18030
|
||||
(GB18030(..))
|
||||
where
|
||||
|
||||
import Control.Exception
|
||||
import Data.Char
|
||||
import Data.Word
|
||||
import Data.Bits
|
||||
import Data.Encoding.Base
|
||||
import qualified Data.ByteString as BS
|
||||
import Data.ByteString.Base (ByteString,c2w,w2c,unsafeIndex)
|
||||
|
||||
import Data.Encoding.GB18030Data
|
||||
|
||||
data GB18030 = GB18030
|
||||
|
||||
instance Encoding GB18030 where
|
||||
encode _ = encodeMultibyte encodeGB
|
||||
encodeLazy _ = encodeMultibyteLazy encodeGB
|
||||
decode _ = decodeMultibyte decodeGB
|
||||
decodeLazy _ = decodeMultibyteLazy decodeGB
|
||||
encodable _ ch = ch <= '\x10FFFF'
|
||||
decodable _ = checkValidity
|
||||
|
||||
data DecodingState
|
||||
= Valid
|
||||
| Invalid
|
||||
| Second
|
||||
| Third
|
||||
| Fourth
|
||||
deriving Eq
|
||||
|
||||
checkValidity :: ByteString -> Bool
|
||||
checkValidity bs = BS.foldl' (\st w -> case st of
|
||||
Invalid -> Invalid
|
||||
Valid | w<=0x80 -> Valid
|
||||
| w<=0xFE -> Second
|
||||
| otherwise -> Invalid
|
||||
Second | w< 0x30 -> Invalid
|
||||
| w<=0x39 -> Third
|
||||
| w<=0x7E -> Valid
|
||||
| w==0x7F -> Invalid
|
||||
| w<=0xFE -> Valid
|
||||
| otherwise -> Invalid
|
||||
Third | w< 0x81 -> Invalid
|
||||
| w<=0xFE -> Fourth
|
||||
| otherwise -> Invalid
|
||||
Fourth | w< 0x30 -> Invalid
|
||||
| w<=0x39 -> Valid
|
||||
| otherwise -> Invalid
|
||||
) Valid bs == Valid
|
||||
|
||||
{- How this works: The nested if-structures form an binary tree over the
|
||||
- encoding range.
|
||||
-}
|
||||
encodeGB :: Char -> (Word8,EncodeState)
|
||||
encodeGB ch = if ch<='\x4946' -- 1
|
||||
then (if ch<='\x4055' -- 2
|
||||
then (if ch<='\x2E80' -- 3
|
||||
then (if ch<='\x200F' -- 4
|
||||
then (if ch<'\x0452'
|
||||
then arr 0x0000 arr1
|
||||
else range range1)
|
||||
else (if ch<'\x2643'
|
||||
then arr 0x2010 arr2
|
||||
else range range2))
|
||||
else (if ch<='\x3917' -- 4
|
||||
then (if ch<'\x361B'
|
||||
then arr 0x2E81 arr3
|
||||
else range range3)
|
||||
else (if ch<'\x3CE1'
|
||||
then arr 0x3918 arr4
|
||||
else range range4)))
|
||||
else (if ch<='\x464B' -- 3
|
||||
then (if ch<='\x4336' -- 4
|
||||
then (if ch<'\x4160'
|
||||
then arr 0x4056 arr5
|
||||
else range range5)
|
||||
else (if ch<'\x44D7'
|
||||
then arr 0x4337 arr6
|
||||
else range range6))
|
||||
else (if ch<'\x478E'
|
||||
then arr 0x464C arr7
|
||||
else range range7)))
|
||||
else (if ch<='\xF92B' -- 2
|
||||
then (if ch<='\xD7FF' -- 3
|
||||
then (if ch<='\x4C76' -- 4
|
||||
then (if ch<'\x49B8'
|
||||
then arr 0x4947 arr8
|
||||
else range range8)
|
||||
else (if ch<'\x9FA6'
|
||||
then arr 0x4C77 arr9
|
||||
else range range9))
|
||||
else (if ch<'\xE865'
|
||||
then arr 0xD800 arr10
|
||||
else range range10))
|
||||
else (if ch<='\xFFFF' -- 3
|
||||
then (if ch<='\xFE2F' -- 4
|
||||
then (if ch<'\xFA2A'
|
||||
then arr 0xF92C arr11
|
||||
else range range11)
|
||||
else (if ch<'\xFFE6'
|
||||
then arr 0xFE30 arr12
|
||||
else range range12))
|
||||
else (if ch<='\x10FFFF' -- 4
|
||||
then range range13
|
||||
else throwDyn (HasNoRepresentation ch))))
|
||||
where
|
||||
range r = let
|
||||
(w1,w2,w3,w4) = delinear (ord ch + r)
|
||||
in (w1,Put3 w2 w3 w4)
|
||||
arr off a = let
|
||||
ind = (ord ch - off)*5
|
||||
w1 = unsafeIndex a (ind+1)
|
||||
w2 = unsafeIndex a (ind+2)
|
||||
w3 = unsafeIndex a (ind+3)
|
||||
w4 = unsafeIndex a (ind+4)
|
||||
in (w1,case unsafeIndex a ind of
|
||||
1 -> Done
|
||||
2 -> Put1 w2
|
||||
3 -> Put2 w2 w3
|
||||
4 -> Put3 w2 w3 w4)
|
||||
|
||||
linear :: Word8 -> Word8 -> Word8 -> Word8 -> Int
|
||||
linear w1 w2 w3 w4
|
||||
= (fromIntegral (w4-0x30))
|
||||
+ (fromIntegral (w3-0x81))*10
|
||||
+ (fromIntegral (w2-0x30))*1260
|
||||
+ (fromIntegral (w1-0x81))*12600
|
||||
|
||||
linear2 :: Word8 -> Word8 -> Int
|
||||
linear2 w1 w2 = (fromIntegral (w2 - (if w2<=0x7E
|
||||
then 0x40
|
||||
else 0x41)))
|
||||
+ (fromIntegral (w1-0x81))*190
|
||||
|
||||
delinear :: Int -> (Word8,Word8,Word8,Word8)
|
||||
delinear n = let
|
||||
(w1,n1) = n `divMod` 12600
|
||||
(w2,n2) = n1 `divMod` 1260
|
||||
(w3,n3) = n2 `divMod` 10
|
||||
w4 = n3
|
||||
in (fromIntegral w1+0x81
|
||||
,fromIntegral w2+0x30
|
||||
,fromIntegral w3+0x81
|
||||
,fromIntegral w4+0x30)
|
||||
|
||||
decodeGB :: [Word8] -> (Char,[Word8])
|
||||
decodeGB (w1:rst)
|
||||
| w1 <=0x80 = (w2c w1,rst) -- it's ascii
|
||||
| w1 <=0xFE = case rst of
|
||||
w2:rst2
|
||||
| w2 < 0x30 -> throwDyn (IllegalCharacter w2)
|
||||
| w2 <=0x39 -> case rst2 of
|
||||
w3:rst3
|
||||
| w3 < 0x81 -> throwDyn (IllegalCharacter w3)
|
||||
| w3 <=0xFE -> case rst3 of
|
||||
w4:rst4
|
||||
| w4 < 0x30 -> throwDyn (IllegalCharacter w4)
|
||||
| w4 <=0x39 -> let
|
||||
v = linear w1 w2 w3 w4
|
||||
in (decodeGBFour v,rst4)
|
||||
| otherwise -> throwDyn (IllegalCharacter w4)
|
||||
[] -> throwDyn UnexpectedEnd
|
||||
| otherwise -> throwDyn (IllegalCharacter w3)
|
||||
[] -> throwDyn UnexpectedEnd
|
||||
| w2 <=0x7E -> (decodeGBTwo (linear2 w1 w2),rst2)
|
||||
| w2 ==0x7F -> throwDyn (IllegalCharacter w2)
|
||||
| w2 <=0xFE -> (decodeGBTwo (linear2 w1 w2),rst2)
|
||||
| otherwise -> throwDyn (IllegalCharacter w2)
|
||||
[] -> throwDyn UnexpectedEnd
|
||||
| otherwise = throwDyn (IllegalCharacter w1)
|
||||
|
||||
decodeGBTwo :: Int -> Char
|
||||
decodeGBTwo n = let
|
||||
rn = n*2
|
||||
w1 = unsafeIndex rrarr rn
|
||||
w2 = unsafeIndex rrarr (rn+1)
|
||||
in chr $ ((fromIntegral w1) `shiftL` 8) .|. (fromIntegral w2)
|
||||
|
||||
decodeGBFour :: Int -> Char
|
||||
decodeGBFour v = if v<=17858 -- 1
|
||||
then (if v<=15582 -- 2
|
||||
then (if v<=11328 -- 3
|
||||
then (if v<=7921 -- 4
|
||||
then (if v<820
|
||||
then arr 0 rarr1
|
||||
else range range1)
|
||||
else (if v<9219
|
||||
then arr 7922 rarr2
|
||||
else range range2))
|
||||
else (if v<=13737 -- 4
|
||||
then (if v<12973
|
||||
then arr 11329 rarr3
|
||||
else range range3)
|
||||
else (if v<14698
|
||||
then arr 13738 rarr4
|
||||
else range range4)))
|
||||
else (if v<=17101 -- 3
|
||||
then (if v<=16317 -- 4
|
||||
then (if v<15847
|
||||
then arr 15583 rarr5
|
||||
else range range5)
|
||||
else (if v<16729
|
||||
then arr 16318 rarr6
|
||||
else range range6))
|
||||
else (if v<17418
|
||||
then arr 17102 rarr7
|
||||
else range range7)))
|
||||
else (if v<=37844 -- 2
|
||||
then (if v<=33468 -- 3
|
||||
then (if v<=18663 -- 4
|
||||
then (if v<17961
|
||||
then arr 17859 rarr8
|
||||
else range range8)
|
||||
else (if v<19043
|
||||
then arr 18664 rarr9
|
||||
else range range9))
|
||||
else (if v<33550
|
||||
then arr 33469 rarr10
|
||||
else range range10))
|
||||
else (if v<=39419 -- 3
|
||||
then (if v<=39107 -- 4
|
||||
then (if v<38078
|
||||
then arr 37845 rarr11
|
||||
else range range11)
|
||||
else (if v<39394
|
||||
then arr 39108 rarr12
|
||||
else range range12))
|
||||
else (if v<=1237575 && v>=189000
|
||||
then range range13
|
||||
else throwDyn OutOfRange)))
|
||||
where
|
||||
arr off a = let
|
||||
v' = (v-off)*2
|
||||
w1 = unsafeIndex a v'
|
||||
w2 = unsafeIndex a (v'+1)
|
||||
in chr $ ((fromIntegral w1) `shiftL` 8)
|
||||
.|. (fromIntegral w2)
|
||||
range r = chr (v-r)
|
||||
|
||||
range1,range2,range3,range4,range5,range6,range7,range8,range9,range10,range11,range12,range13 :: Int
|
||||
range1 = -286
|
||||
range2 = -576
|
||||
range3 = -878
|
||||
range4 = -887
|
||||
range5 = -889
|
||||
range6 = -894
|
||||
range7 = -900
|
||||
range8 = -911
|
||||
range9 = -21827
|
||||
range10 = -25943
|
||||
range11 = -25964
|
||||
range12 = -26116
|
||||
range13 = 123464
|
||||
81
Data/Encoding/GB18030Data.hs
Normal file
81
Data/Encoding/GB18030Data.hs
Normal file
File diff suppressed because one or more lines are too long
107
Data/Encoding/Helper/Data.hs
Normal file
107
Data/Encoding/Helper/Data.hs
Normal file
@ -0,0 +1,107 @@
|
||||
{- This module is used to create a haskell file with static address-arrays in it -}
|
||||
|
||||
module Data.Encoding.Helper.Data where
|
||||
|
||||
import Data.Char (ord)
|
||||
import Data.Word (Word8)
|
||||
import Data.Ord (comparing)
|
||||
import Data.Bits (shiftR)
|
||||
import Data.List (sortBy,genericLength)
|
||||
import Data.Encoding.Helper.XML
|
||||
|
||||
linear :: Word8 -> Word8 -> Word8 -> Word8 -> Int
|
||||
linear w1 w2 w3 w4
|
||||
= (fromIntegral (w4-0x30))
|
||||
+ (fromIntegral (w3-0x81))*10
|
||||
+ (fromIntegral (w2-0x30))*1260
|
||||
+ (fromIntegral (w1-0x81))*12600
|
||||
|
||||
linear2 :: Word8 -> Word8 -> Int
|
||||
linear2 w1 w2 = (fromIntegral (w2 - (if w2<=0x7E
|
||||
then 0x40
|
||||
else 0x41)))
|
||||
+ (fromIntegral (w1-0x81))*190
|
||||
|
||||
createStandardModule :: IO ()
|
||||
createStandardModule = do
|
||||
let fn = "gb-18030-2000.xml"
|
||||
str <- readFile fn
|
||||
writeFile "Data/Encoding/GB18030Data.hs" $ createModuleFromFile fn str standardRanges standardRRanges
|
||||
|
||||
standardRanges =
|
||||
[('\x0000','\x0451')
|
||||
,('\x2010','\x2642')
|
||||
,('\x2E81','\x361A')
|
||||
,('\x3918','\x3CE0')
|
||||
,('\x4056','\x415F')
|
||||
,('\x4337','\x44D6')
|
||||
,('\x464C','\x478D')
|
||||
,('\x4947','\x49B7')
|
||||
,('\x4C77','\x9FA5')
|
||||
,('\xD800','\xE864')
|
||||
,('\xF92C','\xFA29')
|
||||
,('\xFE30','\xFFE5')
|
||||
]
|
||||
|
||||
standardRRanges =
|
||||
[( 0, 819)
|
||||
,( 7922, 9218)
|
||||
,(11329,12972)
|
||||
,(13738,14697)
|
||||
,(15583,15846)
|
||||
,(16318,16728)
|
||||
,(17102,17417)
|
||||
,(17859,17960)
|
||||
,(18664,19042)
|
||||
,(33469,33549)
|
||||
,(37845,38077)
|
||||
,(39108,39393)
|
||||
,(39420,188999)]
|
||||
|
||||
createModuleFromFile :: String -> String -> [(Char,Char)] -> [(Int,Int)] -> String
|
||||
createModuleFromFile name str = createModule (readDecodeTable name str)
|
||||
|
||||
createModule :: [(Char,[Word8])] -> [(Char,Char)] -> [(Int,Int)] -> String
|
||||
createModule mp ranges rranges = unlines $
|
||||
["module Data.Encoding.GB18030Data where","","import Data.ByteString.Base"]
|
||||
++ (createAddrVars "arr" (map (uncurry $ createAddr mp) ranges))
|
||||
++ (createAddrVars "rarr" (map (uncurry $ createRAddr4 mp) rranges))
|
||||
++ (createAddrVar "rrarr" (createRAddr2 mp))
|
||||
|
||||
createAddrVars :: String -> [[Word8]] -> [String]
|
||||
createAddrVars base conts = concatMap (\(n,cont) ->
|
||||
createAddrVar (base++show n) cont) (zip [1..] conts)
|
||||
|
||||
createAddrVar :: String -> [Word8] -> [String]
|
||||
createAddrVar name cont =
|
||||
[""
|
||||
,name++" :: ByteString"
|
||||
,name++" = unsafePackAddress "++show (length cont)++" \""++addr cont++"\"#"
|
||||
]
|
||||
|
||||
createAddr :: [(Char,[Word8])] -> Char -> Char -> [Word8]
|
||||
createAddr mp f t = let
|
||||
lst = sortBy (comparing fst) [el | el@(ch,_) <- mp, ch>=f, ch<=t]
|
||||
in concatMap (\(ch,seq) -> let
|
||||
l = length seq
|
||||
in [fromIntegral l]++seq++(replicate (4-l) 0)) lst
|
||||
|
||||
createRAddr2 :: [(Char,[Word8])] -> [Word8]
|
||||
createRAddr2 mp = let
|
||||
lst = sortBy (comparing snd)
|
||||
[ (ch,v) | (ch,[w1,w2]) <- mp,let v = linear2 w1 w2]
|
||||
in concatMap (\(ch,_) -> let i = ord ch
|
||||
in [fromIntegral (i `shiftR` 8)
|
||||
,fromIntegral i]) lst
|
||||
|
||||
createRAddr4 :: [(Char,[Word8])] -> Int -> Int -> [Word8]
|
||||
createRAddr4 mp f t = let
|
||||
lst = sortBy (comparing snd)
|
||||
[ (ch,v) | (ch,[w1,w2,w3,w4]) <- mp,
|
||||
let v = linear w1 w2 w3 w4, v>=f, v<=t ]
|
||||
in concatMap (\(ch,_) -> let i = ord ch
|
||||
in [fromIntegral (i `shiftR` 8)
|
||||
,fromIntegral i]) lst
|
||||
|
||||
addr :: [Word8] -> String
|
||||
addr = concatMap (\w -> "\\"++show w)
|
||||
41
Data/Encoding/Helper/XML.hs
Normal file
41
Data/Encoding/Helper/XML.hs
Normal file
@ -0,0 +1,41 @@
|
||||
module Data.Encoding.Helper.XML where
|
||||
|
||||
import Data.Char
|
||||
import Data.List (find)
|
||||
import Data.Word
|
||||
import Data.Maybe (mapMaybe)
|
||||
import Text.XML.HaXml.Parse
|
||||
import Text.XML.HaXml.Types
|
||||
import Text.XML.HaXml.Verbatim
|
||||
import Numeric
|
||||
|
||||
readDecodeTable :: String -> String -> [(Char,[Word8])]
|
||||
readDecodeTable name str = let
|
||||
Document _ _ (Elem root_name _ conts) _ = xmlParse name str
|
||||
in concat $ mapMaybe findAssignments conts
|
||||
|
||||
findAssignments :: Content i -> Maybe [(Char,[Word8])]
|
||||
findAssignments (CElem (Elem "assignments" _ conts) _)
|
||||
= Just $ mapMaybe findAssignment conts
|
||||
findAssignments _ = Nothing
|
||||
|
||||
findAssignment :: Content i -> Maybe (Char,[Word8])
|
||||
findAssignment (CElem (Elem "a" attrs _) _) = do
|
||||
u <- lookup "u" attrs
|
||||
b <- lookup "b" attrs
|
||||
return (chr $ readHexInt (showAttValue u),parseBinary b)
|
||||
findAssignment _ = Nothing
|
||||
|
||||
parseBinary :: AttValue -> [Word8]
|
||||
parseBinary val = map (fromIntegral.readHexInt) (words (showAttValue val))
|
||||
|
||||
showAttValue :: AttValue -> String
|
||||
showAttValue (AttValue lst) = concat $ map (\el -> case el of
|
||||
Left str -> str
|
||||
Right ref -> verbatim ref) lst
|
||||
|
||||
readHexInt :: String -> Int
|
||||
readHexInt str = case find (\x -> snd x == "") (readHex str) of
|
||||
Nothing -> error "Not a hex"
|
||||
Just (x,_) -> x
|
||||
|
||||
3
create_gb18030_data.sh
Executable file
3
create_gb18030_data.sh
Executable file
@ -0,0 +1,3 @@
|
||||
#!/bin/bash
|
||||
|
||||
ghc Data.Encoding.Helper.Data -e "createStandardModule"
|
||||
@ -46,9 +46,11 @@ Exposed-Modules:
|
||||
Data.Encoding.CP1257
|
||||
Data.Encoding.CP1258
|
||||
Data.Encoding.KOI8R
|
||||
Data.Encoding.GB18030
|
||||
System.IO.Encoding
|
||||
Other-Modules:
|
||||
Data.Encoding.Base
|
||||
Data.Encoding.GB18030Data
|
||||
Extra-Source-Files:
|
||||
8859-2.TXT
|
||||
8859-3.TXT
|
||||
|
||||
30916
gb-18030-2000.xml
Normal file
30916
gb-18030-2000.xml
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user