From 5eed326c086fa4479579d42329152a7f49394ee0 Mon Sep 17 00:00:00 2001
From: "gaetan.lehmann"
Date: Wed, 14 May 2008 06:05:14 -0700
Subject: [PATCH] Add the support for a large number of encoding name aliases.

The aliases are the ones used in python, and are normalized as in python:
they are case and separator insensitive.

darcs-hash:20080514130514-2fc9d-1b53b11141878a8651f3bde7e427c877172e6722
---
 Data/Encoding.hs |  220 ++++++++++++++++++++++++++++++++++++++++-------
 encoding.cabal   |    4 +-
 2 files changed, 190 insertions(+), 34 deletions(-)

diff --git a/Data/Encoding.hs b/Data/Encoding.hs
index 94e3b04..c4bba4e 100644
--- a/Data/Encoding.hs
+++ b/Data/Encoding.hs
@@ -50,6 +50,8 @@ import Data.Encoding.CP1258
 import Data.Encoding.KOI8R
 import Data.Encoding.KOI8U
 import Data.Encoding.GB18030
+import Data.Char (toLower)
+import Text.Regex (mkRegex, subRegex)
 #endif
 
 -- | An untyped encoding. Used in 'System.IO.Encoding.getSystemEncoding'.
@@ -82,38 +84,192 @@ recodeLazy enc_f enc_t bs = encodeLazy enc_t (decodeLazy enc_f bs)
 #ifndef USE_HPC
 -- | Like 'encodingFromString' but returns 'Nothing' instead of throwing an error
 encodingFromStringMaybe :: String -> Maybe DynEncoding
-encodingFromStringMaybe "ASCII" = Just $ DynEncoding ASCII
-encodingFromStringMaybe "UTF-8" = Just $ DynEncoding UTF8
-encodingFromStringMaybe "UTF-16" = Just $ DynEncoding UTF16
-encodingFromStringMaybe "UTF-32" = Just $ DynEncoding UTF32
-encodingFromStringMaybe "KOI8-R" = Just $ DynEncoding KOI8R
-encodingFromStringMaybe "KOI8-U" = Just $ DynEncoding KOI8U
-encodingFromStringMaybe "ISO-8859-1" = Just $ DynEncoding ISO88591
-encodingFromStringMaybe "ISO-8859-2" = Just $ DynEncoding ISO88592
-encodingFromStringMaybe "ISO-8859-3" = Just $ DynEncoding ISO88593
-encodingFromStringMaybe "ISO-8859-4" = Just $ DynEncoding ISO88594
-encodingFromStringMaybe "ISO-8859-5" = Just $ DynEncoding ISO88595
-encodingFromStringMaybe "ISO-8859-6" = Just $ DynEncoding ISO88596
-encodingFromStringMaybe "ISO-8859-7" = Just $ DynEncoding ISO88597
-encodingFromStringMaybe "ISO-8859-8" = Just $ DynEncoding ISO88598
-encodingFromStringMaybe "ISO-8859-9" = Just $ DynEncoding ISO88599
-encodingFromStringMaybe "ISO-8859-10" = Just $ DynEncoding ISO885910
-encodingFromStringMaybe "ISO-8859-11" = Just $ DynEncoding ISO885911
-encodingFromStringMaybe "ISO-8859-13" = Just $ DynEncoding ISO885913
-encodingFromStringMaybe "ISO-8859-14" = Just $ DynEncoding ISO885914
-encodingFromStringMaybe "ISO-8859-15" = Just $ DynEncoding ISO885915
-encodingFromStringMaybe "ISO-8859-16" = Just $ DynEncoding ISO885916
-encodingFromStringMaybe "CP1250" = Just $ DynEncoding CP1250
-encodingFromStringMaybe "CP1251" = Just $ DynEncoding CP1251
-encodingFromStringMaybe "CP1252" = Just $ DynEncoding CP1252
-encodingFromStringMaybe "CP1253" = Just $ DynEncoding CP1253
-encodingFromStringMaybe "CP1254" = Just $ DynEncoding CP1254
-encodingFromStringMaybe "CP1255" = Just $ DynEncoding CP1255
-encodingFromStringMaybe "CP1256" = Just $ DynEncoding CP1256
-encodingFromStringMaybe "CP1257" = Just $ DynEncoding CP1257
-encodingFromStringMaybe "CP1258" = Just $ DynEncoding CP1258
-encodingFromStringMaybe "GB18030" = Just $ DynEncoding GB18030
-encodingFromStringMaybe _ = Nothing
+encodingFromStringMaybe codeName = case (normalizeEncoding codeName) of
+    -- ASCII
+    "ascii" -> Just $ DynEncoding ASCII
+    "646" -> Just $ DynEncoding ASCII
+    "ansi_x3_4_1968" -> Just $ DynEncoding ASCII
+    "ansi_x3_4_1986" -> Just $ DynEncoding ASCII
+    "cp367" -> Just $ DynEncoding ASCII
+    "csascii" -> Just $ DynEncoding ASCII
+    "ibm367" -> Just $ DynEncoding ASCII
+    "iso646_us" -> Just $ DynEncoding ASCII
+    "iso_646_irv_1991" -> Just $ DynEncoding ASCII
+    "iso_ir_6" -> Just $ DynEncoding ASCII
+    "us" -> Just $ DynEncoding ASCII
+    "us_ascii" -> Just $ DynEncoding ASCII
+    -- UTF-8
+    "utf_8" -> Just $ DynEncoding UTF8
+    "u8" -> Just $ DynEncoding UTF8
+    "utf" -> Just $ DynEncoding UTF8
+    "utf8" -> Just $ DynEncoding UTF8
+    "utf8_ucs2" -> Just $ DynEncoding UTF8
+    "utf8_ucs4" -> Just $ DynEncoding UTF8
+    -- UTF-16
+    "utf_16" -> Just $ DynEncoding UTF16
+    "u16" -> Just $ DynEncoding UTF16
+    "utf16" -> Just $ DynEncoding UTF16
+    -- UTF-32
+    "utf_32" -> Just $ DynEncoding UTF32
+    -- KOI8-R
+    "koi8_r" -> Just $ DynEncoding KOI8R
+    "cskoi8r" -> Just $ DynEncoding KOI8R
+    -- KOI8-U
+    "koi8_u" -> Just $ DynEncoding KOI8U
+    -- ISO-8859-1
+    "iso_8859_1" -> Just $ DynEncoding ISO88591
+    "iso8859_1" -> Just $ DynEncoding ISO88591
+    "8859" -> Just $ DynEncoding ISO88591
+    "cp819" -> Just $ DynEncoding ISO88591
+    "csisolatin1" -> Just $ DynEncoding ISO88591
+    "ibm819" -> Just $ DynEncoding ISO88591
+    "iso8859" -> Just $ DynEncoding ISO88591
+    "iso_8859_1_1987" -> Just $ DynEncoding ISO88591
+    "iso_ir_100" -> Just $ DynEncoding ISO88591
+    "l1" -> Just $ DynEncoding ISO88591
+    "latin" -> Just $ DynEncoding ISO88591
+    "latin1" -> Just $ DynEncoding ISO88591
+    -- ISO-8859-2
+    "iso_8859_2" -> Just $ DynEncoding ISO88592
+    "iso8859_2" -> Just $ DynEncoding ISO88592
+    "csisolatin2" -> Just $ DynEncoding ISO88592
+    "iso_8859_2_1987" -> Just $ DynEncoding ISO88592
+    "iso_ir_101" -> Just $ DynEncoding ISO88592
+    "l2" -> Just $ DynEncoding ISO88592
+    "latin2" -> Just $ DynEncoding ISO88592
+    -- ISO-8859-3
+    "iso_8859_3" -> Just $ DynEncoding ISO88593
+    "iso8859_3" -> Just $ DynEncoding ISO88593
+    "csisolatin3" -> Just $ DynEncoding ISO88593
+    "iso_8859_3_1988" -> Just $ DynEncoding ISO88593
+    "iso_ir_109" -> Just $ DynEncoding ISO88593
+    "l3" -> Just $ DynEncoding ISO88593
+    "latin3" -> Just $ DynEncoding ISO88593
+    -- ISO-8859-4
+    "iso_8859_4" -> Just $ DynEncoding ISO88594
+    "iso8859_4" -> Just $ DynEncoding ISO88594
+    "csisolatin4" -> Just $ DynEncoding ISO88594
+    "iso_8859_4_1988" -> Just $ DynEncoding ISO88594
+    "iso_ir_110" -> Just $ DynEncoding ISO88594
+    "l4" -> Just $ DynEncoding ISO88594
+    "latin4" -> Just $ DynEncoding ISO88594
+    -- ISO-8859-5
+    "iso_8859_5" -> Just $ DynEncoding ISO88595
+    "iso8859_5" -> Just $ DynEncoding ISO88595
+    "csisolatincyrillic" -> Just $ DynEncoding ISO88595
+    "cyrillic" -> Just $ DynEncoding ISO88595
+    "iso_8859_5_1988" -> Just $ DynEncoding ISO88595
+    "iso_ir_144" -> Just $ DynEncoding ISO88595
+    -- ISO-8859-6
+    "iso_8859_6" -> Just $ DynEncoding ISO88596
+    "iso8859_6" -> Just $ DynEncoding ISO88596
+    "arabic" -> Just $ DynEncoding ISO88596
+    "asmo_708" -> Just $ DynEncoding ISO88596
+    "csisolatinarabic" -> Just $ DynEncoding ISO88596
+    "ecma_114" -> Just $ DynEncoding ISO88596
+    "iso_8859_6_1987" -> Just $ DynEncoding ISO88596
+    "iso_ir_127" -> Just $ DynEncoding ISO88596
+    -- ISO-8859-7
+    "iso_8859_7" -> Just $ DynEncoding ISO88597
+    "iso8859_7" -> Just $ DynEncoding ISO88597
+    "csisolatingreek" -> Just $ DynEncoding ISO88597
+    "ecma_118" -> Just $ DynEncoding ISO88597
+    "elot_928" -> Just $ DynEncoding ISO88597
+    "greek" -> Just $ DynEncoding ISO88597
+    "greek8" -> Just $ DynEncoding ISO88597
+    "iso_8859_7_1987" -> Just $ DynEncoding ISO88597
+    "iso_ir_126" -> Just $ DynEncoding ISO88597
+    -- ISO-8859-8
+    "iso_8859_8" -> Just $ DynEncoding ISO88598
+    "iso8859_8" -> Just $ DynEncoding ISO88598
+    "csisolatinhebrew" -> Just $ DynEncoding ISO88598
+    "hebrew" -> Just $ DynEncoding ISO88598
+    "iso_8859_8_1988" -> Just $ DynEncoding ISO88598
+    "iso_ir_138" -> Just $ DynEncoding ISO88598
+    -- ISO-8859-9
+    "iso_8859_9" -> Just $ DynEncoding ISO88599
+    "iso8859_9" -> Just $ DynEncoding ISO88599
+    "csisolatin5" -> Just $ DynEncoding ISO88599
+    "iso_8859_9_1989" -> Just $ DynEncoding ISO88599
+    "iso_ir_148" -> Just $ DynEncoding ISO88599
+    "l5" -> Just $ DynEncoding ISO88599
+    "latin5" -> Just $ DynEncoding ISO88599
+    -- ISO-8859-10
+    "iso_8859_10" -> Just $ DynEncoding ISO885910
+    "iso8859_10" -> Just $ DynEncoding ISO885910
+    "csisolatin6" -> Just $ DynEncoding ISO885910
+    "iso_8859_10_1992" -> Just $ DynEncoding ISO885910
+    "iso_ir_157" -> Just $ DynEncoding ISO885910
+    "l6" -> Just $ DynEncoding ISO885910
+    "latin6" -> Just $ DynEncoding ISO885910
+    -- ISO-8859-11
+    "iso_8859_11" -> Just $ DynEncoding ISO885911
+    "iso8859_11" -> Just $ DynEncoding ISO885911
+    "thai" -> Just $ DynEncoding ISO885911
+    "iso_8859_11_2001" -> Just $ DynEncoding ISO885911
+    -- ISO-8859-13
+    "iso_8859_13" -> Just $ DynEncoding ISO885913
+    "iso8859_13" -> Just $ DynEncoding ISO885913
+    -- ISO-8859-14
+    "iso_8859_14" -> Just $ DynEncoding ISO885914
+    "iso8859_14" -> Just $ DynEncoding ISO885914
+    "iso_8859_14_1998" -> Just $ DynEncoding ISO885914
+    "iso_celtic" -> Just $ DynEncoding ISO885914
+    "iso_ir_199" -> Just $ DynEncoding ISO885914
+    "l8" -> Just $ DynEncoding ISO885914
+    "latin8" -> Just $ DynEncoding ISO885914
+    -- ISO-8859-15
+    "iso_8859_15" -> Just $ DynEncoding ISO885915
+    "iso8859_15" -> Just $ DynEncoding ISO885915
+    "latin9" -> Just $ DynEncoding ISO885915
+    "l9" -> Just $ DynEncoding ISO885915
+    -- ISO-8859-16
+    "iso_8859_16" -> Just $ DynEncoding ISO885916
+    "iso8859_16" -> Just $ DynEncoding ISO885916
+    "iso_8859_16_2001" -> Just $ DynEncoding ISO885916
+    "iso_ir_226" -> Just $ DynEncoding ISO885916
+    "l10" -> Just $ DynEncoding ISO885916
+    "latin10" -> Just $ DynEncoding ISO885916
+    -- CP1250
+    "cp1250" -> Just $ DynEncoding CP1250
+    "windows_1250" -> Just $ DynEncoding CP1250
+    -- CP1251
+    "cp1251" -> Just $ DynEncoding CP1251
+    "windows_1251" -> Just $ DynEncoding CP1251
+    -- CP1252
+    "cp1252" -> Just $ DynEncoding CP1252
+    "windows_1252" -> Just $ DynEncoding CP1252
+    -- CP1253
+    "cp1253" -> Just $ DynEncoding CP1253
+    "windows_1253" -> Just $ DynEncoding CP1253
+    -- CP1254
+    "cp1254" -> Just $ DynEncoding CP1254
+    "windows_1254" -> Just $ DynEncoding CP1254
+    -- CP1255
+    "cp1255" -> Just $ DynEncoding CP1255
+    "windows_1255" -> Just $ DynEncoding CP1255
+    -- CP1256
+    "cp1256" -> Just $ DynEncoding CP1256
+    "windows_1256" -> Just $ DynEncoding CP1256
+    -- CP1257
+    "cp1257" -> Just $ DynEncoding CP1257
+    "windows_1257" -> Just $ DynEncoding CP1257
+    -- CP1258
+    "cp1258" -> Just $ DynEncoding CP1258
+    "windows_1258" -> Just $ DynEncoding CP1258
+    -- GB18030
+    "gb18030" -> Just $ DynEncoding GB18030
+    "gb18030_2000" -> Just $ DynEncoding GB18030
+    -- defaults to nothing
+    _ -> Nothing
+  where
+    -- Canonical lookup key: every run of characters outside [0-9A-Za-z]
+    -- (including '.') becomes a single '_' and letters are lower-cased, so
+    -- the patterns above must be spelled in lower case with '_' separators.
+    normalizeEncoding s = map toLower $ subRegex sep s "_"
+    sep = mkRegex "[^0-9A-Za-z]+"
+
+
 
 -- | Takes the name of an encoding and creates a dynamic encoding from it.
 encodingFromString :: String -> DynEncoding
diff --git a/encoding.cabal b/encoding.cabal
index caff1ae..20a7f8c 100644
--- a/encoding.cabal
+++ b/encoding.cabal
@@ -45,9 +45,9 @@ Flag splitBase
 
 Library
   if flag(splitBase)
-    Build-Depends: bytestring, base >= 3, template-haskell, containers, array
+    Build-Depends: bytestring, base >= 3, template-haskell, containers, array, regex-compat
   else
-    Build-Depends: base < 3, template-haskell
+    Build-Depends: base < 3, template-haskell, regex-compat
   Extensions: TemplateHaskell,CPP,ExistentialQuantification,ForeignFunctionInterface
   C-Sources: system_encoding.c
   Include-Dirs: .