/[suikacvs]/messaging/manakai/lib/Message/Charset/Info.pm
Suika

Diff of /messaging/manakai/lib/Message/Charset/Info.pm

Parent Directory | Revision Log | View Patch

revision 1.8 by wakaba, Sun May 25 08:54:15 2008 UTC | revision 1.9 by wakaba, Wed Sep 10 10:28:57 2008 UTC
# Line 2  package Message::Charset::Info; Line 2  package Message::Charset::Info;
2  use strict;  use strict;
3  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5  ## TODO: More drastic charset name aliasing is now part of HTML5...  ## TODO: Certain encodings MUST NOT be implemented [HTML5].
6    
7    ## ISSUE: Should we convert unassigned code point with trivial Unicode
8    ## mapping into U+FFFD?  Or, should we return that Unicode character
9    ## with an error?  (For example, Windows-1252's 0x81 should be converted
10    ## to U+FFFD or U+0081?)
11    
12  sub UNREGISTERED_CHARSET_NAME () { 0b1 }  sub UNREGISTERED_CHARSET_NAME () { 0b1 }
13      ## Names for non-standard encodings/implementations for Perl encodings      ## Names for non-standard encodings/implementations for Perl encodings
# Line 41  sub CHARSET_CATEGORY_EUCJP () { 0b10 } Line 46  sub CHARSET_CATEGORY_EUCJP () { 0b10 }
46  sub CHARSET_CATEGORY_SJIS () { 0b100 }  sub CHARSET_CATEGORY_SJIS () { 0b100 }
47  sub CHARSET_CATEGORY_UTF16 () { 0b1000 }  sub CHARSET_CATEGORY_UTF16 () { 0b1000 }
48      ## NOTE: "A UTF-16 encoding" in HTML5.      ## NOTE: "A UTF-16 encoding" in HTML5.
49    sub CHARSET_CATEGORY_ASCII_COMPAT () { 0b10000 }
50  ## iana_names      ## NOTE: "superset of US-ASCII (specifically, ANSI_X3.4-1968)
51        ## for bytes in the range 0x09-0x0A, 0x0C-0x0D, 0x20-0x22, 0x26, 0x27,
52  ## is_html_ascii_superset: "superset of US-ASCII (specifically, ANSI_X3.4-1968)      ## 0x2C-0x3F, 0x41-0x5A, and 0x61-0x7A" [HTML5]
53  ##     for bytes in the range 0x09 - 0x0D, 0x20, 0x21, 0x22, 0x26, 0x27,  sub CHARSET_CATEGORY_EBCDIC () { 0b100000 }
54  ##     0x2C - 0x3F, 0x41 - 0x5A, and 0x61 - 0x7A" [HTML5]      ## NOTE: "based on EBCDIC" in HTML5.
55  ## is_ebcdic_based  sub CHARSET_CATEGORY_MIME_TEXT () { 0b1000000 }
56    ## TODO: These flags are obsolete - should be replaced by category      ## NOTE: Suitable as MIME text.
57    
58  ## ISSUE: Shift_JIS is a superset of US-ASCII?  ISO-2022-JP is?  ## ISSUE: Shift_JIS is a superset of US-ASCII?  ISO-2022-JP is?
59  ## ISSUE: 0x5F (_) should be added to the range?  ## ISSUE: 0x5F (_) should be added to the range?
60    
61  my $Charset;  my $Charset; ## TODO: this is obsolete.
62    
63  our $IANACharset;  our $IANACharset;
64        ## NOTE: Charset names used where IANA charset names are allowed, either
65        ## registered or not.
66    our $HTMLCharset;
67        ## NOTE: Same as charset names in $IANACharset, except all ASCII
68        ## punctuations are dropped and letters/digits only names are not included.
69    
70  $Charset->{'us-ascii'}  $Charset->{'us-ascii'}
71  = $IANACharset->{'ansi_x3.4-1968'}  = $IANACharset->{'ansi_x3.4-1968'}
# Line 69  $Charset->{'us-ascii'} Line 79  $Charset->{'us-ascii'}
79  = $IANACharset->{'ibm367'}  = $IANACharset->{'ibm367'}
80  = $IANACharset->{'cp367'}  = $IANACharset->{'cp367'}
81  = $IANACharset->{'csascii'}  = $IANACharset->{'csascii'}
82    = $HTMLCharset->{'ansix341968'}
83    = $HTMLCharset->{'isoir6'}
84    = $HTMLCharset->{'ansix341986'}
85    = $HTMLCharset->{'iso646irv1991'}
86    = $HTMLCharset->{'iso646us'}
87    = $HTMLCharset->{'usascii'}
88  = __PACKAGE__->new ({  = __PACKAGE__->new ({
89    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
90    iana_names => {    iana_names => {
91      'ansi_x3.4-1968' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'ansi_x3.4-1968' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
92      'iso-ir-6' => REGISTERED_CHARSET_NAME,      'iso-ir-6' => REGISTERED_CHARSET_NAME,
# Line 84  $Charset->{'us-ascii'} Line 100  $Charset->{'us-ascii'}
100      'cp367' => REGISTERED_CHARSET_NAME,      'cp367' => REGISTERED_CHARSET_NAME,
101      'csascii' => REGISTERED_CHARSET_NAME,      'csascii' => REGISTERED_CHARSET_NAME,
102    },    },
103    is_html_ascii_superset => 1,    perl_names => {
104        'web-latin1-us-ascii' => UNREGISTERED_CHARSET_NAME |
105            SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
106        'cp1252' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
107      },
108      fallback => {
109        "\x80" => "\x{20AC}",
110        "\x81" => undef,
111        "\x82" => "\x{201A}",
112        "\x83" => "\x{0192}",
113        "\x84" => "\x{201E}",
114        "\x85" => "\x{2026}",
115        "\x86" => "\x{2020}",
116        "\x87" => "\x{2021}",
117        "\x88" => "\x{02C6}",
118        "\x89" => "\x{2030}",
119        "\x8A" => "\x{0160}",
120        "\x8B" => "\x{2039}",
121        "\x8C" => "\x{0152}",
122        "\x8D" => undef,
123        "\x8E" => "\x{017D}",
124        "\x8F" => undef,
125        "\x90" => undef,
126        "\x91" => "\x{2018}",
127        "\x92" => "\x{2019}",
128        "\x93" => "\x{201C}",
129        "\x94" => "\x{201D}",
130        "\x95" => "\x{2022}",
131        "\x96" => "\x{2013}",
132        "\x97" => "\x{2014}",
133        "\x98" => "\x{02DC}",
134        "\x99" => "\x{2122}",
135        "\x9A" => "\x{0161}",
136        "\x9B" => "\x{203A}",
137        "\x9C" => "\x{0153}",
138        "\x9D" => undef,
139        "\x9E" => "\x{017E}",
140        "\x9F" => "\x{0178}",
141        "\xA0" => "\xA0", "\xA1" => "\xA1", "\xA2" => "\xA2", "\xA3" => "\xA3",
142        "\xA4" => "\xA4", "\xA5" => "\xA5", "\xA6" => "\xA6", "\xA7" => "\xA7",
143        "\xA8" => "\xA8", "\xA9" => "\xA9", "\xAA" => "\xAA", "\xAB" => "\xAB",
144        "\xAC" => "\xAC", "\xAD" => "\xAD", "\xAE" => "\xAE", "\xAF" => "\xAF",
145        "\xB0" => "\xB0", "\xB1" => "\xB1", "\xB2" => "\xB2", "\xB3" => "\xB3",
146        "\xB4" => "\xB4", "\xB5" => "\xB5", "\xB6" => "\xB6", "\xB7" => "\xB7",
147        "\xB8" => "\xB8", "\xB9" => "\xB9", "\xBA" => "\xBA", "\xBB" => "\xBB",
148        "\xBC" => "\xBC", "\xBD" => "\xBD", "\xBE" => "\xBE", "\xBF" => "\xBF",
149        "\xC0" => "\xC0", "\xC1" => "\xC1", "\xC2" => "\xC2", "\xC3" => "\xC3",
150        "\xC4" => "\xC4", "\xC5" => "\xC5", "\xC6" => "\xC6", "\xC7" => "\xC7",
151        "\xC8" => "\xC8", "\xC9" => "\xC9", "\xCA" => "\xCA", "\xCB" => "\xCB",
152        "\xCC" => "\xCC", "\xCD" => "\xCD", "\xCE" => "\xCE", "\xCF" => "\xCF",
153        "\xD0" => "\xD0", "\xD1" => "\xD1", "\xD2" => "\xD2", "\xD3" => "\xD3",
154        "\xD4" => "\xD4", "\xD5" => "\xD5", "\xD6" => "\xD6", "\xD7" => "\xD7",
155        "\xD8" => "\xD8", "\xD9" => "\xD9", "\xDA" => "\xDA", "\xDB" => "\xDB",
156        "\xDC" => "\xDC", "\xDD" => "\xDD", "\xDE" => "\xDE", "\xDF" => "\xDF",
157        "\xE0" => "\xE0", "\xE1" => "\xE1", "\xE2" => "\xE2", "\xE3" => "\xE3",
158        "\xE4" => "\xE4", "\xE5" => "\xE5", "\xE6" => "\xE6", "\xE7" => "\xE7",
159        "\xE8" => "\xE8", "\xE9" => "\xE9", "\xEA" => "\xEA", "\xEB" => "\xEB",
160        "\xEC" => "\xEC", "\xED" => "\xED", "\xEE" => "\xEE", "\xEF" => "\xEF",
161        "\xF0" => "\xF0", "\xF1" => "\xF1", "\xF2" => "\xF2", "\xF3" => "\xF3",
162        "\xF4" => "\xF4", "\xF5" => "\xF5", "\xF6" => "\xF6", "\xF7" => "\xF7",
163        "\xF8" => "\xF8", "\xF9" => "\xF9", "\xFA" => "\xFA", "\xFB" => "\xFB",
164        "\xFC" => "\xFC", "\xFD" => "\xFD", "\xFE" => "\xFE", "\xFF" => "\xFF",
165      },
166      ## NOTE: Treated as |windows-1252|.  Properties of this charset
167      ## should be consistent with those of that charset.
168  });  });
169    
170  $Charset->{'iso-8859-1'}  $Charset->{'iso-8859-1'}
# Line 97  $Charset->{'iso-8859-1'} Line 177  $Charset->{'iso-8859-1'}
177  = $IANACharset->{'ibm819'}  = $IANACharset->{'ibm819'}
178  = $IANACharset->{'cp819'}  = $IANACharset->{'cp819'}
179  = $IANACharset->{'csisolatin1'}  = $IANACharset->{'csisolatin1'}
180    = $HTMLCharset->{'iso885911987'}
181    = $HTMLCharset->{'isoir100'}
182    = $HTMLCharset->{'iso88591'}
183  = __PACKAGE__->new ({  = __PACKAGE__->new ({
184    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
185    iana_names => {    iana_names => {
186      'iso_8859-1:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'iso_8859-1:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
187      'iso-ir-100' => REGISTERED_CHARSET_NAME,      'iso-ir-100' => REGISTERED_CHARSET_NAME,
# Line 113  $Charset->{'iso-8859-1'} Line 196  $Charset->{'iso-8859-1'}
196    perl_names => {    perl_names => {
197      'web-latin1' => UNREGISTERED_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |      'web-latin1' => UNREGISTERED_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
198          ERROR_REPORTING_ENCODING_IMPL,          ERROR_REPORTING_ENCODING_IMPL,
199      'iso-8859-1' => FALLBACK_ENCODING_IMPL,      'cp1252' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
200    },    },
201    fallback => {    fallback => {
202      "\x80" => "\x{20AC}",      "\x80" => "\x{20AC}",
203        "\x81" => undef,
204      "\x82" => "\x{201A}",      "\x82" => "\x{201A}",
205      "\x83" => "\x{0192}",      "\x83" => "\x{0192}",
206      "\x84" => "\x{201E}",      "\x84" => "\x{201E}",
# Line 128  $Charset->{'iso-8859-1'} Line 212  $Charset->{'iso-8859-1'}
212      "\x8A" => "\x{0160}",      "\x8A" => "\x{0160}",
213      "\x8B" => "\x{2039}",      "\x8B" => "\x{2039}",
214      "\x8C" => "\x{0152}",      "\x8C" => "\x{0152}",
215        "\x8D" => undef,
216      "\x8E" => "\x{017D}",      "\x8E" => "\x{017D}",
217        "\x8F" => undef,
218        "\x90" => undef,
219      "\x91" => "\x{2018}",      "\x91" => "\x{2018}",
220      "\x92" => "\x{2019}",      "\x92" => "\x{2019}",
221      "\x93" => "\x{201C}",      "\x93" => "\x{201C}",
# Line 141  $Charset->{'iso-8859-1'} Line 228  $Charset->{'iso-8859-1'}
228      "\x9A" => "\x{0161}",      "\x9A" => "\x{0161}",
229      "\x9B" => "\x{203A}",      "\x9B" => "\x{203A}",
230      "\x9C" => "\x{0153}",      "\x9C" => "\x{0153}",
231        "\x9D" => undef,
232      "\x9E" => "\x{017E}",      "\x9E" => "\x{017E}",
233      "\x9F" => "\x{0178}",      "\x9F" => "\x{0178}",
234    },    },
235    is_html_ascii_superset => 1,    ## NOTE: Treated as |windows-1252|.  Properties of this charset
236      ## should be consistent with those of that charset.
237  });  });
238    
239  $Charset->{'iso-8859-2'}  $Charset->{'iso-8859-2'}
# Line 155  $Charset->{'iso-8859-2'} Line 244  $Charset->{'iso-8859-2'}
244  = $IANACharset->{'latin2'}  = $IANACharset->{'latin2'}
245  = $IANACharset->{'l2'}  = $IANACharset->{'l2'}
246  = $IANACharset->{'csisolatin2'}  = $IANACharset->{'csisolatin2'}
247    = $HTMLCharset->{'iso885921987'}
248    = $HTMLCharset->{'isoir101'}
249    = $HTMLCharset->{'iso88592'}
250  = __PACKAGE__->new ({  = __PACKAGE__->new ({
251    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
252    iana_names => {    iana_names => {
253      'iso_8859-2:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'iso_8859-2:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
254      'iso-ir-101' => REGISTERED_CHARSET_NAME,      'iso-ir-101' => REGISTERED_CHARSET_NAME,
# Line 166  $Charset->{'iso-8859-2'} Line 258  $Charset->{'iso-8859-2'}
258      'l2' => REGISTERED_CHARSET_NAME,      'l2' => REGISTERED_CHARSET_NAME,
259      'csisolatin2' => REGISTERED_CHARSET_NAME,      'csisolatin2' => REGISTERED_CHARSET_NAME,
260    },    },
   is_html_ascii_superset => 1,  
261  });  });
262    
263  $Charset->{'iso-8859-3'}  $Charset->{'iso-8859-3'}
# Line 177  $Charset->{'iso-8859-3'} Line 268  $Charset->{'iso-8859-3'}
268  = $IANACharset->{'latin3'}  = $IANACharset->{'latin3'}
269  = $IANACharset->{'l3'}  = $IANACharset->{'l3'}
270  = $IANACharset->{'csisolatin3'}  = $IANACharset->{'csisolatin3'}
271    = $HTMLCharset->{'iso885931988'}
272    = $HTMLCharset->{'isoir109'}
273    = $HTMLCharset->{'iso88593'}
274  = __PACKAGE__->new ({  = __PACKAGE__->new ({
275    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
276    iana_names => {    iana_names => {
277      'iso_8859-3:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'iso_8859-3:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
278      'iso-ir-109' => REGISTERED_CHARSET_NAME,      'iso-ir-109' => REGISTERED_CHARSET_NAME,
# Line 188  $Charset->{'iso-8859-3'} Line 282  $Charset->{'iso-8859-3'}
282      'l3' => REGISTERED_CHARSET_NAME,      'l3' => REGISTERED_CHARSET_NAME,
283      'csisolatin3' => REGISTERED_CHARSET_NAME,      'csisolatin3' => REGISTERED_CHARSET_NAME,
284    },    },
285    is_html_ascii_superset => 1,    error_level => {
286        'unassigned-code-point-error' => 'iso_shall',
287            ## NOTE: I didn't check whether ISO/IEC 8859-3 prohibits the use of
288            ## unassigned code points, but ECMA-94:1986 (whose content considered
289            ## as equivalent to ISO 8859/1-4) disallows the use of them.
290      },
291  });  });
292    
293  $Charset->{'iso-8859-4'}  $Charset->{'iso-8859-4'}
# Line 199  $Charset->{'iso-8859-4'} Line 298  $Charset->{'iso-8859-4'}
298  = $IANACharset->{'latin4'}  = $IANACharset->{'latin4'}
299  = $IANACharset->{'l4'}  = $IANACharset->{'l4'}
300  = $IANACharset->{'csisolatin4'}  = $IANACharset->{'csisolatin4'}
301    = $HTMLCharset->{'iso885941988'}
302    = $HTMLCharset->{'isoir110'}
303    = $HTMLCharset->{'iso88594'}
304  = __PACKAGE__->new ({  = __PACKAGE__->new ({
305    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
306    iana_names => {    iana_names => {
307      'iso_8859-4:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'iso_8859-4:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
308      'iso-ir-110' => REGISTERED_CHARSET_NAME,      'iso-ir-110' => REGISTERED_CHARSET_NAME,
# Line 210  $Charset->{'iso-8859-4'} Line 312  $Charset->{'iso-8859-4'}
312      'l4' => REGISTERED_CHARSET_NAME,      'l4' => REGISTERED_CHARSET_NAME,
313      'csisolatin4' => REGISTERED_CHARSET_NAME,      'csisolatin4' => REGISTERED_CHARSET_NAME,
314    },    },
315    is_html_ascii_superset => 1,    error_level => {
316        'unassigned-code-point-error' => 'iso_shall',
317            ## NOTE: I didn't check whether ISO/IEC 8859-3 prohibits the use of
318            ## unassigned code points, but ECMA-94:1986 (whose content considered
319            ## as equivalent to ISO 8859/1-4) disallows the use of them.
320      },
321  });  });
322    
323  $Charset->{'iso-8859-5'}  $Charset->{'iso-8859-5'}
# Line 220  $Charset->{'iso-8859-5'} Line 327  $Charset->{'iso-8859-5'}
327  = $IANACharset->{'iso-8859-5'}  = $IANACharset->{'iso-8859-5'}
328  = $IANACharset->{'cyrillic'}  = $IANACharset->{'cyrillic'}
329  = $IANACharset->{'csisolatincyrillic'}  = $IANACharset->{'csisolatincyrillic'}
330    = $HTMLCharset->{'iso885951988'}
331    = $HTMLCharset->{'isoir144'}
332    = $HTMLCharset->{'iso88595'}
333  = __PACKAGE__->new ({  = __PACKAGE__->new ({
334    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
335    iana_names => {    iana_names => {
336      'iso_8859-5:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'iso_8859-5:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
337      'iso-ir-144' => REGISTERED_CHARSET_NAME,      'iso-ir-144' => REGISTERED_CHARSET_NAME,
# Line 230  $Charset->{'iso-8859-5'} Line 340  $Charset->{'iso-8859-5'}
340      'cyrillic' => REGISTERED_CHARSET_NAME,      'cyrillic' => REGISTERED_CHARSET_NAME,
341      'csisolatincyrillic' => REGISTERED_CHARSET_NAME,      'csisolatincyrillic' => REGISTERED_CHARSET_NAME,
342    },    },
   is_html_ascii_superset => 1,  
343  });  });
344    
345  $Charset->{'iso-8859-6'}  $Charset->{'iso-8859-6'}
# Line 242  $Charset->{'iso-8859-6'} Line 351  $Charset->{'iso-8859-6'}
351  = $IANACharset->{'asmo-708'}  = $IANACharset->{'asmo-708'}
352  = $IANACharset->{'arabic'}  = $IANACharset->{'arabic'}
353  = $IANACharset->{'csisolatinarabic'}  = $IANACharset->{'csisolatinarabic'}
354    = $HTMLCharset->{'iso885961987'}
355    = $HTMLCharset->{'isoir127'}
356    = $HTMLCharset->{'iso88596'}
357    = $HTMLCharset->{'ecma114'}
358    = $HTMLCharset->{'asmo708'}
359  = __PACKAGE__->new ({  = __PACKAGE__->new ({
360    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
361          ## NOTE: 3/0..3/9 have different semantics from U+0030..0039,
362          ## but have same character names (maybe).
363          ## NOTE: According to RFC 2046, charset left-hand half of "iso-8859-6"
364          ## is same as "us-ascii".
365    ## TODO: RFC 1345 def?
366    iana_names => {    iana_names => {
367      'iso_8859-6:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'iso_8859-6:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
368      'iso-ir-127' => REGISTERED_CHARSET_NAME,      'iso-ir-127' => REGISTERED_CHARSET_NAME,
# Line 254  $Charset->{'iso-8859-6'} Line 373  $Charset->{'iso-8859-6'}
373      'arabic' => REGISTERED_CHARSET_NAME,      'arabic' => REGISTERED_CHARSET_NAME,
374      'csisolatinarabic' => REGISTERED_CHARSET_NAME,      'csisolatinarabic' => REGISTERED_CHARSET_NAME,
375    },    },
376    is_html_ascii_superset => 1,    ## TODO: |error_level|
       ## NOTE: 3/0..3/9 have different semantics from U+0030..0039,  
       ## but have same character names (maybe).  
       ## NOTE: According to RFC 2046, charset left-hand half of "iso-8859-6"  
       ## is same as "us-ascii".  
377  });  });
378    
379  $Charset->{'iso-8859-7'}  $Charset->{'iso-8859-7'}
# Line 271  $Charset->{'iso-8859-7'} Line 386  $Charset->{'iso-8859-7'}
386  = $IANACharset->{'greek'}  = $IANACharset->{'greek'}
387  = $IANACharset->{'greek8'}  = $IANACharset->{'greek8'}
388  = $IANACharset->{'csisolatingreek'}  = $IANACharset->{'csisolatingreek'}
389    = $HTMLCharset->{'iso885971987'}
390    = $HTMLCharset->{'isoir126'}
391    = $HTMLCharset->{'iso88597'}
392    = $HTMLCharset->{'elot928'}
393    = $HTMLCharset->{'ecma118'}
394  = __PACKAGE__->new ({  = __PACKAGE__->new ({
395    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
396    iana_names => {    iana_names => {
397      'iso_8859-7:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'iso_8859-7:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
398      'iso-ir-126' => REGISTERED_CHARSET_NAME,      'iso-ir-126' => REGISTERED_CHARSET_NAME,
# Line 284  $Charset->{'iso-8859-7'} Line 404  $Charset->{'iso-8859-7'}
404      'greek8' => REGISTERED_CHARSET_NAME,      'greek8' => REGISTERED_CHARSET_NAME,
405      'csisolatingreek' => REGISTERED_CHARSET_NAME,      'csisolatingreek' => REGISTERED_CHARSET_NAME,
406    },    },
407    is_html_ascii_superset => 1,    ## TODO: |error_level|
408  });  });
409    
410  $Charset->{'iso-8859-8'}  $Charset->{'iso-8859-8'}
# Line 294  $Charset->{'iso-8859-8'} Line 414  $Charset->{'iso-8859-8'}
414  = $IANACharset->{'iso-8859-8'}  = $IANACharset->{'iso-8859-8'}
415  = $IANACharset->{'hebrew'}  = $IANACharset->{'hebrew'}
416  = $IANACharset->{'csisolatinhebrew'}  = $IANACharset->{'csisolatinhebrew'}
417    = $HTMLCharset->{'iso885981988'}
418    = $HTMLCharset->{'isoir138'}
419    = $HTMLCharset->{'iso88598'}
420  = __PACKAGE__->new ({  = __PACKAGE__->new ({
421    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
422    iana_names => {    iana_names => {
423      'iso_8859-8:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'iso_8859-8:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
424      'iso-ir-138' => REGISTERED_CHARSET_NAME,      'iso-ir-138' => REGISTERED_CHARSET_NAME,
# Line 304  $Charset->{'iso-8859-8'} Line 427  $Charset->{'iso-8859-8'}
427      'hebrew' => REGISTERED_CHARSET_NAME,      'hebrew' => REGISTERED_CHARSET_NAME,
428      'csisolatinhebrew' => REGISTERED_CHARSET_NAME,      'csisolatinhebrew' => REGISTERED_CHARSET_NAME,
429    },    },
430    is_html_ascii_superset => 1,    ## TODO: |error_level|
431  });  });
432    
433  $Charset->{'iso-8859-9'}  $Charset->{'iso-8859-9'}
# Line 315  $Charset->{'iso-8859-9'} Line 438  $Charset->{'iso-8859-9'}
438  = $IANACharset->{'latin5'}  = $IANACharset->{'latin5'}
439  = $IANACharset->{'l5'}  = $IANACharset->{'l5'}
440  = $IANACharset->{'csisolatin5'}  = $IANACharset->{'csisolatin5'}
441    = $HTMLCharset->{'iso885991989'}
442    = $HTMLCharset->{'isoir148'}
443    = $HTMLCharset->{'iso88599'}
444  = __PACKAGE__->new ({  = __PACKAGE__->new ({
445    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
446    iana_names => {    iana_names => {
447      'iso_8859-9:1989' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'iso_8859-9:1989' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
448      'iso-ir-148' => REGISTERED_CHARSET_NAME,      'iso-ir-148' => REGISTERED_CHARSET_NAME,
# Line 326  $Charset->{'iso-8859-9'} Line 452  $Charset->{'iso-8859-9'}
452      'l5' => REGISTERED_CHARSET_NAME,      'l5' => REGISTERED_CHARSET_NAME,
453      'csisolatin5' => REGISTERED_CHARSET_NAME,      'csisolatin5' => REGISTERED_CHARSET_NAME,
454    },    },
455    is_html_ascii_superset => 1,    perl_names => {
456        'web-latin5' => UNREGISTERED_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
457            ERROR_REPORTING_ENCODING_IMPL,
458        'cp1254' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
459      },
460      fallback => {
461        "\x80" => "\x{20AC}",
462        "\x81" => undef,
463        "\x82" => "\x{201A}",
464        "\x83" => "\x{0192}",
465        "\x84" => "\x{201E}",
466        "\x85" => "\x{2026}",
467        "\x86" => "\x{2020}",
468        "\x87" => "\x{2021}",
469        "\x88" => "\x{02C6}",
470        "\x89" => "\x{2030}",
471        "\x8A" => "\x{0160}",
472        "\x8B" => "\x{2039}",
473        "\x8C" => "\x{0152}",
474        "\x8D" => undef,
475        "\x8E" => undef,
476        "\x8F" => undef,
477        "\x90" => undef,
478        "\x91" => "\x{2018}",
479        "\x92" => "\x{2019}",
480        "\x93" => "\x{201C}",
481        "\x94" => "\x{201D}",
482        "\x95" => "\x{2022}",
483        "\x96" => "\x{2013}",
484        "\x97" => "\x{2014}",
485        "\x98" => "\x{02DC}",
486        "\x99" => "\x{2122}",
487        "\x9A" => "\x{0161}",
488        "\x9B" => "\x{203A}",
489        "\x9C" => "\x{0153}",
490        "\x9D" => undef,
491        "\x9E" => undef,
492        "\x9F" => "\x{0178}",
493      },
494      ## NOTE: Treated as |windows-1254|.  Properties of this charset
495      ## should be consistent with those of that charset.
496  });  });
497    
498  $Charset->{'iso-8859-10'}  $Charset->{'iso-8859-10'}
# Line 336  $Charset->{'iso-8859-10'} Line 502  $Charset->{'iso-8859-10'}
502  = $IANACharset->{'iso_8859-10:1992'}  = $IANACharset->{'iso_8859-10:1992'}
503  = $IANACharset->{'csisolatin6'}  = $IANACharset->{'csisolatin6'}
504  = $IANACharset->{'latin6'}  = $IANACharset->{'latin6'}
505    = $HTMLCharset->{'iso885910'}
506    = $HTMLCharset->{'isoir157'}
507    = $HTMLCharset->{'iso8859101992'}
508  = __PACKAGE__->new ({  = __PACKAGE__->new ({
509    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
510    iana_names => {    iana_names => {
511      'iso-8859-10' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'iso-8859-10' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
512      'iso-ir-157' => REGISTERED_CHARSET_NAME,      'iso-ir-157' => REGISTERED_CHARSET_NAME,
# Line 346  $Charset->{'iso-8859-10'} Line 515  $Charset->{'iso-8859-10'}
515      'csisolatin6' => REGISTERED_CHARSET_NAME,      'csisolatin6' => REGISTERED_CHARSET_NAME,
516      'latin6' => REGISTERED_CHARSET_NAME,      'latin6' => REGISTERED_CHARSET_NAME,
517    },    },
518    is_html_ascii_superset => 1,    ## TODO: |error_level|
519  });  });
520    
521  $Charset->{'iso_6937-2-add'}  $Charset->{'iso_6937-2-add'}
522  = $IANACharset->{'iso_6937-2-add'}  = $IANACharset->{'iso_6937-2-add'}
523  = $IANACharset->{'iso-ir-142'}  = $IANACharset->{'iso-ir-142'}
524  = $IANACharset->{'csisotextcomm'}  = $IANACharset->{'csisotextcomm'}
525    = $HTMLCharset->{'iso69372add'}
526    = $HTMLCharset->{'isoir142'}
527  = __PACKAGE__->new ({  = __PACKAGE__->new ({
528    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
529    iana_names => {    iana_names => {
530      'iso_6937-2-add' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'iso_6937-2-add' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
531      'iso-ir-142' => REGISTERED_CHARSET_NAME,      'iso-ir-142' => REGISTERED_CHARSET_NAME,
532      'csisotextcomm' => REGISTERED_CHARSET_NAME,      'csisotextcomm' => REGISTERED_CHARSET_NAME,
533    },    },
534    is_html_ascii_superset => 1,    ## TODO: |error_level|
535  });  });
536    
537  $Charset->{'jis_x0201'}  $Charset->{'jis_x0201'}
538  = $IANACharset->{'jis_x0201'}  = $IANACharset->{'jis_x0201'}
539  = $IANACharset->{'x0201'}  = $IANACharset->{'x0201'}
540  = $IANACharset->{'cshalfwidthkatakana'}  = $IANACharset->{'cshalfwidthkatakana'}
541    = $HTMLCharset->{'jisx0201'}
542  = __PACKAGE__->new ({  = __PACKAGE__->new ({
543    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
544    iana_names => {    iana_names => {
545      'jis_x0201' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'jis_x0201' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
546      'x0201' => REGISTERED_CHARSET_NAME,      'x0201' => REGISTERED_CHARSET_NAME,
547      'cshalfwidthkatakana' => REGISTERED_CHARSET_NAME,      'cshalfwidthkatakana' => REGISTERED_CHARSET_NAME,
548    },    },
549    is_html_ascii_superset => 1,    ## TODO: |error_level|
550  });  });
551    
552  $Charset->{'jis_encoding'}  $Charset->{'jis_encoding'}
553  = $IANACharset->{'jis_encoding'}  = $IANACharset->{'jis_encoding'}
554  = $IANACharset->{'csjisencoding'}  = $IANACharset->{'csjisencoding'}
555    = $HTMLCharset->{'jisencoding'}
556  = __PACKAGE__->new ({  = __PACKAGE__->new ({
557    category => 0,    category => 0,
558    iana_names => {    iana_names => {
# Line 393  $Charset->{'shift_jis'} Line 566  $Charset->{'shift_jis'}
566  = $IANACharset->{'shift_jis'}  = $IANACharset->{'shift_jis'}
567  = $IANACharset->{'ms_kanji'}  = $IANACharset->{'ms_kanji'}
568  = $IANACharset->{'csshiftjis'}  = $IANACharset->{'csshiftjis'}
569    = $HTMLCharset->{'shiftjis'}
570    = $HTMLCharset->{'mskanji'}
571  = __PACKAGE__->new ({  = __PACKAGE__->new ({
572    category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
573          CHARSET_CATEGORY_MIME_TEXT,
574    iana_names => {    iana_names => {
575      'shift_jis' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'shift_jis' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
576      'ms_kanji' => REGISTERED_CHARSET_NAME,      'ms_kanji' => REGISTERED_CHARSET_NAME,
# Line 407  $Charset->{'shift_jis'} Line 583  $Charset->{'shift_jis'}
583          ERROR_REPORTING_ENCODING_IMPL,          ERROR_REPORTING_ENCODING_IMPL,
584          ## NOTE: Unicode mapping is wrong.          ## NOTE: Unicode mapping is wrong.
585    },    },
586    mime_text_suitable => 1,    ## TODO: |error_level|
587  });  });
588    
589  $Charset->{'x-sjis'}  $Charset->{'x-sjis'}
590  = $IANACharset->{'x-sjis'}  = $IANACharset->{'x-sjis'}
591    = $HTMLCharset->{'xsjis'}
592  = __PACKAGE__->new ({  = __PACKAGE__->new ({
593    category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
594          CHARSET_CATEGORY_ASCII_COMPAT,
595    iana_names => {    iana_names => {
596      'x-sjis' => UNREGISTERED_CHARSET_NAME,      'x-sjis' => UNREGISTERED_CHARSET_NAME,
597    },    },
598    perl_names => {    perl_names => {
599      'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,      'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
600    },    },
601    mime_text_suitable => 1,    ## TODO: |error_level|
602  });  });
603    
604  $Charset->{shift_jisx0213}  $Charset->{shift_jisx0213}
605  = $IANACharset->{shift_jisx0213}  = $IANACharset->{shift_jisx0213}
606    = $HTMLCharset->{shiftjisx0213}
607  = __PACKAGE__->new ({  = __PACKAGE__->new ({
608    category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
609          CHARSET_CATEGORY_MIME_TEXT,
610    iana_names => {    iana_names => {
611      shift_jisx0213 => UNREGISTERED_CHARSET_NAME,      shift_jisx0213 => UNREGISTERED_CHARSET_NAME,
612    },    },
# Line 435  $Charset->{shift_jisx0213} Line 615  $Charset->{shift_jisx0213}
615      'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,      'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
616      'shiftjis' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,      'shiftjis' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
617    },    },
618    mime_text_suitable => 1,    ## TODO: |error_level|
619  });  });
620    
621  $Charset->{'euc-jp'}  $Charset->{'euc-jp'}
622  = $IANACharset->{'extended_unix_code_packed_format_for_japanese'}  = $IANACharset->{'extended_unix_code_packed_format_for_japanese'}
623  = $IANACharset->{'cseucpkdfmtjapanese'}  = $IANACharset->{'cseucpkdfmtjapanese'}
624  = $IANACharset->{'euc-jp'}  = $IANACharset->{'euc-jp'}
625  = $IANACharset->{'x-euc-jp'}  = $HTMLCharset->{'extendedunixcodepackedformatforjapanese'}
626    = $HTMLCharset->{'cseucpkdfmtjapanese'}
627    = $HTMLCharset->{'eucjp'}
628  = __PACKAGE__->new ({  = __PACKAGE__->new ({
629    category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE |
630          CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_ASCII_COMPAT,
631    iana_names => {    iana_names => {
632      'extended_unix_code_packed_format_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'extended_unix_code_packed_format_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
633      'cseucpkdfmtjapanese' => REGISTERED_CHARSET_NAME,      'cseucpkdfmtjapanese' => REGISTERED_CHARSET_NAME,
# Line 461  $Charset->{'euc-jp'} Line 644  $Charset->{'euc-jp'}
644          ERROR_REPORTING_ENCODING_IMPL,          ERROR_REPORTING_ENCODING_IMPL,
645          ## NOTE: Unicode mapping is wrong.          ## NOTE: Unicode mapping is wrong.
646    },    },
647    is_html_ascii_superset => 1,    ## TODO: |error_level|
   mime_text_suitable => 1,  
648  });  });
649    
650  $Charset->{'x-euc-jp'}  $Charset->{'x-euc-jp'}
651  = $IANACharset->{'x-euc-jp'}  = $IANACharset->{'x-euc-jp'}
652    = $HTMLCharset->{'xeucjp'}
653  = __PACKAGE__->new ({  = __PACKAGE__->new ({
654    category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE |
655          CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_ASCII_COMPAT,
656    iana_names => {    iana_names => {
657      'x-euc-jp' => UNREGISTERED_CHARSET_NAME,      'x-euc-jp' => UNREGISTERED_CHARSET_NAME,
658    },    },
# Line 476  $Charset->{'x-euc-jp'} Line 660  $Charset->{'x-euc-jp'}
660      'euc-jp-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,      'euc-jp-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
661      'euc-jp' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,      'euc-jp' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
662    },    },
   is_html_ascii_superset => 1,  is_html_ascii_superset => 1,  
   mime_text_suitable => 1,  
663  });  });
664    
665  $Charset->{'extended_unix_code_fixed_width_for_japanese'}  $Charset->{'extended_unix_code_fixed_width_for_japanese'}
666  = $IANACharset->{'extended_unix_code_fixed_width_for_japanese'}  = $IANACharset->{'extended_unix_code_fixed_width_for_japanese'}
667  = $IANACharset->{'cseucfixwidjapanese'}  = $IANACharset->{'cseucfixwidjapanese'}
668    = $HTMLCharset->{'extendedunixcodefixedwidthforjapanese'}
669  = __PACKAGE__->new ({  = __PACKAGE__->new ({
670    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE,
671    iana_names => {    iana_names => {
672      'extended_unix_code_fixed_width_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'extended_unix_code_fixed_width_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
673      'cseucfixwidjapanese' => REGISTERED_CHARSET_NAME,      'cseucfixwidjapanese' => REGISTERED_CHARSET_NAME,
674    },    },
675      ## TODO: |error_level|
676  });  });
677    
678  ## TODO: ...  ## TODO: ...
# Line 496  $Charset->{'extended_unix_code_fixed_wid Line 680  $Charset->{'extended_unix_code_fixed_wid
680  $Charset->{'euc-kr'}  $Charset->{'euc-kr'}
681  = $IANACharset->{'euc-kr'}  = $IANACharset->{'euc-kr'}
682  = $IANACharset->{'cseuckr'}  = $IANACharset->{'cseuckr'}
683    = $HTMLCharset->{'euckr'}
684  = __PACKAGE__->new ({  = __PACKAGE__->new ({
685    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
686    iana_names => {    iana_names => {
687      'euc-kr' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'euc-kr' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
688      'cseuckr' => REGISTERED_CHARSET_NAME,      'cseuckr' => REGISTERED_CHARSET_NAME,
689    },    },
690    is_html_ascii_superset => 1,    perl_names => {
691        ## TODO: We need a parse error generating wrapper for the decoder.
692        'cp949' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
693      },
694      ## NOTE: |euc-kr| is handled as |windows-949|, such that properties
695      ## should be consistent with that encoding's properties.
696  });  });
697    
698  $Charset->{'iso-2022-jp'}  $Charset->{'iso-2022-jp'}
# Line 510  $Charset->{'iso-2022-jp'} Line 700  $Charset->{'iso-2022-jp'}
700  = $IANACharset->{'csiso2022jp'}  = $IANACharset->{'csiso2022jp'}
701  = $IANACharset->{'iso2022jp'}  = $IANACharset->{'iso2022jp'}
702  = $IANACharset->{'junet-code'}  = $IANACharset->{'junet-code'}
703    = $HTMLCharset->{'iso2022jp'}
704    = $HTMLCharset->{'junetcode'}
705  = __PACKAGE__->new ({  = __PACKAGE__->new ({
706    category => 0,    category => CHARSET_CATEGORY_MIME_TEXT,
707    iana_names => {    iana_names => {
708      'iso-2022-jp' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'iso-2022-jp' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
709      'csiso2022jp' => REGISTERED_CHARSET_NAME,      'csiso2022jp' => REGISTERED_CHARSET_NAME,
710      'iso2022jp' => UNREGISTERED_CHARSET_NAME,      'iso2022jp' => UNREGISTERED_CHARSET_NAME,
711      'junet-code' => UNREGISTERED_CHARSET_NAME,      'junet-code' => UNREGISTERED_CHARSET_NAME,
712    },    },
713    mime_text_suitable => 1,    ## TODO: |error_level|
714  });  });
715    
716  $Charset->{'iso-2022-jp-2'}  $Charset->{'iso-2022-jp-2'}
717  = $IANACharset->{'iso-2022-jp-2'}  = $IANACharset->{'iso-2022-jp-2'}
718  = $IANACharset->{'csiso2022jp2'}  = $IANACharset->{'csiso2022jp2'}
719    = $HTMLCharset->{'iso2022jp2'}
720  = __PACKAGE__->new ({  = __PACKAGE__->new ({
721    category => 0,    category => CHARSET_CATEGORY_MIME_TEXT,
722    iana_names => {    iana_names => {
723      'iso-2022-jp-2' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'iso-2022-jp-2' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
724      'csiso2022jp2' => REGISTERED_CHARSET_NAME,      'csiso2022jp2' => REGISTERED_CHARSET_NAME,
725    },    },
726    mime_text_suitable => 1,    ## TODO: |error_level|
727    });
728    
729    ## TODO: ...
730    
731    $IANACharset->{'gb_2312-80'}
732    = $IANACharset->{'iso-ir-58'}
733    = $IANACharset->{chinese}
734    = $HTMLCharset->{gb231280}
735    = $HTMLCharset->{isoir58}
736    = __PACKAGE__->new ({
737      ## NOTE: What is represented by this charset is unclear...  I don't
738      ## understand what RFC 1945 describes...
739      category => 0,
740      iana_names => {
741        'gb_2312-80' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
742        'iso-ir-58' => REGISTERED_CHARSET_NAME,
743        'chinese' => REGISTERED_CHARSET_NAME,
744        'csiso58gb231280' => REGISTERED_CHARSET_NAME,
745      },
746      perl_names => {
747        ## TODO: GB2312->GBK Parse Error wrapper
748        'cp936' => FALLBACK_ENCODING_IMPL,
749      },
750      ## NOTE: |gb2312| is handled as |gbk|, such that properties should be
751      ## consistent.
752  });  });
753    
754  ## TODO: ...  ## TODO: ...
# Line 538  $Charset->{'iso-2022-jp-2'} Line 756  $Charset->{'iso-2022-jp-2'}
756  $Charset->{'utf-8'}  $Charset->{'utf-8'}
757  = $IANACharset->{'utf-8'}  = $IANACharset->{'utf-8'}
758  = $IANACharset->{'x-utf-8'}  = $IANACharset->{'x-utf-8'}
759    = $HTMLCharset->{'utf8'}
760    = $HTMLCharset->{'xutf8'}
761  = __PACKAGE__->new ({  = __PACKAGE__->new ({
762    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT |
763          CHARSET_CATEGORY_MIME_TEXT,
764    iana_names => {    iana_names => {
765      'utf-8' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'utf-8' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
766          ## NOTE: IANA name "utf-8" references RFC 3629.  According to the RFC,          ## NOTE: IANA name "utf-8" references RFC 3629.  According to the RFC,
767          ## the definitive definition is one specified in the Unicode Standard.          ## the definitive definition is one specified in the Unicode Standard.
768      'x-utf-8' => UNREGISTERED_CHARSET_NAME,      'x-utf-8' => UNREGISTERED_CHARSET_NAME,
769            ## NOTE: We treat |x-utf-8| as an alias of |utf-8|, since unlike
770            ## other charsets such as |x-sjis| or |x-euc-jp|, there is no major
771            ## variant of the UTF-8 encoding.
772                     ## TODO: We might need to reconsider this policy, since
773                     ## there are in fact UTF-8 variants, such as
774                     ## Unicode's UTF-8, ISO/IEC 10646's UTF-8, UTF-8n, and
775                     ## so on.
776    },    },
777    perl_names => {    perl_names => {
778      'utf-8-strict' => PRIMARY_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |      'utf-8-strict' => PRIMARY_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
# Line 554  $Charset->{'utf-8'} Line 782  $Charset->{'utf-8'}
782          ## It does not support surrogate pairs (conforming).          ## It does not support surrogate pairs (conforming).
783          ## It does not support BOMs (non-conforming).          ## It does not support BOMs (non-conforming).
784    },    },
785      ## TODO: |error_level|
786    bom_pattern => qr/\xEF\xBB\xBF/,    bom_pattern => qr/\xEF\xBB\xBF/,
   is_html_ascii_superset => 1,  
   mime_text_suitable => 1,  
787  });  });
788    
789  $Charset->{'utf-8n'}  $Charset->{'utf-8n'}
790  = $IANACharset->{'utf-8n'}  = $IANACharset->{'utf-8n'}
791    = $HTMLCharset->{'utf-8'}
792  = __PACKAGE__->new ({  = __PACKAGE__->new ({
793    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT |
794          CHARSET_CATEGORY_ASCII_COMPAT,
795    iana_names => {    iana_names => {
796      'utf-8n' => UNREGISTERED_CHARSET_NAME,      'utf-8n' => UNREGISTERED_CHARSET_NAME,
797          ## NOTE: Is there any normative definition for the charset?          ## NOTE: Is there any normative definition for the charset?
# Line 571  $Charset->{'utf-8n'} Line 800  $Charset->{'utf-8n'}
800    perl_names => {    perl_names => {
801      'utf-8-strict' => PRIMARY_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,      'utf-8-strict' => PRIMARY_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
802    },    },
803    is_html_ascii_superset => 1,    ## TODO: |error_level|
   mime_text_suitable => 1,  
804  });  });
805    
806  ## TODO: ...  ## TODO: ...
# Line 582  $Charset->{'gbk'} Line 810  $Charset->{'gbk'}
810  = $IANACharset->{'cp936'}  = $IANACharset->{'cp936'}
811  = $IANACharset->{'ms936'}  = $IANACharset->{'ms936'}
812  = $IANACharset->{'windows-936'}  = $IANACharset->{'windows-936'}
813    = $HTMLCharset->{'windows936'}
814  = __PACKAGE__->new ({  = __PACKAGE__->new ({
815    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
816    iana_names => {    iana_names => {
817      'gbk' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'gbk' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
818      'cp936' => REGISTERED_CHARSET_NAME,      'cp936' => REGISTERED_CHARSET_NAME,
819      'ms936' => REGISTERED_CHARSET_NAME,      'ms936' => REGISTERED_CHARSET_NAME,
820      'windows-936' => REGISTERED_CHARSET_NAME,      'windows-936' => REGISTERED_CHARSET_NAME,
821    },    },
822      ## TODO: |error_level|
823    iana_status => STATUS_COMMON | STATUS_OBSOLETE,    iana_status => STATUS_COMMON | STATUS_OBSOLETE,
   mime_text_suitable => 1,  
824  });  });
825    
826  $Charset->{'gb18030'}  $Charset->{'gb18030'}
827  = $IANACharset->{'gb18030'}  = $IANACharset->{'gb18030'}
828  = __PACKAGE__->new ({  = __PACKAGE__->new ({
829    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
830    iana_names => {    iana_names => {
831      'gb18030' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'gb18030' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
832    },    },
# Line 609  $Charset->{'gb18030'} Line 838  $Charset->{'gb18030'}
838    
839  $Charset->{'utf-16be'}  $Charset->{'utf-16be'}
840  = $IANACharset->{'utf-16be'}  = $IANACharset->{'utf-16be'}
841    = $HTMLCharset->{'utf16be'}
842  = __PACKAGE__->new ({  = __PACKAGE__->new ({
843    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
844    iana_names => {    iana_names => {
845      'utf-16be' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'utf-16be' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
846    },    },
847      ## TODO: |error_level|
848  });  });
849    
850  $Charset->{'utf-16le'}  $Charset->{'utf-16le'}
851  = $IANACharset->{'utf-16le'}  = $IANACharset->{'utf-16le'}
852    = $HTMLCharset->{'utf16le'}
853  = __PACKAGE__->new ({  = __PACKAGE__->new ({
854    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
855    iana_names => {    iana_names => {
856      'utf-16le' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'utf-16le' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
857    },    },
858      ## TODO: |error_level|
859  });  });
860    
861  $Charset->{'utf-16'}  $Charset->{'utf-16'}
862  = $IANACharset->{'utf-16'}  = $IANACharset->{'utf-16'}
863    = $HTMLCharset->{'utf16'}
864  = __PACKAGE__->new ({  = __PACKAGE__->new ({
865    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
866    iana_names => {    iana_names => {
867      'utf-16' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'utf-16' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
868    },    },
869      ## TODO: |error_level|
870  });  });
871    
872  ## TODO: ...  ## TODO: ...
# Line 639  $Charset->{'utf-16'} Line 874  $Charset->{'utf-16'}
874  $Charset->{'windows-31j'}  $Charset->{'windows-31j'}
875  = $IANACharset->{'windows-31j'}  = $IANACharset->{'windows-31j'}
876  = $IANACharset->{'cswindows31j'}  = $IANACharset->{'cswindows31j'}
877    = $HTMLCharset->{'windows31j'}
878  = __PACKAGE__->new ({  = __PACKAGE__->new ({
879    category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
880          CHARSET_CATEGORY_MIME_TEXT,
881    iana_names => {    iana_names => {
882      'windows-31j' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'windows-31j' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
883      'cswindows31j' => REGISTERED_CHARSET_NAME,      'cswindows31j' => REGISTERED_CHARSET_NAME,
884    },    },
885    iana_status => STATUS_LIMITED_USE, # maybe    iana_status => STATUS_LIMITED_USE, # maybe
886    mime_text_suitable => 1,    ## TODO: |error_level|
887  });  });
888    
889  $Charset->{'gb2312'}  $Charset->{'gb2312'}
890  = $IANACharset->{'gb2312'}  = $IANACharset->{'gb2312'}
891  = $IANACharset->{'csgb2312'}  = $IANACharset->{'csgb2312'}
892  = __PACKAGE__->new ({  = __PACKAGE__->new ({
893    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT |
894          CHARSET_CATEGORY_ASCII_COMPAT,
895    iana_names => {    iana_names => {
896      'gb2312' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'gb2312' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
897      'csgb2312' => REGISTERED_CHARSET_NAME,      'csgb2312' => REGISTERED_CHARSET_NAME,
898    },    },
899    is_html_ascii_superset => 1,    perl_names => {
900    mime_text_suitable => 1,      ## TODO: GB2312->GBK Parse Error wrapper
901        'cp936' => FALLBACK_ENCODING_IMPL,
902      },
903      ## NOTE: |gb2312| is handled as |gbk|, such that properties should be
904      ## consistent.
905  });  });
906    
907  $Charset->{'big5'}  $Charset->{'big5'}
908  = $IANACharset->{'big5'}  = $IANACharset->{'big5'}
909  = $IANACharset->{'csbig5'}  = $IANACharset->{'csbig5'}
910    = $IANACharset->{'x-x-big5'}
911    = $HTMLCharset->{xxbig5}
912  = __PACKAGE__->new ({  = __PACKAGE__->new ({
913    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
914    iana_names => {    iana_names => {
915      'big5' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'big5' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
916      'csbig5' => REGISTERED_CHARSET_NAME,      'csbig5' => REGISTERED_CHARSET_NAME,
917        'x-x-big5' => UNREGISTERED_CHARSET_NAME,
918            ## NOTE: In HTML5, |x-x-big5| is defined as an alias of |big5|.
919            ## According to that spec, if there is any difference between
920            ## input and replacement encodings, the result is parse error.
921            ## However, since there is no formal definition for |x-x-big5|
922            ## charset, we cannot raise such errors.
923    },    },
924    mime_text_suitable => 1,    ## TODO: |error_level|
925  });  });
926    
927  ## TODO: ...  ## TODO: ...
928    
929  $Charset->{'big5-hkscs'}  $Charset->{'big5-hkscs'}
930  = $IANACharset->{'big5-hkscs'}  = $IANACharset->{'big5-hkscs'}
931    = $HTMLCharset->{'big5hkscs'}
932  = __PACKAGE__->new ({  = __PACKAGE__->new ({
933    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
934    iana_names => {    iana_names => {
935      'big5-hkscs' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'big5-hkscs' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
936    },    },
937    mime_text_suitable => 1,    ## TODO: |error_level|
938  });  });
939    
940  ## TODO: ...  ## TODO: ...
941    
942  $Charset->{'windows-1252'}  $Charset->{'windows-1252'}
943  = $IANACharset->{'windows-1252'}  = $IANACharset->{'windows-1252'}
944    = $HTMLCharset->{'windows1252'}
945  = __PACKAGE__->new ({  = __PACKAGE__->new ({
946    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT |
947          CHARSET_CATEGORY_MIME_TEXT,
948    iana_names => {    iana_names => {
949      'windows-1252' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'windows-1252' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
950    },    },
951    is_html_ascii_superset => 1,    ## TODO: Check whether use of 0x81 is conforming or not...
952    });
953    
954    $Charset->{'windows-1253'}
955    = $IANACharset->{'windows-1253'}
956    = $HTMLCharset->{'windows1253'}
957    = __PACKAGE__->new ({
958      category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT |
959          CHARSET_CATEGORY_MIME_TEXT,
960      iana_names => {
961        'windows-1253' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
962      },
963      ## TODO: Check whether use of 0x81 is conforming or not...
964    });
965    
966    $Charset->{'windows-1254'}
967    = $IANACharset->{'windows-1254'}
968    = $HTMLCharset->{'windows1254'}
969    = __PACKAGE__->new ({
970      category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT |
971          CHARSET_CATEGORY_MIME_TEXT,
972      iana_names => {
973        'windows-1254' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
974      },
975      ## TODO: Check whether use of 0x81 is conforming or not...
976  });  });
977    
978  ## TODO: ...  ## TODO: ...
979    
980  $Charset->{'tis-620'}  $Charset->{'tis-620'}
981  = $IANACharset->{'tis-620'}  = $IANACharset->{'tis-620'}
982    = $HTMLCharset->{'tis620'}
983  = __PACKAGE__->new ({  = __PACKAGE__->new ({
984    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
985    iana_names => {    iana_names => {
986      'tis-620' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,      'tis-620' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
987    },    },
988    perl_names => {    perl_names => {
989      'tis-620' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,      'web-tis-620' => UNREGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
990          ## NOTE: An alias of |iso-8859-11|.      'windows-874' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
991    },    },
992    is_html_ascii_superset => 1,    fallback => {
993        "\x80" => "\x{20AC}",
994        "\x81" => undef, "\x82" => undef, "\x83" => undef, "\x84" => undef,
995        "\x85" => "\x{2026}",
996        "\x86" => undef, "\x87" => undef, "\x88" => undef, "\x89" => undef,
997        "\x8A" => undef, "\x8B" => undef, "\x8C" => undef, "\x8D" => undef,
998        "\x8E" => undef, "\x8F" => undef, "\x90" => undef,
999        "\x91" => "\x{2018}",
1000        "\x92" => "\x{2019}",
1001        "\x93" => "\x{201C}",
1002        "\x94" => "\x{201D}",
1003        "\x95" => "\x{2022}",
1004        "\x96" => "\x{2013}",
1005        "\x97" => "\x{2014}",
1006        "\x98" => undef, "\x99" => undef, "\x9A" => undef, "\x9B" => undef,
1007        "\x9C" => undef, "\x9D" => undef, "\x9E" => undef, "\x9F" => undef,
1008        "\xA0" => "\xA0",
1009      },
1010      ## NOTE: |tis-620| is treated as |windows-874|, so ensure that
1011      ## they are consistent.
1012  });  });
1013    
1014  $Charset->{'iso-8859-11'}  $Charset->{'iso-8859-11'}
1015  = $IANACharset->{'iso-8859-11'}  = $IANACharset->{'iso-8859-11'}
1016    = $HTMLCharset->{'iso885911'}
1017  = __PACKAGE__->new ({  = __PACKAGE__->new ({
1018    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
1019    iana_names => {    iana_names => {
1020      'iso-8859-11' => UNREGISTERED_CHARSET_NAME,      'iso-8859-11' => UNREGISTERED_CHARSET_NAME,
1021          ## NOTE: The Web Thai encoding, i.e. windows-874.          ## NOTE: The Web Thai encoding, i.e. windows-874.
1022    },    },
1023    perl_names => {    perl_names => {
     'windows-874' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,  
1024      'web-thai' => UNREGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,      'web-thai' => UNREGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
1025        'windows-874' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
1026    },    },
1027    fallback => {    fallback => {
1028      "\x80" => "\x{20AC}",      "\x80" => "\x{20AC}",
1029        "\x81" => undef, "\x82" => undef, "\x83" => undef, "\x84" => undef,
1030      "\x85" => "\x{2026}",      "\x85" => "\x{2026}",
1031        "\x86" => undef, "\x87" => undef, "\x88" => undef, "\x89" => undef,
1032        "\x8A" => undef, "\x8B" => undef, "\x8C" => undef, "\x8D" => undef,
1033        "\x8E" => undef, "\x8F" => undef, "\x90" => undef,
1034      "\x91" => "\x{2018}",      "\x91" => "\x{2018}",
1035      "\x92" => "\x{2019}",      "\x92" => "\x{2019}",
1036      "\x93" => "\x{201C}",      "\x93" => "\x{201C}",
# Line 736  $Charset->{'iso-8859-11'} Line 1038  $Charset->{'iso-8859-11'}
1038      "\x95" => "\x{2022}",      "\x95" => "\x{2022}",
1039      "\x96" => "\x{2013}",      "\x96" => "\x{2013}",
1040      "\x97" => "\x{2014}",      "\x97" => "\x{2014}",
1041        "\x98" => undef, "\x99" => undef, "\x9A" => undef, "\x9B" => undef,
1042        "\x9C" => undef, "\x9D" => undef, "\x9E" => undef, "\x9F" => undef,
1043    },    },
1044    is_html_ascii_superset => 1,    ## NOTE: |iso-8859-11| is treated as |windows-874|, so ensure that
1045      ## they are consistent.
1046  });  });
1047    
1048  $Charset->{'windows-874'}  $Charset->{'windows-874'}
1049  = $IANACharset->{'windows-874'}  = $IANACharset->{'windows-874'}
1050    = $HTMLCharset->{'windows874'}
1051  = __PACKAGE__->new ({  = __PACKAGE__->new ({
1052    category => CHARSET_CATEGORY_BLOCK_SAFE,    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
1053    iana_names => {    iana_names => {
1054      'windows-874' => UNREGISTERED_CHARSET_NAME,      'windows-874' => UNREGISTERED_CHARSET_NAME,
1055    },    },
1056    perl_names => {    perl_names => {
1057      'windows-874' => REGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,      'windows-874' => REGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
1058    },    },
1059    is_html_ascii_superset => 1,    ## TODO: |error_level|
1060    });
1061    
1062    $IANACharset->{'windows-949'}
1063    = $HTMLCharset->{windows949}
1064    = __PACKAGE__->new ({
1065      category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
1066      iana_names => {
1067        'windows-949' => UNREGISTERED_CHARSET_NAME,
1068      },
1069      perl_names => {
1070        'cp949' => PREFERRED_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
1071            ERROR_REPORTING_ENCODING_IMPL,
1072            ## TODO: Is this implementation conforming?
1073      },
1074      ## NOTE: |error_level| is same as default, since we can't find any formal
1075      ## definition for this charset.
1076  });  });
1077    
1078  sub new ($$) {  sub new ($$) {
# Line 782  sub get_decode_handle ($$;%) { Line 1104  sub get_decode_handle ($$;%) {
1104      charset => '', ## TODO: We set a charset name for input_encoding (when we get identify-by-URI nonsense away)      charset => '', ## TODO: We set a charset name for input_encoding (when we get identify-by-URI nonsense away)
1105      byte_buffer => $opt{byte_buffer} ? ${$opt{byte_buffer}} : '', ## TODO: ref, instead of value, should be used      byte_buffer => $opt{byte_buffer} ? ${$opt{byte_buffer}} : '', ## TODO: ref, instead of value, should be used
1106      onerror => $opt{onerror} || sub {},      onerror => $opt{onerror} || sub {},
1107      must_level => 'm',      level => $opt{level} || {
1108      fact_level => 'm',        must => 'm',
1109          charset_variant => 'm',
1110          charset_fact => 'm',
1111          iso_shall => 'm',
1112        },
1113        error_level => $self->{error_level} || {
1114          ## HTML5 charset name aliases
1115              ## NOTE: Use of code points in the variant whose definition differs
1116              ## from the original charset is a parse error in HTML5.  However,
1117              ## it does not affect the document conformance; the HTML5 spec
1118              ## does not define the conformance of the input stream against the
1119              ## charset in use.
1120          'fallback-char-error' => 'charset_variant',
1121          #'fallback-illegal-error' => 'charset_variant',
1122          'fallback-unassigned-error' => 'charset_variant',
1123              ## NOTE: An appropriate error level should be set for each charset
1124              ## (many charsets prohibit the use of unassigned code points).
1125    
1126          'illegal-octets-error' => 'charset_fact',
1127          'unassigned-code-point-error' => 'charset_fact',
1128          'invalid-state-error' => 'charset_fact',
1129        },
1130    };    };
1131    
1132    require Whatpm::Charset::DecodeHandle;    require Whatpm::Charset::DecodeHandle;
# Line 853  sub get_perl_encoding ($;%) { Line 1196  sub get_perl_encoding ($;%) {
1196        require Encode::EUCJP1997;        require Encode::EUCJP1997;
1197      } elsif ($name eq 'shift-jis-1997') {      } elsif ($name eq 'shift-jis-1997') {
1198        require Encode::ShiftJIS1997;        require Encode::ShiftJIS1997;
1199      } elsif ($name eq 'web-latin1') {      } elsif ({'web-latin1' => 1,
1200                  'web-latin1-us-ascii' => 1,
1201                  'web-latin5' => 1}->{$name}) {
1202        require Whatpm::Charset::WebLatin1;        require Whatpm::Charset::WebLatin1;
1203      } elsif ($name eq 'web-thai') {      } elsif ($name eq 'web-thai' or $name eq 'web-tis-620') {
1204        require Whatpm::Charset::WebThai;        require Whatpm::Charset::WebThai;
1205      }      }
1206    }; # $load_encode    }; # $load_encode
# Line 870  sub get_perl_encoding ($;%) { Line 1215  sub get_perl_encoding ($;%) {
1215                
1216        $load_encode->($perl_name);        $load_encode->($perl_name);
1217        my $e = Encode::find_encoding ($perl_name);        my $e = Encode::find_encoding ($perl_name);
1218        if ($e) {        if ($e and $e->name eq $perl_name) {
1219            ## NOTE: Don't return $e unless $e->name eq $perl_name, since
1220            ## |find_encoding| resolves e.g. |foobarlatin-1| to |iso-8859-1|,
1221            ## which might yield the wrong encoding object when a dedicated
1222            ## implementation that is not part of the standard Perl
1223            ## distribution is desired.
1224          return ($e, $perl_status);          return ($e, $perl_status);
1225        }        }
1226      }      }
# Line 945  sub get_iana_name ($) { Line 1295  sub get_iana_name ($) {
1295  sub is_syntactically_valid_iana_charset_name ($) {  sub is_syntactically_valid_iana_charset_name ($) {
1296    my $name = shift;    my $name = shift;
1297    return $name =~ /\A[\x20-\x7E]{1,40}\z/;    return $name =~ /\A[\x20-\x7E]{1,40}\z/;
1298    
1299      ## NOTE: According to IANAREG, "The character set names may be up to 40
1300      ## characters taken from the printable characters of US-ASCII.  However,
1301      ## no distinction is made between use of upper and lower case letters.".
1302  } # is_syntactically_valid_iana_charset_name  } # is_syntactically_valid_iana_charset_name
1303    
1304  1;  1;

Legend:
Removed from v.1.8  
changed lines
  Added in v.1.9

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24