/[suikacvs]/messaging/manakai/lib/Message/Charset/Info.pm
Suika

Contents of /messaging/manakai/lib/Message/Charset/Info.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.15 - (show annotations) (download)
Mon Sep 15 07:19:33 2008 UTC (16 years, 2 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.14: +3 -2 lines
++ manakai/lib/Message/Charset/ChangeLog	15 Sep 2008 07:19:29 -0000
2008-09-15  Wakaba  <wakaba@suika.fam.cx>

	* Info.pm: Noted that new internal variable |{onerror_set}|
	is introduced.

1 package Message::Charset::Info;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.14 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 ## TODO: Certain encodings MUST NOT be implemented [HTML5].
6
7 ## ISSUE: Should we convert an unassigned code point that has a trivial
8 ## Unicode mapping into U+FFFD? Or should we return that Unicode character
9 ## together with an error? (For example, should Windows-1252's 0x81 be
10 ## converted to U+FFFD or to U+0081?)
11
12 sub UNREGISTERED_CHARSET_NAME () { 0b1 }
13 ## Names for non-standard encodings/implementations for Perl encodings
14 sub REGISTERED_CHARSET_NAME () { 0b10 }
15 ## Names for standard encodings for Perl encodings
16 sub PRIMARY_CHARSET_NAME () { 0b100 }
17 ## "Name:" field for IANA names
18 ## Canonical name for Perl encodings
19 sub PREFERRED_CHARSET_NAME () { 0b1000 }
20 ## "preferred MIME name" for IANA names
21
22 sub FALLBACK_ENCODING_IMPL () { 0b10000 }
23 ## For Perl encodings: Not a name of the encoding itself; the encoding
24 ## with this name might be useful as a fallback when the correct
25 ## encoding is not supported.
26 sub NONCONFORMING_ENCODING_IMPL () { FALLBACK_ENCODING_IMPL } ## alias: same bit as FALLBACK_ENCODING_IMPL
27 ## For Perl encodings: Not a conforming implementation of the encoding,
28 ## though it seems that the intention was to implement that encoding.
29 sub SEMICONFORMING_ENCODING_IMPL () { 0b1000000 } ## NOTE: declared before the lower bit 0b100000 below
30 ## For Perl encodings: The implementation itself (returned by
31 ## |get_perl_encoding|) is non-conforming. The decode handle
32 ## implementation (returned by |get_decode_handle|) is conforming.
33 sub ERROR_REPORTING_ENCODING_IMPL () { 0b100000 }
34 ## For Perl encodings: Supports error reporting via the |manakai_onerror|
35 ## handler when the encoding is handled with a decode handle.
36
37 ## iana_status
38 sub STATUS_COMMON () { 0b1 }
39 sub STATUS_LIMITED_USE () { 0b10 }
40 sub STATUS_OBSOLETE () { 0b100 }
41
42 ## category
43 sub CHARSET_CATEGORY_BLOCK_SAFE () { 0b1 }
44 ## NOTE: Stateless
45 sub CHARSET_CATEGORY_EUCJP () { 0b10 }
46 sub CHARSET_CATEGORY_SJIS () { 0b100 }
47 sub CHARSET_CATEGORY_UTF16 () { 0b1000 }
48 ## NOTE: "A UTF-16 encoding" in HTML5.
49 sub CHARSET_CATEGORY_ASCII_COMPAT () { 0b10000 }
50 ## NOTE: "superset of US-ASCII (specifically, ANSI_X3.4-1968)
51 ## for bytes in the range 0x09-0x0A, 0x0C-0x0D, 0x20-0x22, 0x26, 0x27,
52 ## 0x2C-0x3F, 0x41-0x5A, and 0x61-0x7A" [HTML5]
53 sub CHARSET_CATEGORY_EBCDIC () { 0b100000 }
54 ## NOTE: "based on EBCDIC" in HTML5.
55 sub CHARSET_CATEGORY_MIME_TEXT () { 0b1000000 }
56 ## NOTE: Suitable as MIME text.
57
58 ## ISSUE: Shift_JIS is a superset of US-ASCII? ISO-2022-JP is?
59 ## ISSUE: 0x5F (_) should be added to the range?
60
61 my $Charset; ## TODO: this is obsolete.
62
63 our $IANACharset;
64 ## NOTE: Charset names used where IANA charset names are allowed, either
65 ## registered or not.
66 our $HTMLCharset;
67 ## NOTE: Same as the charset names in $IANACharset, except that all ASCII
68 ## punctuation is dropped; names made only of letters/digits are not included.
69
70 $Charset->{'us-ascii'}
71 = $IANACharset->{'ansi_x3.4-1968'}
72 = $IANACharset->{'iso-ir-6'}
73 = $IANACharset->{'ansi_x3.4-1986'}
74 = $IANACharset->{'iso_646.irv:1991'}
75 = $IANACharset->{'ascii'}
76 = $IANACharset->{'iso646-us'}
77 = $IANACharset->{'us-ascii'}
78 = $IANACharset->{'us'}
79 = $IANACharset->{'ibm367'}
80 = $IANACharset->{'cp367'}
81 = $IANACharset->{'csascii'}
82 = $HTMLCharset->{'ansix341968'}
83 = $HTMLCharset->{'isoir6'}
84 = $HTMLCharset->{'ansix341986'}
85 = $HTMLCharset->{'iso646irv1991'}
86 = $HTMLCharset->{'iso646us'}
87 = $HTMLCharset->{'usascii'}
88 = __PACKAGE__->new ({
89 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
90 iana_names => {
91 'ansi_x3.4-1968' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
92 'iso-ir-6' => REGISTERED_CHARSET_NAME,
93 'ansi_x3.4-1986' => REGISTERED_CHARSET_NAME,
94 'iso_646.irv:1991' => REGISTERED_CHARSET_NAME,
95 'ascii' => REGISTERED_CHARSET_NAME,
96 'iso646-us' => REGISTERED_CHARSET_NAME,
97 'us-ascii' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
98 'us' => REGISTERED_CHARSET_NAME,
99 'ibm367' => REGISTERED_CHARSET_NAME,
100 'cp367' => REGISTERED_CHARSET_NAME,
101 'csascii' => REGISTERED_CHARSET_NAME,
102 },
103 perl_names => {
104 'web-latin1-us-ascii' => UNREGISTERED_CHARSET_NAME |
105 SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
106 'cp1252' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
107 },
108 fallback => {
109 "\x80" => "\x{20AC}",
110 "\x81" => undef,
111 "\x82" => "\x{201A}",
112 "\x83" => "\x{0192}",
113 "\x84" => "\x{201E}",
114 "\x85" => "\x{2026}",
115 "\x86" => "\x{2020}",
116 "\x87" => "\x{2021}",
117 "\x88" => "\x{02C6}",
118 "\x89" => "\x{2030}",
119 "\x8A" => "\x{0160}",
120 "\x8B" => "\x{2039}",
121 "\x8C" => "\x{0152}",
122 "\x8D" => undef,
123 "\x8E" => "\x{017D}",
124 "\x8F" => undef,
125 "\x90" => undef,
126 "\x91" => "\x{2018}",
127 "\x92" => "\x{2019}",
128 "\x93" => "\x{201C}",
129 "\x94" => "\x{201D}",
130 "\x95" => "\x{2022}",
131 "\x96" => "\x{2013}",
132 "\x97" => "\x{2014}",
133 "\x98" => "\x{02DC}",
134 "\x99" => "\x{2122}",
135 "\x9A" => "\x{0161}",
136 "\x9B" => "\x{203A}",
137 "\x9C" => "\x{0153}",
138 "\x9D" => undef,
139 "\x9E" => "\x{017E}",
140 "\x9F" => "\x{0178}",
141 "\xA0" => "\xA0", "\xA1" => "\xA1", "\xA2" => "\xA2", "\xA3" => "\xA3",
142 "\xA4" => "\xA4", "\xA5" => "\xA5", "\xA6" => "\xA6", "\xA7" => "\xA7",
143 "\xA8" => "\xA8", "\xA9" => "\xA9", "\xAA" => "\xAA", "\xAB" => "\xAB",
144 "\xAC" => "\xAC", "\xAD" => "\xAD", "\xAE" => "\xAE", "\xAF" => "\xAF",
145 "\xB0" => "\xB0", "\xB1" => "\xB1", "\xB2" => "\xB2", "\xB3" => "\xB3",
146 "\xB4" => "\xB4", "\xB5" => "\xB5", "\xB6" => "\xB6", "\xB7" => "\xB7",
147 "\xB8" => "\xB8", "\xB9" => "\xB9", "\xBA" => "\xBA", "\xBB" => "\xBB",
148 "\xBC" => "\xBC", "\xBD" => "\xBD", "\xBE" => "\xBE", "\xBF" => "\xBF",
149 "\xC0" => "\xC0", "\xC1" => "\xC1", "\xC2" => "\xC2", "\xC3" => "\xC3",
150 "\xC4" => "\xC4", "\xC5" => "\xC5", "\xC6" => "\xC6", "\xC7" => "\xC7",
151 "\xC8" => "\xC8", "\xC9" => "\xC9", "\xCA" => "\xCA", "\xCB" => "\xCB",
152 "\xCC" => "\xCC", "\xCD" => "\xCD", "\xCE" => "\xCE", "\xCF" => "\xCF",
153 "\xD0" => "\xD0", "\xD1" => "\xD1", "\xD2" => "\xD2", "\xD3" => "\xD3",
154 "\xD4" => "\xD4", "\xD5" => "\xD5", "\xD6" => "\xD6", "\xD7" => "\xD7",
155 "\xD8" => "\xD8", "\xD9" => "\xD9", "\xDA" => "\xDA", "\xDB" => "\xDB",
156 "\xDC" => "\xDC", "\xDD" => "\xDD", "\xDE" => "\xDE", "\xDF" => "\xDF",
157 "\xE0" => "\xE0", "\xE1" => "\xE1", "\xE2" => "\xE2", "\xE3" => "\xE3",
158 "\xE4" => "\xE4", "\xE5" => "\xE5", "\xE6" => "\xE6", "\xE7" => "\xE7",
159 "\xE8" => "\xE8", "\xE9" => "\xE9", "\xEA" => "\xEA", "\xEB" => "\xEB",
160 "\xEC" => "\xEC", "\xED" => "\xED", "\xEE" => "\xEE", "\xEF" => "\xEF",
161 "\xF0" => "\xF0", "\xF1" => "\xF1", "\xF2" => "\xF2", "\xF3" => "\xF3",
162 "\xF4" => "\xF4", "\xF5" => "\xF5", "\xF6" => "\xF6", "\xF7" => "\xF7",
163 "\xF8" => "\xF8", "\xF9" => "\xF9", "\xFA" => "\xFA", "\xFB" => "\xFB",
164 "\xFC" => "\xFC", "\xFD" => "\xFD", "\xFE" => "\xFE", "\xFF" => "\xFF",
165 },
166 ## NOTE: Treated as |windows-1252|. Properties of this charset
167 ## should be consistent with those of that charset.
168 });
169
170 $Charset->{'iso-8859-1'}
171 = $IANACharset->{'iso_8859-1:1987'}
172 = $IANACharset->{'iso-ir-100'}
173 = $IANACharset->{'iso_8859-1'}
174 = $IANACharset->{'iso-8859-1'}
175 = $IANACharset->{'latin1'}
176 = $IANACharset->{'l1'}
177 = $IANACharset->{'ibm819'}
178 = $IANACharset->{'cp819'}
179 = $IANACharset->{'csisolatin1'}
180 = $HTMLCharset->{'iso885911987'}
181 = $HTMLCharset->{'isoir100'}
182 = $HTMLCharset->{'iso88591'}
183 = __PACKAGE__->new ({
184 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
185 iana_names => {
186 'iso_8859-1:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
187 'iso-ir-100' => REGISTERED_CHARSET_NAME,
188 'iso_8859-1' => REGISTERED_CHARSET_NAME,
189 'iso-8859-1' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
190 'latin1' => REGISTERED_CHARSET_NAME,
191 'l1' => REGISTERED_CHARSET_NAME,
192 'ibm819' => REGISTERED_CHARSET_NAME,
193 'cp819' => REGISTERED_CHARSET_NAME,
194 'csisolatin1' => REGISTERED_CHARSET_NAME,
195 },
196 perl_names => {
197 'web-latin1' => UNREGISTERED_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
198 ERROR_REPORTING_ENCODING_IMPL,
199 'cp1252' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
200 },
201 fallback => {
202 "\x80" => "\x{20AC}",
203 "\x81" => undef,
204 "\x82" => "\x{201A}",
205 "\x83" => "\x{0192}",
206 "\x84" => "\x{201E}",
207 "\x85" => "\x{2026}",
208 "\x86" => "\x{2020}",
209 "\x87" => "\x{2021}",
210 "\x88" => "\x{02C6}",
211 "\x89" => "\x{2030}",
212 "\x8A" => "\x{0160}",
213 "\x8B" => "\x{2039}",
214 "\x8C" => "\x{0152}",
215 "\x8D" => undef,
216 "\x8E" => "\x{017D}",
217 "\x8F" => undef,
218 "\x90" => undef,
219 "\x91" => "\x{2018}",
220 "\x92" => "\x{2019}",
221 "\x93" => "\x{201C}",
222 "\x94" => "\x{201D}",
223 "\x95" => "\x{2022}",
224 "\x96" => "\x{2013}",
225 "\x97" => "\x{2014}",
226 "\x98" => "\x{02DC}",
227 "\x99" => "\x{2122}",
228 "\x9A" => "\x{0161}",
229 "\x9B" => "\x{203A}",
230 "\x9C" => "\x{0153}",
231 "\x9D" => undef,
232 "\x9E" => "\x{017E}",
233 "\x9F" => "\x{0178}",
234 },
235 ## NOTE: Treated as |windows-1252|. Properties of this charset
236 ## should be consistent with those of that charset.
237 });
238
239 $Charset->{'iso-8859-2'}
240 = $IANACharset->{'iso_8859-2:1987'}
241 = $IANACharset->{'iso-ir-101'}
242 = $IANACharset->{'iso_8859-2'}
243 = $IANACharset->{'iso-8859-2'}
244 = $IANACharset->{'latin2'}
245 = $IANACharset->{'l2'}
246 = $IANACharset->{'csisolatin2'}
247 = $HTMLCharset->{'iso885921987'}
248 = $HTMLCharset->{'isoir101'}
249 = $HTMLCharset->{'iso88592'}
250 = __PACKAGE__->new ({
251 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
252 iana_names => {
253 'iso_8859-2:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
254 'iso-ir-101' => REGISTERED_CHARSET_NAME,
255 'iso_8859-2' => REGISTERED_CHARSET_NAME,
256 'iso-8859-2' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
257 'latin2' => REGISTERED_CHARSET_NAME,
258 'l2' => REGISTERED_CHARSET_NAME,
259 'csisolatin2' => REGISTERED_CHARSET_NAME,
260 },
261 });
262
263 $Charset->{'iso-8859-3'}
264 = $IANACharset->{'iso_8859-3:1988'}
265 = $IANACharset->{'iso-ir-109'}
266 = $IANACharset->{'iso_8859-3'}
267 = $IANACharset->{'iso-8859-3'}
268 = $IANACharset->{'latin3'}
269 = $IANACharset->{'l3'}
270 = $IANACharset->{'csisolatin3'}
271 = $HTMLCharset->{'iso885931988'}
272 = $HTMLCharset->{'isoir109'}
273 = $HTMLCharset->{'iso88593'}
274 = __PACKAGE__->new ({
275 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
276 iana_names => {
277 'iso_8859-3:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
278 'iso-ir-109' => REGISTERED_CHARSET_NAME,
279 'iso_8859-3' => REGISTERED_CHARSET_NAME,
280 'iso-8859-3' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
281 'latin3' => REGISTERED_CHARSET_NAME,
282 'l3' => REGISTERED_CHARSET_NAME,
283 'csisolatin3' => REGISTERED_CHARSET_NAME,
284 },
285 error_level => {
286 'unassigned-code-point-error' => 'iso_shall',
287 ## NOTE: I didn't check whether ISO/IEC 8859-3 prohibits the use of
288 ## unassigned code points, but ECMA-94:1986 (whose content is considered
289 ## equivalent to ISO 8859/1-4) disallows their use.
290 },
291 });
292
293 $Charset->{'iso-8859-4'}
294 = $IANACharset->{'iso_8859-4:1988'}
295 = $IANACharset->{'iso-ir-110'}
296 = $IANACharset->{'iso_8859-4'}
297 = $IANACharset->{'iso-8859-4'}
298 = $IANACharset->{'latin4'}
299 = $IANACharset->{'l4'}
300 = $IANACharset->{'csisolatin4'}
301 = $HTMLCharset->{'iso885941988'}
302 = $HTMLCharset->{'isoir110'}
303 = $HTMLCharset->{'iso88594'}
304 = __PACKAGE__->new ({
305 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
306 iana_names => {
307 'iso_8859-4:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
308 'iso-ir-110' => REGISTERED_CHARSET_NAME,
309 'iso_8859-4' => REGISTERED_CHARSET_NAME,
310 'iso-8859-4' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
311 'latin4' => REGISTERED_CHARSET_NAME,
312 'l4' => REGISTERED_CHARSET_NAME,
313 'csisolatin4' => REGISTERED_CHARSET_NAME,
314 },
315 error_level => {
316 'unassigned-code-point-error' => 'iso_shall',
317 ## NOTE: I didn't check whether ISO/IEC 8859-4 prohibits the use of
318 ## unassigned code points, but ECMA-94:1986 (whose content is considered
319 ## equivalent to ISO 8859/1-4) disallows their use.
320 },
321 });
322
323 $Charset->{'iso-8859-5'}
324 = $IANACharset->{'iso_8859-5:1988'}
325 = $IANACharset->{'iso-ir-144'}
326 = $IANACharset->{'iso_8859-5'}
327 = $IANACharset->{'iso-8859-5'}
328 = $IANACharset->{'cyrillic'}
329 = $IANACharset->{'csisolatincyrillic'}
330 = $HTMLCharset->{'iso885951988'}
331 = $HTMLCharset->{'isoir144'}
332 = $HTMLCharset->{'iso88595'}
333 = __PACKAGE__->new ({
334 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
335 iana_names => {
336 'iso_8859-5:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
337 'iso-ir-144' => REGISTERED_CHARSET_NAME,
338 'iso_8859-5' => REGISTERED_CHARSET_NAME,
339 'iso-8859-5' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
340 'cyrillic' => REGISTERED_CHARSET_NAME,
341 'csisolatincyrillic' => REGISTERED_CHARSET_NAME,
342 },
343 });
344
345 $Charset->{'iso-8859-6'}
346 = $IANACharset->{'iso_8859-6:1987'}
347 = $IANACharset->{'iso-ir-127'}
348 = $IANACharset->{'iso_8859-6'}
349 = $IANACharset->{'iso-8859-6'}
350 = $IANACharset->{'ecma-114'}
351 = $IANACharset->{'asmo-708'}
352 = $IANACharset->{'arabic'}
353 = $IANACharset->{'csisolatinarabic'}
354 = $HTMLCharset->{'iso885961987'}
355 = $HTMLCharset->{'isoir127'}
356 = $HTMLCharset->{'iso88596'}
357 = $HTMLCharset->{'ecma114'}
358 = $HTMLCharset->{'asmo708'}
359 = __PACKAGE__->new ({
360 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
361 ## NOTE: 3/0..3/9 have different semantics from U+0030..0039,
362 ## but have same character names (maybe).
363 ## NOTE: According to RFC 2046, charset left-hand half of "iso-8859-6"
364 ## is same as "us-ascii".
365 ## TODO: RFC 1345 def?
366 iana_names => {
367 'iso_8859-6:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
368 'iso-ir-127' => REGISTERED_CHARSET_NAME,
369 'iso_8859-6' => REGISTERED_CHARSET_NAME,
370 'iso-8859-6' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
371 'ecma-114' => REGISTERED_CHARSET_NAME,
372 'asmo-708' => REGISTERED_CHARSET_NAME,
373 'arabic' => REGISTERED_CHARSET_NAME,
374 'csisolatinarabic' => REGISTERED_CHARSET_NAME,
375 },
376 ## TODO: |error_level|
377 });
378
379 $Charset->{'iso-8859-7'}
380 = $IANACharset->{'iso_8859-7:1987'}
381 = $IANACharset->{'iso-ir-126'}
382 = $IANACharset->{'iso_8859-7'}
383 = $IANACharset->{'iso-8859-7'}
384 = $IANACharset->{'elot_928'}
385 = $IANACharset->{'ecma-118'}
386 = $IANACharset->{'greek'}
387 = $IANACharset->{'greek8'}
388 = $IANACharset->{'csisolatingreek'}
389 = $HTMLCharset->{'iso885971987'}
390 = $HTMLCharset->{'isoir126'}
391 = $HTMLCharset->{'iso88597'}
392 = $HTMLCharset->{'elot928'}
393 = $HTMLCharset->{'ecma118'}
394 = __PACKAGE__->new ({
395 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
396 iana_names => {
397 'iso_8859-7:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
398 'iso-ir-126' => REGISTERED_CHARSET_NAME,
399 'iso_8859-7' => REGISTERED_CHARSET_NAME,
400 'iso-8859-7' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
401 'elot_928' => REGISTERED_CHARSET_NAME,
402 'ecma-118' => REGISTERED_CHARSET_NAME,
403 'greek' => REGISTERED_CHARSET_NAME,
404 'greek8' => REGISTERED_CHARSET_NAME,
405 'csisolatingreek' => REGISTERED_CHARSET_NAME,
406 },
407 ## TODO: |error_level|
408 });
409
410 $Charset->{'iso-8859-8'}
411 = $IANACharset->{'iso_8859-8:1988'}
412 = $IANACharset->{'iso-ir-138'}
413 = $IANACharset->{'iso_8859-8'}
414 = $IANACharset->{'iso-8859-8'}
415 = $IANACharset->{'hebrew'}
416 = $IANACharset->{'csisolatinhebrew'}
417 = $HTMLCharset->{'iso885981988'}
418 = $HTMLCharset->{'isoir138'}
419 = $HTMLCharset->{'iso88598'}
420 = __PACKAGE__->new ({
421 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
422 iana_names => {
423 'iso_8859-8:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
424 'iso-ir-138' => REGISTERED_CHARSET_NAME,
425 'iso_8859-8' => REGISTERED_CHARSET_NAME,
426 'iso-8859-8' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
427 'hebrew' => REGISTERED_CHARSET_NAME,
428 'csisolatinhebrew' => REGISTERED_CHARSET_NAME,
429 },
430 ## TODO: |error_level|
431 });
432
433 $Charset->{'iso-8859-9'}
434 = $IANACharset->{'iso_8859-9:1989'}
435 = $IANACharset->{'iso-ir-148'}
436 = $IANACharset->{'iso_8859-9'}
437 = $IANACharset->{'iso-8859-9'}
438 = $IANACharset->{'latin5'}
439 = $IANACharset->{'l5'}
440 = $IANACharset->{'csisolatin5'}
441 = $HTMLCharset->{'iso885991989'}
442 = $HTMLCharset->{'isoir148'}
443 = $HTMLCharset->{'iso88599'}
444 = __PACKAGE__->new ({
445 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
446 iana_names => {
447 'iso_8859-9:1989' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
448 'iso-ir-148' => REGISTERED_CHARSET_NAME,
449 'iso_8859-9' => REGISTERED_CHARSET_NAME,
450 'iso-8859-9' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
451 'latin5' => REGISTERED_CHARSET_NAME,
452 'l5' => REGISTERED_CHARSET_NAME,
453 'csisolatin5' => REGISTERED_CHARSET_NAME,
454 },
455 perl_names => {
456 'web-latin5' => UNREGISTERED_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
457 ERROR_REPORTING_ENCODING_IMPL,
458 'cp1254' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
459 },
460 fallback => {
461 "\x80" => "\x{20AC}",
462 "\x81" => undef,
463 "\x82" => "\x{201A}",
464 "\x83" => "\x{0192}",
465 "\x84" => "\x{201E}",
466 "\x85" => "\x{2026}",
467 "\x86" => "\x{2020}",
468 "\x87" => "\x{2021}",
469 "\x88" => "\x{02C6}",
470 "\x89" => "\x{2030}",
471 "\x8A" => "\x{0160}",
472 "\x8B" => "\x{2039}",
473 "\x8C" => "\x{0152}",
474 "\x8D" => undef,
475 "\x8E" => undef,
476 "\x8F" => undef,
477 "\x90" => undef,
478 "\x91" => "\x{2018}",
479 "\x92" => "\x{2019}",
480 "\x93" => "\x{201C}",
481 "\x94" => "\x{201D}",
482 "\x95" => "\x{2022}",
483 "\x96" => "\x{2013}",
484 "\x97" => "\x{2014}",
485 "\x98" => "\x{02DC}",
486 "\x99" => "\x{2122}",
487 "\x9A" => "\x{0161}",
488 "\x9B" => "\x{203A}",
489 "\x9C" => "\x{0153}",
490 "\x9D" => undef,
491 "\x9E" => undef,
492 "\x9F" => "\x{0178}",
493 },
494 ## NOTE: Treated as |windows-1254|. Properties of this charset
495 ## should be consistent with those of that charset.
496 });
497
498 $Charset->{'iso-8859-10'}
499 = $IANACharset->{'iso-8859-10'}
500 = $IANACharset->{'iso-ir-157'}
501 = $IANACharset->{'l6'}
502 = $IANACharset->{'iso_8859-10:1992'}
503 = $IANACharset->{'csisolatin6'}
504 = $IANACharset->{'latin6'}
505 = $HTMLCharset->{'iso885910'}
506 = $HTMLCharset->{'isoir157'}
507 = $HTMLCharset->{'iso8859101992'}
508 = __PACKAGE__->new ({
509 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
510 iana_names => {
511 'iso-8859-10' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
512 'iso-ir-157' => REGISTERED_CHARSET_NAME,
513 'l6' => REGISTERED_CHARSET_NAME,
514 'iso_8859-10:1992' => REGISTERED_CHARSET_NAME,
515 'csisolatin6' => REGISTERED_CHARSET_NAME,
516 'latin6' => REGISTERED_CHARSET_NAME,
517 },
518 ## TODO: |error_level|
519 });
520
521 $Charset->{'iso_6937-2-add'}
522 = $IANACharset->{'iso_6937-2-add'}
523 = $IANACharset->{'iso-ir-142'}
524 = $IANACharset->{'csisotextcomm'}
525 = $HTMLCharset->{'iso69372add'}
526 = $HTMLCharset->{'isoir142'}
527 = __PACKAGE__->new ({
528 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
529 iana_names => {
530 'iso_6937-2-add' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
531 'iso-ir-142' => REGISTERED_CHARSET_NAME,
532 'csisotextcomm' => REGISTERED_CHARSET_NAME,
533 },
534 ## TODO: |error_level|
535 });
536
537 $Charset->{'jis_x0201'}
538 = $IANACharset->{'jis_x0201'}
539 = $IANACharset->{'x0201'}
540 = $IANACharset->{'cshalfwidthkatakana'}
541 = $HTMLCharset->{'jisx0201'}
542 = __PACKAGE__->new ({
543 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
544 iana_names => {
545 'jis_x0201' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
546 'x0201' => REGISTERED_CHARSET_NAME,
547 'cshalfwidthkatakana' => REGISTERED_CHARSET_NAME,
548 },
549 ## TODO: |error_level|
550 });
551
552 $Charset->{'jis_encoding'}
553 = $IANACharset->{'jis_encoding'}
554 = $IANACharset->{'csjisencoding'}
555 = $HTMLCharset->{'jisencoding'}
556 = __PACKAGE__->new ({
557 category => 0,
558 iana_names => {
559 'jis_encoding' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
560 'csjisencoding' => REGISTERED_CHARSET_NAME,
561 },
562 ## NOTE: What is this?
563 });
564
565 $Charset->{'shift_jis'}
566 = $IANACharset->{'shift_jis'}
567 = $IANACharset->{'ms_kanji'}
568 = $IANACharset->{'csshiftjis'}
569 = $HTMLCharset->{'shiftjis'}
570 = $HTMLCharset->{'mskanji'}
571 = __PACKAGE__->new ({
572 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
573 CHARSET_CATEGORY_MIME_TEXT,
574 iana_names => {
575 'shift_jis' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
576 'ms_kanji' => REGISTERED_CHARSET_NAME,
577 'csshiftjis' => REGISTERED_CHARSET_NAME,
578 },
579 perl_names => {
580 'shift-jis-1997' => UNREGISTERED_CHARSET_NAME |
581 SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
582 shiftjis => PRIMARY_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
583 ERROR_REPORTING_ENCODING_IMPL,
584 ## NOTE: Unicode mapping is wrong.
585 },
586 ## TODO: |error_level|
587 });
588
589 $Charset->{'x-sjis'}
590 = $IANACharset->{'x-sjis'}
591 = $HTMLCharset->{'xsjis'}
592 = __PACKAGE__->new ({
593 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
594 CHARSET_CATEGORY_ASCII_COMPAT,
595 iana_names => {
596 'x-sjis' => UNREGISTERED_CHARSET_NAME,
597 },
598 perl_names => {
599 'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
600 },
601 ## TODO: |error_level|
602 });
603
604 $Charset->{shift_jisx0213}
605 = $IANACharset->{shift_jisx0213}
606 = $HTMLCharset->{shiftjisx0213}
607 = __PACKAGE__->new ({
608 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
609 CHARSET_CATEGORY_MIME_TEXT,
610 iana_names => {
611 shift_jisx0213 => UNREGISTERED_CHARSET_NAME,
612 },
613 perl_names => {
614 #shift_jisx0213 (non-standard - i don't know its conformance)
615 'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
616 'shiftjis' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
617 },
618 ## TODO: |error_level|
619 });
620
621 $Charset->{'euc-jp'}
622 = $IANACharset->{'extended_unix_code_packed_format_for_japanese'}
623 = $IANACharset->{'cseucpkdfmtjapanese'}
624 = $IANACharset->{'euc-jp'}
625 = $HTMLCharset->{'extendedunixcodepackedformatforjapanese'}
626 = $HTMLCharset->{'cseucpkdfmtjapanese'}
627 = $HTMLCharset->{'eucjp'}
628 = __PACKAGE__->new ({
629 category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE |
630 CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_ASCII_COMPAT,
631 iana_names => {
632 'extended_unix_code_packed_format_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
633 'cseucpkdfmtjapanese' => REGISTERED_CHARSET_NAME,
634 'euc-jp' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
635 },
636 perl_names => {
637 'euc-jp-1997' => UNREGISTERED_CHARSET_NAME |
638 SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
639 ## NOTE: Though the IANA definition references the 1990 version
640 ## of EUC-JP, the 1997 version of JIS standard claims that the version
641 ## is same coded character set as the 1990 version, such that we
642 ## consider the EUC-JP 1990 version is same as the 1997 version.
643 'euc-jp' => PREFERRED_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
644 ERROR_REPORTING_ENCODING_IMPL,
645 ## NOTE: Unicode mapping is wrong.
646 },
647 ## TODO: |error_level|
648 });
649
650 $Charset->{'x-euc-jp'}
651 = $IANACharset->{'x-euc-jp'}
652 = $HTMLCharset->{'xeucjp'}
653 = __PACKAGE__->new ({
654 category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE |
655 CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_ASCII_COMPAT,
656 iana_names => {
657 'x-euc-jp' => UNREGISTERED_CHARSET_NAME,
658 },
659 perl_names => {
660 'euc-jp-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
661 'euc-jp' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
662 },
663 });
664
665 $Charset->{'extended_unix_code_fixed_width_for_japanese'}
666 = $IANACharset->{'extended_unix_code_fixed_width_for_japanese'}
667 = $IANACharset->{'cseucfixwidjapanese'}
668 = $HTMLCharset->{'extendedunixcodefixedwidthforjapanese'}
669 = __PACKAGE__->new ({
670 category => CHARSET_CATEGORY_BLOCK_SAFE,
671 iana_names => {
672 'extended_unix_code_fixed_width_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
673 'cseucfixwidjapanese' => REGISTERED_CHARSET_NAME,
674 },
675 ## TODO: |error_level|
676 });
677
678 ## TODO: ...
679
680 $Charset->{'euc-kr'}
681 = $IANACharset->{'euc-kr'}
682 = $IANACharset->{'cseuckr'}
683 = $HTMLCharset->{'euckr'}
684 = __PACKAGE__->new ({
685 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
686 iana_names => {
687 'euc-kr' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
688 'cseuckr' => REGISTERED_CHARSET_NAME,
689 },
690 perl_names => {
691 ## TODO: We need a parse error generating wrapper for the decoder.
692 'cp949' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
693 },
694 ## NOTE: |euc-kr| is handled as |windows-949|, such that properties
695 ## should be consistent with that encoding's properties.
696 });
697
698 $Charset->{'iso-2022-jp'}
699 = $IANACharset->{'iso-2022-jp'}
700 = $IANACharset->{'csiso2022jp'}
701 = $IANACharset->{'iso2022jp'}
702 = $IANACharset->{'junet-code'}
703 = $HTMLCharset->{'iso2022jp'}
704 = $HTMLCharset->{'junetcode'}
705 = __PACKAGE__->new ({
706 category => CHARSET_CATEGORY_MIME_TEXT,
707 iana_names => {
708 'iso-2022-jp' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
709 'csiso2022jp' => REGISTERED_CHARSET_NAME,
710 'iso2022jp' => UNREGISTERED_CHARSET_NAME,
711 'junet-code' => UNREGISTERED_CHARSET_NAME,
712 },
713 ## TODO: |error_level|
714 });
715
716 $Charset->{'iso-2022-jp-2'}
717 = $IANACharset->{'iso-2022-jp-2'}
718 = $IANACharset->{'csiso2022jp2'}
719 = $HTMLCharset->{'iso2022jp2'}
720 = __PACKAGE__->new ({
721 category => CHARSET_CATEGORY_MIME_TEXT,
722 iana_names => {
723 'iso-2022-jp-2' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
724 'csiso2022jp2' => REGISTERED_CHARSET_NAME,
725 },
726 ## TODO: |error_level|
727 });
728
729 ## TODO: ...
730
731 $IANACharset->{'gb_2312-80'} ## All keys in this chain share one info object.
732 = $IANACharset->{'iso-ir-58'}
733 = $IANACharset->{chinese} = $IANACharset->{csiso58gb231280} ## alias was declared in |iana_names| but had no key
734 = $HTMLCharset->{gb231280} ## HTML keys: punctuation-free forms of the names that contain punctuation
735 = $HTMLCharset->{isoir58}
736 = __PACKAGE__->new ({
737 ## NOTE: What is represented by this charset is unclear... I don't
738 ## understand what RFC 1345 describes...
739 category => 0,
740 iana_names => {
741 'gb_2312-80' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
742 'iso-ir-58' => REGISTERED_CHARSET_NAME,
743 'chinese' => REGISTERED_CHARSET_NAME,
744 'csiso58gb231280' => REGISTERED_CHARSET_NAME,
745 },
746 perl_names => {
747 ## TODO: GB2312->GBK Parse Error wrapper
748 'cp936' => FALLBACK_ENCODING_IMPL,
749 },
750 ## NOTE: |gb2312| is handled as |gbk|, such that properties should be
751 ## consistent.
752 });
753
754 ## TODO: ...
755
756 $Charset->{'utf-8'}
757 = $IANACharset->{'utf-8'}
758 = $IANACharset->{'x-utf-8'}
759 = $HTMLCharset->{'utf8'}
760 = $HTMLCharset->{'xutf8'}
761 = __PACKAGE__->new ({
762 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT |
763 CHARSET_CATEGORY_MIME_TEXT,
764 iana_names => {
765 'utf-8' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
766 ## NOTE: IANA name "utf-8" references RFC 3629. According to the RFC,
767 ## the definitive definition is the one specified in the Unicode Standard.
768 'x-utf-8' => UNREGISTERED_CHARSET_NAME,
769 ## NOTE: We treat |x-utf-8| as an alias of |utf-8|, since unlike
770 ## other charsets such as |x-sjis| or |x-euc-jp|, there is no major
771 ## variant of the UTF-8 encoding.
772 ## TODO: We might need to reconsider this policy, since
773 ## there are in fact UTF-8 variants, such as
774 ## Unicode's UTF-8, ISO/IEC 10646's UTF-8, UTF-8n, and
775 ## so on.
776 },
777 perl_names => {
778 'utf-8-strict' => PRIMARY_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
779 ERROR_REPORTING_ENCODING_IMPL,
780 ## NOTE: It does not support non-Unicode UCS characters (conforming).
781 ## It does detect illegal sequences (conforming).
782 ## It does not support surrogate pairs (conforming).
783 ## It does not support BOMs (non-conforming).
784 },
785 ## TODO: |error_level|
786 bom_pattern => qr/\xEF\xBB\xBF/,
787 });
788
789 $Charset->{'utf-8n'} ## All keys in this chain share one info object.
790 = $IANACharset->{'utf-8n'}
791 = $HTMLCharset->{'utf8n'} ## HTML key is the punctuation-free form (was |utf-8|, which still contains "-")
792 = __PACKAGE__->new ({
793 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT |
794 CHARSET_CATEGORY_ASCII_COMPAT,
795 iana_names => {
796 'utf-8n' => UNREGISTERED_CHARSET_NAME,
797 ## NOTE: Is there any normative definition for the charset?
798 ## What variant of UTF-8 should we use for the charset?
799 },
800 perl_names => {
801 'utf-8-strict' => PRIMARY_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
802 },
803 ## TODO: |error_level|
804 });
805
806 ## TODO: ...
807
808 $Charset->{'gbk'}
809 = $IANACharset->{'gbk'}
810 = $IANACharset->{'cp936'}
811 = $IANACharset->{'ms936'}
812 = $IANACharset->{'windows-936'}
813 = $HTMLCharset->{'windows936'}
814 = __PACKAGE__->new ({
815 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
816 iana_names => {
817 'gbk' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
818 'cp936' => REGISTERED_CHARSET_NAME,
819 'ms936' => REGISTERED_CHARSET_NAME,
820 'windows-936' => REGISTERED_CHARSET_NAME,
821 },
822 ## TODO: |error_level|
823 iana_status => STATUS_COMMON | STATUS_OBSOLETE,
824 });
825
826 $Charset->{'gb18030'}
827 = $IANACharset->{'gb18030'}
828 = __PACKAGE__->new ({
829 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
830 iana_names => {
831 'gb18030' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
832 },
833 iana_status => STATUS_COMMON,
834 mime_text_suitable => 1,
835 });
836
837 ## TODO: ...
838
839 $Charset->{'utf-16be'}
840 = $IANACharset->{'utf-16be'}
841 = $HTMLCharset->{'utf16be'}
842 = __PACKAGE__->new ({
843 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
844 iana_names => {
845 'utf-16be' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
846 },
847 ## TODO: |error_level|
848 });
849
850 $Charset->{'utf-16le'}
851 = $IANACharset->{'utf-16le'}
852 = $HTMLCharset->{'utf16le'}
853 = __PACKAGE__->new ({
854 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
855 iana_names => {
856 'utf-16le' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
857 },
858 ## TODO: |error_level|
859 });
860
## UTF-16.  The same object is reachable by canonical, IANA and HTML
## names.
{
  my $info = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
    iana_names => {
      'utf-16' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    },
    ## TODO: |error_level|
  });

  $HTMLCharset->{'utf16'} = $info;
  $IANACharset->{'utf-16'} = $info;
  $Charset->{'utf-16'} = $info;
}
871
872 ## TODO: ...
873
## Windows-31J.  One object shared by the canonical table, both IANA
## names and the HTML name.
{
  my $info = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_SJIS
        | CHARSET_CATEGORY_BLOCK_SAFE
        | CHARSET_CATEGORY_MIME_TEXT,
    iana_names => {
      'windows-31j' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
      'cswindows31j' => REGISTERED_CHARSET_NAME,
    },
    iana_status => STATUS_LIMITED_USE, # maybe
    ## TODO: |error_level|
  });

  $HTMLCharset->{'windows31j'} = $info;
  $IANACharset->{$_} = $info for 'cswindows31j', 'windows-31j';
  $Charset->{'windows-31j'} = $info;
}
888
## GB2312.  Registered in the canonical table and under both IANA names;
## the only Perl encoding listed for it is the |cp936| fallback.
{
  my $info = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE
        | CHARSET_CATEGORY_MIME_TEXT
        | CHARSET_CATEGORY_ASCII_COMPAT,
    iana_names => {
      'gb2312' => PRIMARY_CHARSET_NAME
          | PREFERRED_CHARSET_NAME
          | REGISTERED_CHARSET_NAME,
      'csgb2312' => REGISTERED_CHARSET_NAME,
    },
    perl_names => {
      ## TODO: GB2312->GBK Parse Error wrapper
      'cp936' => FALLBACK_ENCODING_IMPL,
    },
    ## NOTE: |gb2312| is handled as |gbk|, such that properties should be
    ## consistent.
  });

  $IANACharset->{$_} = $info for 'csgb2312', 'gb2312';
  $Charset->{'gb2312'} = $info;
}
906
## Big5.  One object shared by the canonical table, the IANA names
## (including the unregistered |x-x-big5|) and the HTML name.
{
  my $info = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
    iana_names => {
      'big5' => PRIMARY_CHARSET_NAME
          | PREFERRED_CHARSET_NAME
          | REGISTERED_CHARSET_NAME,
      'csbig5' => REGISTERED_CHARSET_NAME,
      'x-x-big5' => UNREGISTERED_CHARSET_NAME,
        ## NOTE: In HTML5, |x-x-big5| is defined as an alias of |big5|.
        ## According to that spec, if there is any difference between
        ## input and replacement encodings, the result is parse error.
        ## However, since there is no formal definition for |x-x-big5|
        ## charset, we cannot raise such errors.
    },
    ## TODO: |error_level|
  });

  $HTMLCharset->{'xxbig5'} = $info;
  $IANACharset->{$_} = $info for 'x-x-big5', 'csbig5', 'big5';
  $Charset->{'big5'} = $info;
}
926
927 ## TODO: ...
928
## Big5-HKSCS.  The same object is reachable by canonical, IANA and HTML
## names.
{
  my $info = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
    iana_names => {
      'big5-hkscs' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    },
    ## TODO: |error_level|
  });

  $HTMLCharset->{'big5hkscs'} = $info;
  $IANACharset->{'big5-hkscs'} = $info;
  $Charset->{'big5-hkscs'} = $info;
}
939
940 ## TODO: ...
941
## Windows-1252.  The same object is reachable by canonical, IANA and
## HTML names.
{
  my $info = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE
        | CHARSET_CATEGORY_ASCII_COMPAT
        | CHARSET_CATEGORY_MIME_TEXT,
    iana_names => {
      'windows-1252' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    },
    ## TODO: Check whether use of 0x81 is conforming or not...
  });

  $HTMLCharset->{'windows1252'} = $info;
  $IANACharset->{'windows-1252'} = $info;
  $Charset->{'windows-1252'} = $info;
}
953
## Windows-1253.  The same object is reachable by canonical, IANA and
## HTML names.
{
  my $info = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE
        | CHARSET_CATEGORY_ASCII_COMPAT
        | CHARSET_CATEGORY_MIME_TEXT,
    iana_names => {
      'windows-1253' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    },
    ## TODO: Check whether use of 0x81 is conforming or not...
  });

  $HTMLCharset->{'windows1253'} = $info;
  $IANACharset->{'windows-1253'} = $info;
  $Charset->{'windows-1253'} = $info;
}
965
## Windows-1254.  The same object is reachable by canonical, IANA and
## HTML names.
{
  my $info = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE
        | CHARSET_CATEGORY_ASCII_COMPAT
        | CHARSET_CATEGORY_MIME_TEXT,
    iana_names => {
      'windows-1254' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    },
    ## TODO: Check whether use of 0x81 is conforming or not...
  });

  $HTMLCharset->{'windows1254'} = $info;
  $IANACharset->{'windows-1254'} = $info;
  $Charset->{'windows-1254'} = $info;
}
977
978 ## TODO: ...
979
## TIS-620.  The same object is reachable by canonical, IANA and HTML
## names.
{
  my $info = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
    iana_names => {
      'tis-620' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    },
    perl_names => {
      'web-tis-620' => UNREGISTERED_CHARSET_NAME
          | ERROR_REPORTING_ENCODING_IMPL,
      'windows-874' => FALLBACK_ENCODING_IMPL
          | ERROR_REPORTING_ENCODING_IMPL,
    },
    fallback => {
      ## Octets 0x81-0x9F map to |undef| unless overridden by one of
      ## the explicit entries that follow (later pairs win).
      (map { (chr ($_) => undef) } 0x81 .. 0x9F),
      "\x80" => "\x{20AC}",
      "\x85" => "\x{2026}",
      "\x91" => "\x{2018}",
      "\x92" => "\x{2019}",
      "\x93" => "\x{201C}",
      "\x94" => "\x{201D}",
      "\x95" => "\x{2022}",
      "\x96" => "\x{2013}",
      "\x97" => "\x{2014}",
      "\xA0" => "\xA0",
    },
    ## NOTE: |tis-620| is treated as |windows-874|, so ensure that
    ## they are consistent.
  });

  $HTMLCharset->{'tis620'} = $info;
  $IANACharset->{'tis-620'} = $info;
  $Charset->{'tis-620'} = $info;
}
1013
## ISO-8859-11.  The same object is reachable by canonical, IANA and
## HTML names.
{
  my $info = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
    iana_names => {
      'iso-8859-11' => UNREGISTERED_CHARSET_NAME,
        ## NOTE: The Web Thai encoding, i.e. windows-874.
    },
    perl_names => {
      'web-thai' => UNREGISTERED_CHARSET_NAME
          | ERROR_REPORTING_ENCODING_IMPL,
      'windows-874' => FALLBACK_ENCODING_IMPL
          | ERROR_REPORTING_ENCODING_IMPL,
    },
    fallback => {
      ## Octets 0x81-0x9F map to |undef| unless overridden by one of
      ## the explicit entries that follow (later pairs win).
      (map { (chr ($_) => undef) } 0x81 .. 0x9F),
      "\x80" => "\x{20AC}",
      "\x85" => "\x{2026}",
      "\x91" => "\x{2018}",
      "\x92" => "\x{2019}",
      "\x93" => "\x{201C}",
      "\x94" => "\x{201D}",
      "\x95" => "\x{2022}",
      "\x96" => "\x{2013}",
      "\x97" => "\x{2014}",
    },
    ## NOTE: |iso-8859-11| is treated as |windows-874|, so ensure that
    ## they are consistent.
  });

  $HTMLCharset->{'iso885911'} = $info;
  $IANACharset->{'iso-8859-11'} = $info;
  $Charset->{'iso-8859-11'} = $info;
}
1047
## Windows-874.  The same object is reachable by canonical, IANA and
## HTML names.
{
  my $info = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
    iana_names => {
      'windows-874' => UNREGISTERED_CHARSET_NAME,
    },
    perl_names => {
      'windows-874' => REGISTERED_CHARSET_NAME
          | ERROR_REPORTING_ENCODING_IMPL,
    },
    ## TODO: |error_level|
  });

  $HTMLCharset->{'windows874'} = $info;
  $IANACharset->{'windows-874'} = $info;
  $Charset->{'windows-874'} = $info;
}
1061
## Windows-949.  Registered under its IANA and HTML names only; this
## statement adds no entry to the canonical |$Charset| table.
{
  my $info = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
    iana_names => {
      'windows-949' => UNREGISTERED_CHARSET_NAME,
    },
    perl_names => {
      'cp949' => PREFERRED_CHARSET_NAME
          | NONCONFORMING_ENCODING_IMPL
          | ERROR_REPORTING_ENCODING_IMPL,
        ## TODO: Is this implementation conforming?
    },
    ## NOTE: |error_level| is same as default, since we can't find any formal
    ## definition for this charset.
  });

  $HTMLCharset->{'windows949'} = $info;
  $IANACharset->{'windows-949'} = $info;
}
1077
## Constructor.  Blesses the given hash reference itself (no copy is
## made) into the invoking class and returns it.
sub new ($$) {
  my ($class, $properties) = @_;
  return bless $properties, $class;
} # new
1081
## NOTE: A class method
##
## Returns the charset object for a charset name, matching it ASCII
## case-insensitively and ignoring ASCII whitespace and punctuation.
##
## Lookup order:
##   1. |$HTMLCharset| by the punctuation-stripped name;
##   2. |$IANACharset| by the punctuation-stripped name;
##   3. |$IANACharset| by the lowercased name with punctuation intact.
## If nothing is found, a new object flagged |UNREGISTERED_CHARSET_NAME|
## is created and cached under both the IANA and the HTML key.
##
## Step 3 prevents an object already cached under the exact IANA name
## (for example one created by an earlier |get_by_iana_name| call) from
## being silently replaced by a fresh object for the same name.
sub get_by_html_name ($$) {
  my $name = $_[1];
  $name =~ tr/A-Z/a-z/; ## ASCII case-insensitive
  my $iana_name = $name;
  $name =~ s/[\x09-\x0D\x20-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]//g;
      ## NOTE: U+000B is included.

  my $known = $HTMLCharset->{$name} || $IANACharset->{$name};
  return $known if $known;

  ## Reuse an object cached under the exact IANA name rather than
  ## overwriting it; only create a new one if none exists.
  my $info = $IANACharset->{$iana_name};
  unless ($info) {
    $info = __PACKAGE__->new ({
      iana_names => {
        $iana_name => UNREGISTERED_CHARSET_NAME,
      },
    });
    $IANACharset->{$iana_name} = $info;
  }

  ## Remember the HTML form so that later lookups hit step 1.
  $HTMLCharset->{$name} = $info;
  return $info;
} # get_by_html_name
1099
## NOTE: A class method
##
## Returns the charset object cached in |$IANACharset| for the given
## name, compared ASCII case-insensitively.  For a name with no entry, a
## new object flagged |UNREGISTERED_CHARSET_NAME| is created, cached
## under the lowercased name and returned.
sub get_by_iana_name ($$) {
  my (undef, $name) = @_;
  $name =~ tr/A-Z/a-z/; ## ASCII case-insensitive

  return $IANACharset->{$name} ||= __PACKAGE__->new ({
    iana_names => {$name => UNREGISTERED_CHARSET_NAME},
  });
} # get_by_iana_name
1113
## Builds a decode handle object for this charset around |$byte_stream|.
##
## Options (all optional): |byte_buffer| (reference to a string whose
## current value seeds the handle's byte buffer), |onerror| (code
## reference), |level| (hash reference of error levels).  All options
## are also forwarded to |get_perl_encoding|.
##
## Returns a two-element list: the handle and a status bit set.  When no
## usable encoding implementation is found, returns |(undef, 0)|.
sub get_decode_handle ($$;%) {
  my ($self, $byte_stream, %opt) = @_;

  my $char_buffer = '';
  my $obj = {
    category => $self->{category},
    char_buffer => \$char_buffer,
    char_buffer_pos => 0,
    character_queue => [],
    filehandle => $byte_stream,
    charset => '', ## TODO: We set a charset name for input_encoding (when we get identify-by-URI nonsense away)
    byte_buffer => $opt{byte_buffer} ? ${$opt{byte_buffer}} : '', ## TODO: ref, instead of value, should be used
    onerror => $opt{onerror} || sub {},
    #onerror_set
    level => $opt{level} || {
      must => 'm',
      charset_variant => 'm',
      charset_fact => 'm',
      iso_shall => 'm',
    },
    error_level => $self->{error_level} || {
      ## HTML5 charset name aliases
        ## NOTE: Use of code points in the variant whose definition differs
        ## from the original charset is a parse error in HTML5.  However,
        ## it does not affect the document conformance; the HTML5 spec
        ## does not define the conformance of the input stream against the
        ## charset in use.
      'fallback-char-error' => 'charset_variant',
      #'fallback-illegal-error' => 'charset_variant',
      'fallback-unassigned-error' => 'charset_variant',
        ## NOTE: An appropriate error level should be set for each charset
        ## (many charset prohibits use of unassigned code points).

      'illegal-octets-error' => 'charset_fact',
      'unassigned-code-point-error' => 'charset_fact',
      'invalid-state-error' => 'charset_fact',
    },
  };

  require Whatpm::Charset::DecodeHandle;

  ## ISO-2022-JP: choose the pair of encodings stored in the
  ## |state_2440| / |state_2442| slots.  The IANA flavour is tested
  ## first, then the XML flavour.  Failure to load the optional modules
  ## is tolerated; availability is checked afterwards.
  my @state_encoding;
  if ($self->{iana_names}->{'iso-2022-jp'}) {
    @state_encoding = ('gl-jis-1978', 'gl-jis-1983');
    eval {
      require Encode::GLJIS1978;
      require Encode::GLJIS1983;
    };
  } elsif ($self->{xml_names}->{'iso-2022-jp'}) {
    @state_encoding = ('gl-jis-1997-swapped', 'gl-jis-1997');
    eval {
      require Encode::GLJIS1997Swapped;
      require Encode::GLJIS1997;
    };
  }

  if (@state_encoding) {
    ## The state slots are set even if the dedicated handle cannot be
    ## used and we fall through to the generic handle below.
    $obj->{state_2440} = $state_encoding[0];
    $obj->{state_2442} = $state_encoding[1];
    $obj->{state} = 'state_2842';
    if (Encode::find_encoding ($obj->{state_2440}) and
        Encode::find_encoding ($obj->{state_2442})) {
      return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP'),
              PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME);
    }
  }

  ## Generic path: a handle backed by a Perl encoding.
  my ($encoding, $status) = $self->get_perl_encoding
      (%opt, allow_semiconforming => 1);
  return (undef, 0) unless $encoding;

  $obj->{perl_encoding_name} = $encoding->name;
  ## A charset not categorized as block-safe is reported as a fallback
  ## implementation.
  $status |= FALLBACK_ENCODING_IMPL
      unless ($self->{category} & CHARSET_CATEGORY_BLOCK_SAFE);
  $obj->{bom_pattern} = $self->{bom_pattern};
  $obj->{fallback} = $self->{fallback};
  return ((bless $obj, 'Whatpm::Charset::DecodeHandle::Encode'), $status);
} # get_decode_handle
1197
## Finds a Perl |Encode| encoding object that implements this charset.
##
## Options: |allow_error_reporting|, |allow_semiconforming|,
## |allow_fallback| (booleans; each widens the set of acceptable
## implementations listed in |perl_names|).
##
## Returns a two-element list: the encoding object and its status bit
## set.  Returns |(undef, 0)| if no acceptable implementation is found.
sub get_perl_encoding ($;%) {
  my $self = shift;
  my %opt = @_;

  require Encode;

  ## Loads the module that provides a non-core encoding, for the names
  ## known to need one; any other name is left to |Encode| itself.
  my %is_web_latin = (
    'web-latin1' => 1,
    'web-latin1-us-ascii' => 1,
    'web-latin5' => 1,
  );
  my $load_encode = sub {
    my ($encoding_name) = @_;
    if ($is_web_latin{$encoding_name}) {
      require Whatpm::Charset::WebLatin1;
    } elsif ($encoding_name eq 'web-thai' or
             $encoding_name eq 'web-tis-620') {
      require Whatpm::Charset::WebThai;
    } elsif ($encoding_name eq 'shift-jis-1997') {
      require Encode::ShiftJIS1997;
    } elsif ($encoding_name eq 'euc-jp-1997') {
      require Encode::EUCJP1997;
    }
  }; # $load_encode

  my $perl_names = $self->{perl_names} || {};
  my $no_semiconforming = !$opt{allow_semiconforming};

  ## 1. Error-reporting implementations, only on request.
  if ($opt{allow_error_reporting}) {
    my @reporting = grep {
      my $flags = $perl_names->{$_};
      ($flags & ERROR_REPORTING_ENCODING_IMPL)
          && !($flags & FALLBACK_ENCODING_IMPL)
          && !(($flags & SEMICONFORMING_ENCODING_IMPL) && $no_semiconforming);
    } keys %$perl_names;

    for my $candidate (@reporting) {
      $load_encode->($candidate);
      my $encoding = Encode::find_encoding ($candidate);
      ## NOTE: Only an exact name match is accepted, since
      ## |find_encoding| resolves e.g. |foobarlatin-1| to |iso-8859-1|,
      ## which might return wrong encoding object when a dedicated
      ## implementation not part of the standard Perl distribution is
      ## desired.
      return ($encoding, $perl_names->{$candidate})
          if $encoding and $encoding->name eq $candidate;
    }
  }

  ## 2. Ordinary implementations (neither error-reporting nor fallback).
  my @ordinary = grep {
    my $flags = $perl_names->{$_};
    !($flags & ERROR_REPORTING_ENCODING_IMPL)
        && !($flags & FALLBACK_ENCODING_IMPL)
        && !(($flags & SEMICONFORMING_ENCODING_IMPL) && $no_semiconforming);
  } keys %$perl_names;

  for my $candidate (@ordinary) {
    $load_encode->($candidate);
    my $encoding = Encode::find_encoding ($candidate);
    return ($encoding, $perl_names->{$candidate}) if $encoding;
  }

  ## 3. Fallbacks, only on request.
  if ($opt{allow_fallback}) {
    ## NOTE: We don't prefer semi-conforming implementations to
    ## non-conforming implementations, since semi-conforming implementations
    ## will never be conforming without assist of the callee, and in such
    ## cases the callee should set the |allow_semiconforming| option upon
    ## the invocation of the method anyway.
    my @fallback = grep {
      my $flags = $perl_names->{$_};
      ($flags & FALLBACK_ENCODING_IMPL)
          || ($flags & SEMICONFORMING_ENCODING_IMPL);
    } keys %$perl_names;

    for my $candidate (@fallback) {
      $load_encode->($candidate);
      my $encoding = Encode::find_encoding ($candidate);
      return ($encoding, $perl_names->{$candidate}) if $encoding;
    }

    ## Last resort: whatever |Encode| resolves an IANA name to.
    for my $iana_name (keys %{$self->{iana_names} or {}}) {
      $load_encode->($iana_name);
      my $encoding = Encode::find_encoding ($iana_name);
      return ($encoding, FALLBACK_ENCODING_IMPL) if $encoding;
    }
  }

  return (undef, 0);
} # get_perl_encoding
1281
## Picks one name from |iana_names| to represent this charset.
##
## A name flagged |PREFERRED_CHARSET_NAME| is returned as soon as it is
## seen.  Otherwise a |PRIMARY_CHARSET_NAME| name wins over the rest; a
## |REGISTERED_CHARSET_NAME| name replaces any other remembered name,
## while an unflagged name is remembered only if nothing else has been.
## Returns |undef| if the charset has no names.
sub get_iana_name ($) {
  my ($self) = @_;

  my ($primary, $other);
  for my $candidate (keys %{$self->{iana_names} or {}}) {
    my $flags = $self->{iana_names}->{$candidate};

    return $candidate if ($flags & PREFERRED_CHARSET_NAME);

    if ($flags & PRIMARY_CHARSET_NAME) {
      $primary = $candidate;
      next;
    }
    if ($flags & REGISTERED_CHARSET_NAME) {
      $other = $candidate;
      next;
    }
    $other ||= $candidate;
  }

  return $primary || $other;
} # get_iana_name
1302
## NOTE: A non-method function
##
## Tests whether the argument is 1 to 40 characters long and consists
## only of characters in the range U+0020..U+007E.
##
## NOTE: According to IANAREG, "The character set names may be up to 40
## characters taken from the printable characters of US-ASCII.  However,
## no distinction is made between use of upper and lower case letters.".
sub is_syntactically_valid_iana_charset_name ($) {
  my ($name) = @_;
  my $valid_name = qr/\A[\x20-\x7E]{1,40}\z/;
  return $name =~ $valid_name;
} # is_syntactically_valid_iana_charset_name
1312
1313 1;
1314 ## $Date: 2008/09/14 07:20:17 $
1315

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24