/[suikacvs]/messaging/manakai/lib/Message/Charset/Info.pm
Suika

Contents of /messaging/manakai/lib/Message/Charset/Info.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.14 - (hide annotations) (download)
Sun Sep 14 07:20:17 2008 UTC (16 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.13: +7 -17 lines
++ manakai/lib/Message/Charset/ChangeLog	14 Sep 2008 07:20:15 -0000
	* Info.pm: Remove the entry for Shift_JIS, since the ShiftJIS
	decodehandle class is merged with the Encode class.

2008-09-14  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Message::Charset::Info;
2     use strict;
3 wakaba 1.14 our $VERSION=do{my @r=(q$Revision: 1.13 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.8
5 wakaba 1.9 ## TODO: Certain encodings MUST NOT be implemented [HTML5].
6    
7     ## ISSUE: Should we convert an unassigned code point that has a trivial
8     ## Unicode mapping into U+FFFD? Or should we return the trivially mapped
9     ## Unicode character together with an error? (For example, should
10     ## Windows-1252's 0x81 be converted to U+FFFD or to U+0081?)
11 wakaba 1.1
12     sub UNREGISTERED_CHARSET_NAME () { 0b1 }
13 wakaba 1.4 ## Names for non-standard encodings/implementations for Perl encodings
14 wakaba 1.1 sub REGISTERED_CHARSET_NAME () { 0b10 }
15 wakaba 1.4 ## Names for standard encodings for Perl encodings
16     sub PRIMARY_CHARSET_NAME () { 0b100 }
17 wakaba 1.1 ## "Name:" field for IANA names
18 wakaba 1.4 ## Canonical name for Perl encodings
19     sub PREFERRED_CHARSET_NAME () { 0b1000 }
20 wakaba 1.1 ## "preferred MIME name" for IANA names
21    
22 wakaba 1.4 sub FALLBACK_ENCODING_IMPL () { 0b10000 }
23     ## For Perl encodings: Not a name of the encoding; the encoding
24     ## for the name might be useful as a fallback when the correct
25     ## encoding is not supported.
26     sub NONCONFORMING_ENCODING_IMPL () { FALLBACK_ENCODING_IMPL }
27     ## For Perl encodings: Not a conforming implementation of the encoding,
28     ## though it seems that the intention was to implement that encoding.
29 wakaba 1.6 sub SEMICONFORMING_ENCODING_IMPL () { 0b1000000 }
30     ## For Perl encodings: The implementation itself (returned by
31     ## |get_perl_encoding|) is non-conforming. The decode handle
32     ## implementation (returned by |get_decode_handle|) is conforming.
33 wakaba 1.4 sub ERROR_REPORTING_ENCODING_IMPL () { 0b100000 }
34     ## For Perl encodings: Support error reporting via |manakai_onerror|
35 wakaba 1.6 ## handler when the encoding is handled with decode handle.
36 wakaba 1.4
37 wakaba 1.2 ## iana_status
38     sub STATUS_COMMON () { 0b1 }
39     sub STATUS_LIMITED_USE () { 0b10 }
40     sub STATUS_OBSOLETE () { 0b100 }
41    
42 wakaba 1.5 ## category
43     sub CHARSET_CATEGORY_BLOCK_SAFE () { 0b1 }
44     ## NOTE: Stateless
45     sub CHARSET_CATEGORY_EUCJP () { 0b10 }
46     sub CHARSET_CATEGORY_SJIS () { 0b100 }
47 wakaba 1.8 sub CHARSET_CATEGORY_UTF16 () { 0b1000 }
48     ## NOTE: "A UTF-16 encoding" in HTML5.
49 wakaba 1.9 sub CHARSET_CATEGORY_ASCII_COMPAT () { 0b10000 }
50     ## NOTE: "superset of US-ASCII (specifically, ANSI_X3.4-1968)
51     ## for bytes in the range 0x09-0x0A, 0x0C-0x0D, 0x20-0x22, 0x26, 0x27,
52     ## 0x2C-0x3F, 0x41-0x5A, and 0x61-0x7A" [HTML5]
53     sub CHARSET_CATEGORY_EBCDIC () { 0b100000 }
54     ## NOTE: "based on EBCDIC" in HTML5.
55     sub CHARSET_CATEGORY_MIME_TEXT () { 0b1000000 }
56     ## NOTE: Suitable as MIME text.
57 wakaba 1.1
58     ## ISSUE: Shift_JIS is a superset of US-ASCII? ISO-2022-JP is?
59     ## ISSUE: 0x5F (_) should be added to the range?
60    
61 wakaba 1.9 my $Charset; ## TODO: this is obsolete.
62 wakaba 1.1
63     our $IANACharset;
64 wakaba 1.9 ## NOTE: Charset names used where IANA charset names are allowed, either
65     ## registered or not.
66     our $HTMLCharset;
67     ## NOTE: Same as the charset names in $IANACharset, except that all ASCII
68     ## punctuation is dropped; names already made only of letters/digits are omitted.
69 wakaba 1.1
70     $Charset->{'us-ascii'}
71     = $IANACharset->{'ansi_x3.4-1968'}
72     = $IANACharset->{'iso-ir-6'}
73     = $IANACharset->{'ansi_x3.4-1986'}
74     = $IANACharset->{'iso_646.irv:1991'}
75     = $IANACharset->{'ascii'}
76     = $IANACharset->{'iso646-us'}
77     = $IANACharset->{'us-ascii'}
78     = $IANACharset->{'us'}
79     = $IANACharset->{'ibm367'}
80     = $IANACharset->{'cp367'}
81     = $IANACharset->{'csascii'}
82 wakaba 1.9 = $HTMLCharset->{'ansix341968'}
83     = $HTMLCharset->{'isoir6'}
84     = $HTMLCharset->{'ansix341986'}
85     = $HTMLCharset->{'iso646irv1991'}
86     = $HTMLCharset->{'iso646us'}
87     = $HTMLCharset->{'usascii'}
88 wakaba 1.4 = __PACKAGE__->new ({
89 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
90 wakaba 1.1 iana_names => {
91 wakaba 1.4 'ansi_x3.4-1968' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
92 wakaba 1.1 'iso-ir-6' => REGISTERED_CHARSET_NAME,
93     'ansi_x3.4-1986' => REGISTERED_CHARSET_NAME,
94     'iso_646.irv:1991' => REGISTERED_CHARSET_NAME,
95     'ascii' => REGISTERED_CHARSET_NAME,
96     'iso646-us' => REGISTERED_CHARSET_NAME,
97 wakaba 1.4 'us-ascii' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
98 wakaba 1.1 'us' => REGISTERED_CHARSET_NAME,
99     'ibm367' => REGISTERED_CHARSET_NAME,
100     'cp367' => REGISTERED_CHARSET_NAME,
101     'csascii' => REGISTERED_CHARSET_NAME,
102     },
103 wakaba 1.9 perl_names => {
104     'web-latin1-us-ascii' => UNREGISTERED_CHARSET_NAME |
105     SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
106     'cp1252' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
107     },
108     fallback => {
109     "\x80" => "\x{20AC}",
110     "\x81" => undef,
111     "\x82" => "\x{201A}",
112     "\x83" => "\x{0192}",
113     "\x84" => "\x{201E}",
114     "\x85" => "\x{2026}",
115     "\x86" => "\x{2020}",
116     "\x87" => "\x{2021}",
117     "\x88" => "\x{02C6}",
118     "\x89" => "\x{2030}",
119     "\x8A" => "\x{0160}",
120     "\x8B" => "\x{2039}",
121     "\x8C" => "\x{0152}",
122     "\x8D" => undef,
123     "\x8E" => "\x{017D}",
124     "\x8F" => undef,
125     "\x90" => undef,
126     "\x91" => "\x{2018}",
127     "\x92" => "\x{2019}",
128     "\x93" => "\x{201C}",
129     "\x94" => "\x{201D}",
130     "\x95" => "\x{2022}",
131     "\x96" => "\x{2013}",
132     "\x97" => "\x{2014}",
133     "\x98" => "\x{02DC}",
134     "\x99" => "\x{2122}",
135     "\x9A" => "\x{0161}",
136     "\x9B" => "\x{203A}",
137     "\x9C" => "\x{0153}",
138     "\x9D" => undef,
139     "\x9E" => "\x{017E}",
140     "\x9F" => "\x{0178}",
141     "\xA0" => "\xA0", "\xA1" => "\xA1", "\xA2" => "\xA2", "\xA3" => "\xA3",
142     "\xA4" => "\xA4", "\xA5" => "\xA5", "\xA6" => "\xA6", "\xA7" => "\xA7",
143     "\xA8" => "\xA8", "\xA9" => "\xA9", "\xAA" => "\xAA", "\xAB" => "\xAB",
144     "\xAC" => "\xAC", "\xAD" => "\xAD", "\xAE" => "\xAE", "\xAF" => "\xAF",
145     "\xB0" => "\xB0", "\xB1" => "\xB1", "\xB2" => "\xB2", "\xB3" => "\xB3",
146     "\xB4" => "\xB4", "\xB5" => "\xB5", "\xB6" => "\xB6", "\xB7" => "\xB7",
147     "\xB8" => "\xB8", "\xB9" => "\xB9", "\xBA" => "\xBA", "\xBB" => "\xBB",
148     "\xBC" => "\xBC", "\xBD" => "\xBD", "\xBE" => "\xBE", "\xBF" => "\xBF",
149     "\xC0" => "\xC0", "\xC1" => "\xC1", "\xC2" => "\xC2", "\xC3" => "\xC3",
150     "\xC4" => "\xC4", "\xC5" => "\xC5", "\xC6" => "\xC6", "\xC7" => "\xC7",
151     "\xC8" => "\xC8", "\xC9" => "\xC9", "\xCA" => "\xCA", "\xCB" => "\xCB",
152     "\xCC" => "\xCC", "\xCD" => "\xCD", "\xCE" => "\xCE", "\xCF" => "\xCF",
153     "\xD0" => "\xD0", "\xD1" => "\xD1", "\xD2" => "\xD2", "\xD3" => "\xD3",
154     "\xD4" => "\xD4", "\xD5" => "\xD5", "\xD6" => "\xD6", "\xD7" => "\xD7",
155     "\xD8" => "\xD8", "\xD9" => "\xD9", "\xDA" => "\xDA", "\xDB" => "\xDB",
156     "\xDC" => "\xDC", "\xDD" => "\xDD", "\xDE" => "\xDE", "\xDF" => "\xDF",
157     "\xE0" => "\xE0", "\xE1" => "\xE1", "\xE2" => "\xE2", "\xE3" => "\xE3",
158     "\xE4" => "\xE4", "\xE5" => "\xE5", "\xE6" => "\xE6", "\xE7" => "\xE7",
159     "\xE8" => "\xE8", "\xE9" => "\xE9", "\xEA" => "\xEA", "\xEB" => "\xEB",
160     "\xEC" => "\xEC", "\xED" => "\xED", "\xEE" => "\xEE", "\xEF" => "\xEF",
161     "\xF0" => "\xF0", "\xF1" => "\xF1", "\xF2" => "\xF2", "\xF3" => "\xF3",
162     "\xF4" => "\xF4", "\xF5" => "\xF5", "\xF6" => "\xF6", "\xF7" => "\xF7",
163     "\xF8" => "\xF8", "\xF9" => "\xF9", "\xFA" => "\xFA", "\xFB" => "\xFB",
164     "\xFC" => "\xFC", "\xFD" => "\xFD", "\xFE" => "\xFE", "\xFF" => "\xFF",
165     },
166     ## NOTE: Treated as |windows-1252|. Properties of this charset
167     ## should be consistent with those of that charset.
168 wakaba 1.4 });
169 wakaba 1.1
170     $Charset->{'iso-8859-1'}
171     = $IANACharset->{'iso_8859-1:1987'}
172     = $IANACharset->{'iso-ir-100'}
173     = $IANACharset->{'iso_8859-1'}
174     = $IANACharset->{'iso-8859-1'}
175     = $IANACharset->{'latin1'}
176     = $IANACharset->{'l1'}
177     = $IANACharset->{'ibm819'}
178     = $IANACharset->{'cp819'}
179     = $IANACharset->{'csisolatin1'}
180 wakaba 1.9 = $HTMLCharset->{'iso885911987'}
181     = $HTMLCharset->{'isoir100'}
182     = $HTMLCharset->{'iso88591'}
183 wakaba 1.4 = __PACKAGE__->new ({
184 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
185 wakaba 1.1 iana_names => {
186 wakaba 1.4 'iso_8859-1:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
187 wakaba 1.1 'iso-ir-100' => REGISTERED_CHARSET_NAME,
188     'iso_8859-1' => REGISTERED_CHARSET_NAME,
189 wakaba 1.4 'iso-8859-1' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
190 wakaba 1.1 'latin1' => REGISTERED_CHARSET_NAME,
191     'l1' => REGISTERED_CHARSET_NAME,
192     'ibm819' => REGISTERED_CHARSET_NAME,
193     'cp819' => REGISTERED_CHARSET_NAME,
194     'csisolatin1' => REGISTERED_CHARSET_NAME,
195     },
196 wakaba 1.7 perl_names => {
197     'web-latin1' => UNREGISTERED_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
198     ERROR_REPORTING_ENCODING_IMPL,
199 wakaba 1.9 'cp1252' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
200 wakaba 1.7 },
201     fallback => {
202     "\x80" => "\x{20AC}",
203 wakaba 1.9 "\x81" => undef,
204 wakaba 1.7 "\x82" => "\x{201A}",
205     "\x83" => "\x{0192}",
206     "\x84" => "\x{201E}",
207     "\x85" => "\x{2026}",
208     "\x86" => "\x{2020}",
209     "\x87" => "\x{2021}",
210     "\x88" => "\x{02C6}",
211     "\x89" => "\x{2030}",
212     "\x8A" => "\x{0160}",
213     "\x8B" => "\x{2039}",
214     "\x8C" => "\x{0152}",
215 wakaba 1.9 "\x8D" => undef,
216 wakaba 1.7 "\x8E" => "\x{017D}",
217 wakaba 1.9 "\x8F" => undef,
218     "\x90" => undef,
219 wakaba 1.7 "\x91" => "\x{2018}",
220     "\x92" => "\x{2019}",
221     "\x93" => "\x{201C}",
222     "\x94" => "\x{201D}",
223     "\x95" => "\x{2022}",
224     "\x96" => "\x{2013}",
225     "\x97" => "\x{2014}",
226     "\x98" => "\x{02DC}",
227     "\x99" => "\x{2122}",
228     "\x9A" => "\x{0161}",
229     "\x9B" => "\x{203A}",
230     "\x9C" => "\x{0153}",
231 wakaba 1.9 "\x9D" => undef,
232 wakaba 1.7 "\x9E" => "\x{017E}",
233     "\x9F" => "\x{0178}",
234     },
235 wakaba 1.9 ## NOTE: Treated as |windows-1252|. Properties of this charset
236     ## should be consistent with those of that charset.
237 wakaba 1.4 });
238 wakaba 1.1
239 wakaba 1.2 $Charset->{'iso-8859-2'}
240     = $IANACharset->{'iso_8859-2:1987'}
241     = $IANACharset->{'iso-ir-101'}
242     = $IANACharset->{'iso_8859-2'}
243     = $IANACharset->{'iso-8859-2'}
244     = $IANACharset->{'latin2'}
245     = $IANACharset->{'l2'}
246     = $IANACharset->{'csisolatin2'}
247 wakaba 1.9 = $HTMLCharset->{'iso885921987'}
248     = $HTMLCharset->{'isoir101'}
249     = $HTMLCharset->{'iso88592'}
250 wakaba 1.4 = __PACKAGE__->new ({
251 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
252 wakaba 1.2 iana_names => {
253 wakaba 1.4 'iso_8859-2:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
254 wakaba 1.2 'iso-ir-101' => REGISTERED_CHARSET_NAME,
255     'iso_8859-2' => REGISTERED_CHARSET_NAME,
256 wakaba 1.4 'iso-8859-2' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
257 wakaba 1.2 'latin2' => REGISTERED_CHARSET_NAME,
258     'l2' => REGISTERED_CHARSET_NAME,
259     'csisolatin2' => REGISTERED_CHARSET_NAME,
260     },
261 wakaba 1.4 });
262 wakaba 1.2
263     $Charset->{'iso-8859-3'}
264     = $IANACharset->{'iso_8859-3:1988'}
265     = $IANACharset->{'iso-ir-109'}
266     = $IANACharset->{'iso_8859-3'}
267     = $IANACharset->{'iso-8859-3'}
268     = $IANACharset->{'latin3'}
269     = $IANACharset->{'l3'}
270     = $IANACharset->{'csisolatin3'}
271 wakaba 1.9 = $HTMLCharset->{'iso885931988'}
272     = $HTMLCharset->{'isoir109'}
273     = $HTMLCharset->{'iso88593'}
274 wakaba 1.4 = __PACKAGE__->new ({
275 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
276 wakaba 1.2 iana_names => {
277 wakaba 1.4 'iso_8859-3:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
278 wakaba 1.2 'iso-ir-109' => REGISTERED_CHARSET_NAME,
279     'iso_8859-3' => REGISTERED_CHARSET_NAME,
280 wakaba 1.4 'iso-8859-3' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
281 wakaba 1.2 'latin3' => REGISTERED_CHARSET_NAME,
282     'l3' => REGISTERED_CHARSET_NAME,
283     'csisolatin3' => REGISTERED_CHARSET_NAME,
284     },
285 wakaba 1.9 error_level => {
286     'unassigned-code-point-error' => 'iso_shall',
287     ## NOTE: I didn't check whether ISO/IEC 8859-3 prohibits the use of
288     ## unassigned code points, but ECMA-94:1986 (whose content is considered
289     ## equivalent to ISO 8859/1-4) disallows their use.
290     },
291 wakaba 1.4 });
292 wakaba 1.2
293     $Charset->{'iso-8859-4'}
294     = $IANACharset->{'iso_8859-4:1988'}
295     = $IANACharset->{'iso-ir-110'}
296     = $IANACharset->{'iso_8859-4'}
297     = $IANACharset->{'iso-8859-4'}
298     = $IANACharset->{'latin4'}
299     = $IANACharset->{'l4'}
300     = $IANACharset->{'csisolatin4'}
301 wakaba 1.9 = $HTMLCharset->{'iso885941988'}
302     = $HTMLCharset->{'isoir110'}
303     = $HTMLCharset->{'iso88594'}
304 wakaba 1.4 = __PACKAGE__->new ({
305 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
306 wakaba 1.2 iana_names => {
307 wakaba 1.4 'iso_8859-4:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
308 wakaba 1.2 'iso-ir-110' => REGISTERED_CHARSET_NAME,
309     'iso_8859-4' => REGISTERED_CHARSET_NAME,
310 wakaba 1.4 'iso-8859-4' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
311 wakaba 1.2 'latin4' => REGISTERED_CHARSET_NAME,
312     'l4' => REGISTERED_CHARSET_NAME,
313     'csisolatin4' => REGISTERED_CHARSET_NAME,
314     },
315 wakaba 1.9 error_level => {
316     'unassigned-code-point-error' => 'iso_shall',
317     ## NOTE: I didn't check whether ISO/IEC 8859-4 prohibits the use of
318     ## unassigned code points, but ECMA-94:1986 (whose content is considered
319     ## equivalent to ISO 8859/1-4) disallows their use.
320     },
321 wakaba 1.4 });
322 wakaba 1.2
323     $Charset->{'iso-8859-5'}
324     = $IANACharset->{'iso_8859-5:1988'}
325     = $IANACharset->{'iso-ir-144'}
326     = $IANACharset->{'iso_8859-5'}
327     = $IANACharset->{'iso-8859-5'}
328     = $IANACharset->{'cyrillic'}
329     = $IANACharset->{'csisolatincyrillic'}
330 wakaba 1.9 = $HTMLCharset->{'iso885951988'}
331     = $HTMLCharset->{'isoir144'}
332     = $HTMLCharset->{'iso88595'}
333 wakaba 1.4 = __PACKAGE__->new ({
334 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
335 wakaba 1.2 iana_names => {
336 wakaba 1.4 'iso_8859-5:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
337 wakaba 1.2 'iso-ir-144' => REGISTERED_CHARSET_NAME,
338     'iso_8859-5' => REGISTERED_CHARSET_NAME,
339 wakaba 1.4 'iso-8859-5' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
340 wakaba 1.2 'cyrillic' => REGISTERED_CHARSET_NAME,
341     'csisolatincyrillic' => REGISTERED_CHARSET_NAME,
342     },
343 wakaba 1.4 });
344 wakaba 1.2
345     $Charset->{'iso-8859-6'}
346     = $IANACharset->{'iso_8859-6:1987'}
347     = $IANACharset->{'iso-ir-127'}
348     = $IANACharset->{'iso_8859-6'}
349     = $IANACharset->{'iso-8859-6'}
350     = $IANACharset->{'ecma-114'}
351     = $IANACharset->{'asmo-708'}
352     = $IANACharset->{'arabic'}
353     = $IANACharset->{'csisolatinarabic'}
354 wakaba 1.9 = $HTMLCharset->{'iso885961987'}
355     = $HTMLCharset->{'isoir127'}
356     = $HTMLCharset->{'iso88596'}
357     = $HTMLCharset->{'ecma114'}
358     = $HTMLCharset->{'asmo708'}
359 wakaba 1.4 = __PACKAGE__->new ({
360 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
361     ## NOTE: 3/0..3/9 have different semantics from U+0030..0039,
362     ## but have same character names (maybe).
363     ## NOTE: According to RFC 2046, the left-hand half of charset "iso-8859-6"
364     ## is the same as "us-ascii".
365     ## TODO: RFC 1345 def?
366 wakaba 1.2 iana_names => {
367 wakaba 1.4 'iso_8859-6:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
368 wakaba 1.2 'iso-ir-127' => REGISTERED_CHARSET_NAME,
369     'iso_8859-6' => REGISTERED_CHARSET_NAME,
370 wakaba 1.4 'iso-8859-6' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
371 wakaba 1.2 'ecma-114' => REGISTERED_CHARSET_NAME,
372     'asmo-708' => REGISTERED_CHARSET_NAME,
373     'arabic' => REGISTERED_CHARSET_NAME,
374     'csisolatinarabic' => REGISTERED_CHARSET_NAME,
375     },
376 wakaba 1.9 ## TODO: |error_level|
377 wakaba 1.4 });
378 wakaba 1.2
379     $Charset->{'iso-8859-7'}
380     = $IANACharset->{'iso_8859-7:1987'}
381     = $IANACharset->{'iso-ir-126'}
382     = $IANACharset->{'iso_8859-7'}
383     = $IANACharset->{'iso-8859-7'}
384     = $IANACharset->{'elot_928'}
385     = $IANACharset->{'ecma-118'}
386     = $IANACharset->{'greek'}
387     = $IANACharset->{'greek8'}
388     = $IANACharset->{'csisolatingreek'}
389 wakaba 1.9 = $HTMLCharset->{'iso885971987'}
390     = $HTMLCharset->{'isoir126'}
391     = $HTMLCharset->{'iso88597'}
392     = $HTMLCharset->{'elot928'}
393     = $HTMLCharset->{'ecma118'}
394 wakaba 1.4 = __PACKAGE__->new ({
395 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
396 wakaba 1.2 iana_names => {
397 wakaba 1.4 'iso_8859-7:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
398 wakaba 1.2 'iso-ir-126' => REGISTERED_CHARSET_NAME,
399     'iso_8859-7' => REGISTERED_CHARSET_NAME,
400 wakaba 1.4 'iso-8859-7' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
401 wakaba 1.2 'elot_928' => REGISTERED_CHARSET_NAME,
402     'ecma-118' => REGISTERED_CHARSET_NAME,
403     'greek' => REGISTERED_CHARSET_NAME,
404     'greek8' => REGISTERED_CHARSET_NAME,
405     'csisolatingreek' => REGISTERED_CHARSET_NAME,
406     },
407 wakaba 1.9 ## TODO: |error_level|
408 wakaba 1.4 });
409 wakaba 1.2
410     $Charset->{'iso-8859-8'}
411     = $IANACharset->{'iso_8859-8:1988'}
412     = $IANACharset->{'iso-ir-138'}
413     = $IANACharset->{'iso_8859-8'}
414     = $IANACharset->{'iso-8859-8'}
415     = $IANACharset->{'hebrew'}
416     = $IANACharset->{'csisolatinhebrew'}
417 wakaba 1.9 = $HTMLCharset->{'iso885981988'}
418     = $HTMLCharset->{'isoir138'}
419     = $HTMLCharset->{'iso88598'}
420 wakaba 1.4 = __PACKAGE__->new ({
421 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
422 wakaba 1.2 iana_names => {
423 wakaba 1.4 'iso_8859-8:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
424 wakaba 1.2 'iso-ir-138' => REGISTERED_CHARSET_NAME,
425     'iso_8859-8' => REGISTERED_CHARSET_NAME,
426 wakaba 1.4 'iso-8859-8' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
427 wakaba 1.2 'hebrew' => REGISTERED_CHARSET_NAME,
428     'csisolatinhebrew' => REGISTERED_CHARSET_NAME,
429     },
430 wakaba 1.9 ## TODO: |error_level|
431 wakaba 1.4 });
432 wakaba 1.2
433     $Charset->{'iso-8859-9'}
434     = $IANACharset->{'iso_8859-9:1989'}
435     = $IANACharset->{'iso-ir-148'}
436     = $IANACharset->{'iso_8859-9'}
437     = $IANACharset->{'iso-8859-9'}
438     = $IANACharset->{'latin5'}
439     = $IANACharset->{'l5'}
440     = $IANACharset->{'csisolatin5'}
441 wakaba 1.9 = $HTMLCharset->{'iso885991989'}
442     = $HTMLCharset->{'isoir148'}
443     = $HTMLCharset->{'iso88599'}
444 wakaba 1.4 = __PACKAGE__->new ({
445 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
446 wakaba 1.2 iana_names => {
447 wakaba 1.4 'iso_8859-9:1989' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
448 wakaba 1.2 'iso-ir-148' => REGISTERED_CHARSET_NAME,
449     'iso_8859-9' => REGISTERED_CHARSET_NAME,
450 wakaba 1.4 'iso-8859-9' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
451 wakaba 1.2 'latin5' => REGISTERED_CHARSET_NAME,
452     'l5' => REGISTERED_CHARSET_NAME,
453     'csisolatin5' => REGISTERED_CHARSET_NAME,
454     },
455 wakaba 1.9 perl_names => {
456     'web-latin5' => UNREGISTERED_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
457     ERROR_REPORTING_ENCODING_IMPL,
458     'cp1254' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
459     },
460     fallback => {
461     "\x80" => "\x{20AC}",
462     "\x81" => undef,
463     "\x82" => "\x{201A}",
464     "\x83" => "\x{0192}",
465     "\x84" => "\x{201E}",
466     "\x85" => "\x{2026}",
467     "\x86" => "\x{2020}",
468     "\x87" => "\x{2021}",
469     "\x88" => "\x{02C6}",
470     "\x89" => "\x{2030}",
471     "\x8A" => "\x{0160}",
472     "\x8B" => "\x{2039}",
473     "\x8C" => "\x{0152}",
474     "\x8D" => undef,
475     "\x8E" => undef,
476     "\x8F" => undef,
477     "\x90" => undef,
478     "\x91" => "\x{2018}",
479     "\x92" => "\x{2019}",
480     "\x93" => "\x{201C}",
481     "\x94" => "\x{201D}",
482     "\x95" => "\x{2022}",
483     "\x96" => "\x{2013}",
484     "\x97" => "\x{2014}",
485     "\x98" => "\x{02DC}",
486     "\x99" => "\x{2122}",
487     "\x9A" => "\x{0161}",
488     "\x9B" => "\x{203A}",
489     "\x9C" => "\x{0153}",
490     "\x9D" => undef,
491     "\x9E" => undef,
492     "\x9F" => "\x{0178}",
493     },
494     ## NOTE: Treated as |windows-1254|. Properties of this charset
495     ## should be consistent with those of that charset.
496 wakaba 1.4 });
497 wakaba 1.2
498     $Charset->{'iso-8859-10'}
499     = $IANACharset->{'iso-8859-10'}
500     = $IANACharset->{'iso-ir-157'}
501     = $IANACharset->{'l6'}
502     = $IANACharset->{'iso_8859-10:1992'}
503     = $IANACharset->{'csisolatin6'}
504     = $IANACharset->{'latin6'}
505 wakaba 1.9 = $HTMLCharset->{'iso885910'}
506     = $HTMLCharset->{'isoir157'}
507     = $HTMLCharset->{'iso8859101992'}
508 wakaba 1.4 = __PACKAGE__->new ({
509 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
510 wakaba 1.2 iana_names => {
511 wakaba 1.4 'iso-8859-10' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
512 wakaba 1.2 'iso-ir-157' => REGISTERED_CHARSET_NAME,
513     'l6' => REGISTERED_CHARSET_NAME,
514     'iso_8859-10:1992' => REGISTERED_CHARSET_NAME,
515     'csisolatin6' => REGISTERED_CHARSET_NAME,
516     'latin6' => REGISTERED_CHARSET_NAME,
517     },
518 wakaba 1.9 ## TODO: |error_level|
519 wakaba 1.4 });
520 wakaba 1.2
521     $Charset->{'iso_6937-2-add'}
522     = $IANACharset->{'iso_6937-2-add'}
523     = $IANACharset->{'iso-ir-142'}
524     = $IANACharset->{'csisotextcomm'}
525 wakaba 1.9 = $HTMLCharset->{'iso69372add'}
526     = $HTMLCharset->{'isoir142'}
527 wakaba 1.4 = __PACKAGE__->new ({
528 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
529 wakaba 1.2 iana_names => {
530 wakaba 1.4 'iso_6937-2-add' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
531 wakaba 1.2 'iso-ir-142' => REGISTERED_CHARSET_NAME,
532     'csisotextcomm' => REGISTERED_CHARSET_NAME,
533     },
534 wakaba 1.9 ## TODO: |error_level|
535 wakaba 1.4 });
536 wakaba 1.2
537     $Charset->{'jis_x0201'}
538     = $IANACharset->{'jis_x0201'}
539     = $IANACharset->{'x0201'}
540     = $IANACharset->{'cshalfwidthkatakana'}
541 wakaba 1.9 = $HTMLCharset->{'jisx0201'}
542 wakaba 1.4 = __PACKAGE__->new ({
543 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
544 wakaba 1.2 iana_names => {
545 wakaba 1.4 'jis_x0201' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
546 wakaba 1.2 'x0201' => REGISTERED_CHARSET_NAME,
547     'cshalfwidthkatakana' => REGISTERED_CHARSET_NAME,
548     },
549 wakaba 1.9 ## TODO: |error_level|
550 wakaba 1.4 });
551 wakaba 1.2
552     $Charset->{'jis_encoding'}
553     = $IANACharset->{'jis_encoding'}
554     = $IANACharset->{'csjisencoding'}
555 wakaba 1.9 = $HTMLCharset->{'jisencoding'}
556 wakaba 1.4 = __PACKAGE__->new ({
557 wakaba 1.5 category => 0,
558 wakaba 1.2 iana_names => {
559 wakaba 1.4 'jis_encoding' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
560 wakaba 1.2 'csjisencoding' => REGISTERED_CHARSET_NAME,
561     },
562     ## NOTE: What is this?
563 wakaba 1.4 });
564 wakaba 1.1
565     $Charset->{'shift_jis'}
566     = $IANACharset->{'shift_jis'}
567     = $IANACharset->{'ms_kanji'}
568     = $IANACharset->{'csshiftjis'}
569 wakaba 1.9 = $HTMLCharset->{'shiftjis'}
570     = $HTMLCharset->{'mskanji'}
571 wakaba 1.4 = __PACKAGE__->new ({
572 wakaba 1.9 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
573     CHARSET_CATEGORY_MIME_TEXT,
574 wakaba 1.1 iana_names => {
575 wakaba 1.4 'shift_jis' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
576 wakaba 1.1 'ms_kanji' => REGISTERED_CHARSET_NAME,
577     'csshiftjis' => REGISTERED_CHARSET_NAME,
578     },
579 wakaba 1.5 perl_names => {
580 wakaba 1.6 'shift-jis-1997' => UNREGISTERED_CHARSET_NAME |
581 wakaba 1.7 SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
582     shiftjis => PRIMARY_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
583 wakaba 1.6 ERROR_REPORTING_ENCODING_IMPL,
584 wakaba 1.5 ## NOTE: Unicode mapping is wrong.
585     },
586 wakaba 1.9 ## TODO: |error_level|
587 wakaba 1.4 });
588 wakaba 1.1
589 wakaba 1.3 $Charset->{'x-sjis'}
590     = $IANACharset->{'x-sjis'}
591 wakaba 1.9 = $HTMLCharset->{'xsjis'}
592 wakaba 1.4 = __PACKAGE__->new ({
593 wakaba 1.9 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
594     CHARSET_CATEGORY_ASCII_COMPAT,
595 wakaba 1.3 iana_names => {
596     'x-sjis' => UNREGISTERED_CHARSET_NAME,
597     },
598 wakaba 1.6 perl_names => {
599 wakaba 1.7 'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
600 wakaba 1.6 },
601 wakaba 1.9 ## TODO: |error_level|
602 wakaba 1.4 });
603 wakaba 1.3
604 wakaba 1.5 $Charset->{shift_jisx0213}
605     = $IANACharset->{shift_jisx0213}
606 wakaba 1.9 = $HTMLCharset->{shiftjisx0213}
607 wakaba 1.5 = __PACKAGE__->new ({
608 wakaba 1.9 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
609     CHARSET_CATEGORY_MIME_TEXT,
610 wakaba 1.5 iana_names => {
611     shift_jisx0213 => UNREGISTERED_CHARSET_NAME,
612     },
613     perl_names => {
614     #shift_jisx0213 (non-standard - i don't know its conformance)
615 wakaba 1.7 'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
616     'shiftjis' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
617 wakaba 1.5 },
618 wakaba 1.9 ## TODO: |error_level|
619 wakaba 1.5 });
620    
621 wakaba 1.1 $Charset->{'euc-jp'}
622     = $IANACharset->{'extended_unix_code_packed_format_for_japanese'}
623     = $IANACharset->{'cseucpkdfmtjapanese'}
624     = $IANACharset->{'euc-jp'}
625 wakaba 1.9 = $HTMLCharset->{'extendedunixcodepackedformatforjapanese'}
626     = $HTMLCharset->{'cseucpkdfmtjapanese'}
627     = $HTMLCharset->{'eucjp'}
628 wakaba 1.4 = __PACKAGE__->new ({
629 wakaba 1.9 category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE |
630     CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_ASCII_COMPAT,
631 wakaba 1.1 iana_names => {
632 wakaba 1.4 'extended_unix_code_packed_format_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
633 wakaba 1.1 'cseucpkdfmtjapanese' => REGISTERED_CHARSET_NAME,
634 wakaba 1.4 'euc-jp' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
635 wakaba 1.1 },
636 wakaba 1.5 perl_names => {
637 wakaba 1.7 'euc-jp-1997' => UNREGISTERED_CHARSET_NAME |
638     SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
639 wakaba 1.5 ## NOTE: Though the IANA definition references the 1990 version
640     ## of EUC-JP, the 1997 version of the JIS standard claims that it defines
641     ## the same coded character set as the 1990 version, so we treat the
642     ## 1990 version of EUC-JP as identical to the 1997 version.
643 wakaba 1.6 'euc-jp' => PREFERRED_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
644     ERROR_REPORTING_ENCODING_IMPL,
645 wakaba 1.5 ## NOTE: Unicode mapping is wrong.
646     },
647 wakaba 1.9 ## TODO: |error_level|
648 wakaba 1.4 });
649 wakaba 1.3
650     $Charset->{'x-euc-jp'}
651     = $IANACharset->{'x-euc-jp'}
652 wakaba 1.9 = $HTMLCharset->{'xeucjp'}
653 wakaba 1.4 = __PACKAGE__->new ({
654 wakaba 1.9 category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE |
655     CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_ASCII_COMPAT,
656 wakaba 1.3 iana_names => {
657     'x-euc-jp' => UNREGISTERED_CHARSET_NAME,
658     },
659 wakaba 1.6 perl_names => {
660     'euc-jp-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
661     'euc-jp' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
662     },
663 wakaba 1.4 });
664 wakaba 1.1
665 wakaba 1.2 $Charset->{'extended_unix_code_fixed_width_for_japanese'}
666     = $IANACharset->{'extended_unix_code_fixed_width_for_japanese'}
667     = $IANACharset->{'cseucfixwidjapanese'}
668 wakaba 1.9 = $HTMLCharset->{'extendedunixcodefixedwidthforjapanese'}
669 wakaba 1.4 = __PACKAGE__->new ({
670 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
671 wakaba 1.2 iana_names => {
672 wakaba 1.4 'extended_unix_code_fixed_width_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
673 wakaba 1.2 'cseucfixwidjapanese' => REGISTERED_CHARSET_NAME,
674     },
675 wakaba 1.9 ## TODO: |error_level|
676 wakaba 1.4 });
677 wakaba 1.2
678 wakaba 1.1 ## TODO: ...
679    
## EUC-KR.  Every lookup key in the chain below refers to one shared
## charset object.
$Charset->{'euc-kr'}
    = $IANACharset->{'euc-kr'}
    = $IANACharset->{'cseuckr'}
    = $HTMLCharset->{'euckr'}
    = __PACKAGE__->new ({
  category => CHARSET_CATEGORY_ASCII_COMPAT | CHARSET_CATEGORY_BLOCK_SAFE,
  iana_names => {
    'cseuckr' => REGISTERED_CHARSET_NAME,
    'euc-kr' => REGISTERED_CHARSET_NAME | PREFERRED_CHARSET_NAME |
        PRIMARY_CHARSET_NAME,
  },
  perl_names => {
    ## TODO: We need a parse error generating wrapper for the decoder.
    'cp949' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
  },
  ## NOTE: |euc-kr| is handled as |windows-949|, so its properties
  ## should be kept consistent with that encoding's properties.
});

## ISO-2022-JP, including the unregistered spellings |iso2022jp| and
## |junet-code|.
$Charset->{'iso-2022-jp'}
    = $IANACharset->{'iso-2022-jp'}
    = $IANACharset->{'csiso2022jp'}
    = $IANACharset->{'iso2022jp'}
    = $IANACharset->{'junet-code'}
    = $HTMLCharset->{'iso2022jp'}
    = $HTMLCharset->{'junetcode'}
    = __PACKAGE__->new ({
  category => CHARSET_CATEGORY_MIME_TEXT,
  iana_names => {
    'csiso2022jp' => REGISTERED_CHARSET_NAME,
    'iso-2022-jp' => REGISTERED_CHARSET_NAME | PRIMARY_CHARSET_NAME |
        PREFERRED_CHARSET_NAME,
    'iso2022jp' => UNREGISTERED_CHARSET_NAME,
    'junet-code' => UNREGISTERED_CHARSET_NAME,
  },
  ## TODO: |error_level|
});

## ISO-2022-JP-2.
$Charset->{'iso-2022-jp-2'}
    = $IANACharset->{'iso-2022-jp-2'}
    = $IANACharset->{'csiso2022jp2'}
    = $HTMLCharset->{'iso2022jp2'}
    = __PACKAGE__->new ({
  category => CHARSET_CATEGORY_MIME_TEXT,
  iana_names => {
    'csiso2022jp2' => REGISTERED_CHARSET_NAME,
    'iso-2022-jp-2' => REGISTERED_CHARSET_NAME | PRIMARY_CHARSET_NAME |
        PREFERRED_CHARSET_NAME,
  },
  ## TODO: |error_level|
});
728    
729     ## TODO: ...
730    
## GB_2312-80 (the coded character set, as opposed to the |gb2312|
## encoding registered separately).  It is reachable by name only: no
## |$Charset| entry is created for it.
##
## Every alias listed in |iana_names| is entered in |$IANACharset|,
## including |csiso58gb231280|, so that a lookup by any registered
## alias returns this object rather than a freshly created unknown
## charset.
$IANACharset->{'gb_2312-80'}
    = $IANACharset->{'iso-ir-58'}
    = $IANACharset->{chinese}
    = $IANACharset->{'csiso58gb231280'}
    = $HTMLCharset->{gb231280}
    = $HTMLCharset->{isoir58}
    = __PACKAGE__->new ({
  ## NOTE: What is represented by this charset is unclear... I don't
  ## understand what RFC 1945 describes...
  category => 0,
  iana_names => {
    'gb_2312-80' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'iso-ir-58' => REGISTERED_CHARSET_NAME,
    'chinese' => REGISTERED_CHARSET_NAME,
    'csiso58gb231280' => REGISTERED_CHARSET_NAME,
  },
  perl_names => {
    ## TODO: GB2312->GBK Parse Error wrapper
    'cp936' => FALLBACK_ENCODING_IMPL,
  },
  ## NOTE: Like |gb2312|, this charset is handled as |gbk| (the only
  ## Perl implementation listed is |cp936|, as a fallback), so the
  ## properties should be kept consistent with that encoding.
});
753 wakaba 1.1
754     ## TODO: ...
755    
## UTF-8, with |x-utf-8| accepted as an unregistered alias.
$Charset->{'utf-8'}
    = $IANACharset->{'utf-8'}
    = $IANACharset->{'x-utf-8'}
    = $HTMLCharset->{'utf8'}
    = $HTMLCharset->{'xutf8'}
    = __PACKAGE__->new ({
  category => CHARSET_CATEGORY_ASCII_COMPAT | CHARSET_CATEGORY_BLOCK_SAFE |
      CHARSET_CATEGORY_MIME_TEXT,
  iana_names => {
    'utf-8' => REGISTERED_CHARSET_NAME | PRIMARY_CHARSET_NAME,
        ## NOTE: IANA name "utf-8" references RFC 3629.  According to the
        ## RFC, the definitive definition is the one specified in the
        ## Unicode Standard.
    'x-utf-8' => UNREGISTERED_CHARSET_NAME,
        ## NOTE: We treat |x-utf-8| as an alias of |utf-8|, since unlike
        ## other charsets such as |x-sjis| or |x-euc-jp|, there is no
        ## major variant of the UTF-8 encoding.
        ## TODO: We might need to reconsider this policy, since there
        ## are in fact UTF-8 variants, such as Unicode's UTF-8,
        ## ISO/IEC 10646's UTF-8, UTF-8n, and so on.
  },
  perl_names => {
    'utf-8-strict' => ERROR_REPORTING_ENCODING_IMPL |
        SEMICONFORMING_ENCODING_IMPL | PRIMARY_CHARSET_NAME,
        ## NOTE: It does not support non-Unicode UCS characters (conforming).
        ## It does detect illegal sequences (conforming).
        ## It does not support surrogate pairs (conforming).
        ## It does not support BOMs (non-conforming).
  },
  ## TODO: |error_level|
  bom_pattern => qr/\xEF\xBB\xBF/,
});
788 wakaba 1.3
## UTF-8N (unregistered name).
##
## The |$HTMLCharset| key must be the punctuation-stripped form of the
## name, because |get_by_html_name| removes "-" (and other ASCII
## punctuation) before looking a name up.  A key of |utf-8| could never
## be matched, so the key is |utf8n|.
$Charset->{'utf-8n'}
    = $IANACharset->{'utf-8n'}
    = $HTMLCharset->{'utf8n'}
    = __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT |
      CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'utf-8n' => UNREGISTERED_CHARSET_NAME,
        ## NOTE: Is there any normative definition for the charset?
        ## What variant of UTF-8 should we use for the charset?
  },
  perl_names => {
    'utf-8-strict' => PRIMARY_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
  },
  ## TODO: |error_level|
});
805 wakaba 1.1
806     ## TODO: ...
807    
## GBK and its registered aliases.  The aliases without punctuation
## have no separate |$HTMLCharset| entry; only |windows936| does.
$Charset->{'gbk'}
    = $IANACharset->{'gbk'}
    = $IANACharset->{'cp936'}
    = $IANACharset->{'ms936'}
    = $IANACharset->{'windows-936'}
    = $HTMLCharset->{'windows936'}
    = __PACKAGE__->new ({
  category => CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_BLOCK_SAFE,
  iana_names => {
    'cp936' => REGISTERED_CHARSET_NAME,
    'gbk' => REGISTERED_CHARSET_NAME | PRIMARY_CHARSET_NAME,
    'ms936' => REGISTERED_CHARSET_NAME,
    'windows-936' => REGISTERED_CHARSET_NAME,
  },
  ## TODO: |error_level|
  iana_status => STATUS_COMMON | STATUS_OBSOLETE,
});

## GB18030.
$Charset->{'gb18030'}
    = $IANACharset->{'gb18030'}
    = __PACKAGE__->new ({
  category => CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_BLOCK_SAFE,
  iana_names => {
    'gb18030' => REGISTERED_CHARSET_NAME | PRIMARY_CHARSET_NAME,
  },
  iana_status => STATUS_COMMON,
  mime_text_suitable => 1,
});
836 wakaba 1.2
837     ## TODO: ...
838    
## UTF-16BE, UTF-16LE and UTF-16.  The three registrations have the same
## shape, so they are generated in a loop; each name still receives its
## own, distinct charset object.
for my $utf16_name (qw(utf-16be utf-16le utf-16)) {
  ## The |$HTMLCharset| key is the name with the hyphen removed.
  (my $html_key = $utf16_name) =~ tr/-//d;

  $Charset->{$utf16_name}
      = $IANACharset->{$utf16_name}
      = $HTMLCharset->{$html_key}
      = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_UTF16 | CHARSET_CATEGORY_BLOCK_SAFE,
    iana_names => {
      $utf16_name => REGISTERED_CHARSET_NAME | PRIMARY_CHARSET_NAME,
    },
    ## TODO: |error_level|
  });
}
871 wakaba 1.1
872     ## TODO: ...
873    
## Windows-31J.
$Charset->{'windows-31j'}
    = $IANACharset->{'windows-31j'}
    = $IANACharset->{'cswindows31j'}
    = $HTMLCharset->{'windows31j'}
    = __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT |
      CHARSET_CATEGORY_SJIS,
  iana_names => {
    'cswindows31j' => REGISTERED_CHARSET_NAME,
    'windows-31j' => REGISTERED_CHARSET_NAME | PRIMARY_CHARSET_NAME,
  },
  iana_status => STATUS_LIMITED_USE, # maybe
  ## TODO: |error_level|
});

## GB2312 (the encoding).  The only Perl implementation listed is
## |cp936|, as a fallback.
$Charset->{'gb2312'}
    = $IANACharset->{'gb2312'}
    = $IANACharset->{'csgb2312'}
    = __PACKAGE__->new ({
  category => CHARSET_CATEGORY_ASCII_COMPAT | CHARSET_CATEGORY_BLOCK_SAFE |
      CHARSET_CATEGORY_MIME_TEXT,
  iana_names => {
    'csgb2312' => REGISTERED_CHARSET_NAME,
    'gb2312' => REGISTERED_CHARSET_NAME | PREFERRED_CHARSET_NAME |
        PRIMARY_CHARSET_NAME,
  },
  perl_names => {
    ## TODO: GB2312->GBK Parse Error wrapper
    'cp936' => FALLBACK_ENCODING_IMPL,
  },
  ## NOTE: |gb2312| is handled as |gbk|, so its properties should be
  ## kept consistent with that encoding's.
});

## Big5, with |x-x-big5| accepted as an unregistered alias.
$Charset->{'big5'}
    = $IANACharset->{'big5'}
    = $IANACharset->{'csbig5'}
    = $IANACharset->{'x-x-big5'}
    = $HTMLCharset->{xxbig5}
    = __PACKAGE__->new ({
  category => CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_BLOCK_SAFE,
  iana_names => {
    'big5' => REGISTERED_CHARSET_NAME | PREFERRED_CHARSET_NAME |
        PRIMARY_CHARSET_NAME,
    'csbig5' => REGISTERED_CHARSET_NAME,
    'x-x-big5' => UNREGISTERED_CHARSET_NAME,
        ## NOTE: In HTML5, |x-x-big5| is defined as an alias of |big5|.
        ## According to that spec, if there is any difference between
        ## the input and replacement encodings, the result is a parse
        ## error.  However, since there is no formal definition for the
        ## |x-x-big5| charset, we cannot raise such errors.
  },
  ## TODO: |error_level|
});
926 wakaba 1.2
927     ## TODO: ...
928    
## Big5-HKSCS.
$Charset->{'big5-hkscs'}
    = $IANACharset->{'big5-hkscs'}
    = $HTMLCharset->{'big5hkscs'}
    = __PACKAGE__->new ({
  category => CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_BLOCK_SAFE,
  iana_names => {
    'big5-hkscs' => REGISTERED_CHARSET_NAME | PRIMARY_CHARSET_NAME,
  },
  ## TODO: |error_level|
});
939 wakaba 1.2
940     ## TODO: ...
941    
## Windows-1252, -1253 and -1254.  The three registrations differ only
## in the code page number, so they are generated in a loop; each code
## page still receives its own, distinct charset object.
for my $code_page (1252, 1253, 1254) {
  my $iana_name = "windows-$code_page";

  $Charset->{$iana_name}
      = $IANACharset->{$iana_name}
      = $HTMLCharset->{"windows$code_page"}
      = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_ASCII_COMPAT | CHARSET_CATEGORY_BLOCK_SAFE |
        CHARSET_CATEGORY_MIME_TEXT,
    iana_names => {
      $iana_name => REGISTERED_CHARSET_NAME | PRIMARY_CHARSET_NAME,
    },
    ## TODO: Check whether use of 0x81 is conforming or not...
  });
}
977 wakaba 1.1
978     ## TODO: ...
979    
## TIS-620.
##
## The |fallback| table maps single bytes either to a replacement
## character or to |undef|.  How the table is consumed is decided by the
## decode handle class, which is not part of this file.
$Charset->{'tis-620'}
    = $IANACharset->{'tis-620'}
    = $HTMLCharset->{'tis620'}
    = __PACKAGE__->new ({
  category => CHARSET_CATEGORY_ASCII_COMPAT | CHARSET_CATEGORY_BLOCK_SAFE,
  iana_names => {
    'tis-620' => REGISTERED_CHARSET_NAME | PRIMARY_CHARSET_NAME,
  },
  perl_names => {
    'web-tis-620' => ERROR_REPORTING_ENCODING_IMPL | UNREGISTERED_CHARSET_NAME,
    'windows-874' => ERROR_REPORTING_ENCODING_IMPL | FALLBACK_ENCODING_IMPL,
  },
  fallback => {
    ## Bytes with no replacement: 0x81-0x84, 0x86-0x90 and 0x98-0x9F.
    (map { (chr ($_) => undef) } 0x81 .. 0x84, 0x86 .. 0x90, 0x98 .. 0x9F),
    "\x80" => "\x{20AC}",
    "\x85" => "\x{2026}",
    "\x91" => "\x{2018}",
    "\x92" => "\x{2019}",
    "\x93" => "\x{201C}",
    "\x94" => "\x{201D}",
    "\x95" => "\x{2022}",
    "\x96" => "\x{2013}",
    "\x97" => "\x{2014}",
    "\xA0" => "\xA0",
  },
  ## NOTE: |tis-620| is treated as |windows-874|, so ensure that
  ## they are consistent.
});

## ISO-8859-11 (unregistered name).  Same |fallback| table as |tis-620|
## except that it has no entry for 0xA0.
$Charset->{'iso-8859-11'}
    = $IANACharset->{'iso-8859-11'}
    = $HTMLCharset->{'iso885911'}
    = __PACKAGE__->new ({
  category => CHARSET_CATEGORY_ASCII_COMPAT | CHARSET_CATEGORY_BLOCK_SAFE,
  iana_names => {
    'iso-8859-11' => UNREGISTERED_CHARSET_NAME,
        ## NOTE: The Web Thai encoding, i.e. windows-874.
  },
  perl_names => {
    'web-thai' => ERROR_REPORTING_ENCODING_IMPL | UNREGISTERED_CHARSET_NAME,
    'windows-874' => ERROR_REPORTING_ENCODING_IMPL | FALLBACK_ENCODING_IMPL,
  },
  fallback => {
    ## Bytes with no replacement: 0x81-0x84, 0x86-0x90 and 0x98-0x9F.
    (map { (chr ($_) => undef) } 0x81 .. 0x84, 0x86 .. 0x90, 0x98 .. 0x9F),
    "\x80" => "\x{20AC}",
    "\x85" => "\x{2026}",
    "\x91" => "\x{2018}",
    "\x92" => "\x{2019}",
    "\x93" => "\x{201C}",
    "\x94" => "\x{201D}",
    "\x95" => "\x{2022}",
    "\x96" => "\x{2013}",
    "\x97" => "\x{2014}",
  },
  ## NOTE: |iso-8859-11| is treated as |windows-874|, so ensure that
  ## they are consistent.
});
1047    
## Windows-874 (unregistered as an IANA name here).
$Charset->{'windows-874'}
    = $IANACharset->{'windows-874'}
    = $HTMLCharset->{'windows874'}
    = __PACKAGE__->new ({
  category => CHARSET_CATEGORY_ASCII_COMPAT | CHARSET_CATEGORY_BLOCK_SAFE,
  iana_names => {
    'windows-874' => UNREGISTERED_CHARSET_NAME,
  },
  perl_names => {
    'windows-874' => ERROR_REPORTING_ENCODING_IMPL | REGISTERED_CHARSET_NAME,
  },
  ## TODO: |error_level|
});

## Windows-949.  Reachable by name only: no |$Charset| entry is created.
$IANACharset->{'windows-949'}
    = $HTMLCharset->{windows949}
    = __PACKAGE__->new ({
  category => CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_BLOCK_SAFE,
  iana_names => {
    'windows-949' => UNREGISTERED_CHARSET_NAME,
  },
  perl_names => {
    'cp949' => ERROR_REPORTING_ENCODING_IMPL | NONCONFORMING_ENCODING_IMPL |
        PREFERRED_CHARSET_NAME,
        ## TODO: Is this implementation conforming?
  },
  ## NOTE: |error_level| is the same as the default, since we can't find
  ## any formal definition for this charset.
});
1077    
## Constructor.  Blesses the given hash reference itself (no copy is
## made) into the invocant class and returns it.
sub new ($$) {
  my ($class, $self) = @_;
  return bless $self, $class;
} # new
1081    
1082     ## NOTE: A class method
## Returns the charset object for a charset name, matched loosely: the
## name is lowercased (ASCII only) and ASCII whitespace and punctuation
## are removed before it is looked up in |$HTMLCharset| and then in
## |$IANACharset|.
##
## If neither lookup succeeds, the lowercased but otherwise unmodified
## name is looked up in |$IANACharset|; an existing entry is reused
## rather than overwritten.  Only when that also fails is a new charset
## object created with the name recorded as an unregistered IANA name.
## In either case the result is cached in |$HTMLCharset| under the
## stripped name.
##
## Invoked as a class method; the name is the first argument after the
## invocant.
sub get_by_html_name ($$) {
  my $name = $_[1];
  $name =~ tr/A-Z/a-z/; ## ASCII case-insensitive
  my $iana_name = $name;
  $name =~ s/[\x09-\x0D\x20-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]//g;
      ## NOTE: U+000B is included.

  my $charset = $HTMLCharset->{$name} || $IANACharset->{$name};
  return $charset if $charset;

  ## Do not clobber a charset already registered under the exact
  ## (unstripped) IANA name.
  $charset = $IANACharset->{$iana_name} ||= __PACKAGE__->new ({
    iana_names => {
      $iana_name => UNREGISTERED_CHARSET_NAME,
    },
  });
  return $HTMLCharset->{$name} = $charset;
} # get_by_html_name
1099    
1100     ## NOTE: A class method
## Returns the charset object registered in |$IANACharset| under the
## given name, compared ASCII case-insensitively.  If there is none, a
## new object with the name recorded as an unregistered IANA name is
## created, stored and returned.
##
## Invoked as a class method; the name is the first argument after the
## invocant.
sub get_by_iana_name ($$) {
  (my $key = $_[1]) =~ tr/A-Z/a-z/; ## ASCII case-insensitive
  return $IANACharset->{$key} ||= __PACKAGE__->new ({
    iana_names => {
      $key => UNREGISTERED_CHARSET_NAME,
    },
  });
} # get_by_iana_name
1113    
## Creates a decode handle for this charset, reading from |$byte_stream|.
##
## Arguments: the byte stream (stored as |filehandle|), followed by
## key/value options.  Options read here:
##   byte_buffer - reference to a string; its current value is copied.
##   onerror     - error callback; defaults to a no-op.
##   level       - level map; a default map is used if omitted.
## All options are also passed on to |get_perl_encoding|, with
## |allow_semiconforming| forced on.
##
## Returns a two-element list: the handle object and a status bit set.
## For ISO-2022-JP the handle is a
## |Whatpm::Charset::DecodeHandle::ISO2022JP| object, provided the
## required GL-JIS encodings can be found.  Otherwise the handle is a
## |Whatpm::Charset::DecodeHandle::Encode| object wrapping the Perl
## encoding chosen by |get_perl_encoding|.  If no encoding is
## available, |(undef, 0)| is returned.
sub get_decode_handle ($$;%) {
  my $self = shift;
  my $byte_stream = shift;
  my %opt = @_;

  my $obj = {
    category => $self->{category},
    char_buffer => \(my $s = ''),
    char_buffer_pos => 0,
    character_queue => [],
    filehandle => $byte_stream,
    charset => '', ## TODO: We set a charset name for input_encoding (when we get identify-by-URI nonsense away)
    byte_buffer => $opt{byte_buffer} ? ${$opt{byte_buffer}} : '', ## TODO: ref, instead of value, should be used
    onerror => $opt{onerror} || sub {},
    level => $opt{level} || {
      must => 'm',
      charset_variant => 'm',
      charset_fact => 'm',
      iso_shall => 'm',
    },
    error_level => $self->{error_level} || {
      ## HTML5 charset name aliases
      ## NOTE: Use of code points in the variant whose definition differs
      ## from the original charset is a parse error in HTML5.  However,
      ## it does not affect the document conformance; the HTML5 spec
      ## does not define the conformance of the input stream against the
      ## charset in use.
      'fallback-char-error' => 'charset_variant',
      #'fallback-illegal-error' => 'charset_variant',
      'fallback-unassigned-error' => 'charset_variant',
      ## NOTE: An appropriate error level should be set for each charset
      ## (many charsets prohibit use of unassigned code points).

      'illegal-octets-error' => 'charset_fact',
      'unassigned-code-point-error' => 'charset_fact',
      'invalid-state-error' => 'charset_fact',
    },
  };

  ## |Encode::find_encoding| is called below, so load |Encode| here
  ## instead of relying on another module having loaded it.
  require Encode;
  require Whatpm::Charset::DecodeHandle;

  ## The |... || {}| guards keep these read-only tests from
  ## autovivifying |iana_names| / |xml_names| on the shared charset
  ## object.
  if (($self->{iana_names} || {})->{'iso-2022-jp'}) {
    $obj->{state_2440} = 'gl-jis-1978';
    $obj->{state_2442} = 'gl-jis-1983';
    $obj->{state} = 'state_2842';
    ## The GL-JIS modules are optional; a load failure is detected by
    ## the |find_encoding| checks that follow.
    eval {
      require Encode::GLJIS1978;
      require Encode::GLJIS1983;
    };
    if (Encode::find_encoding ($obj->{state_2440}) and
        Encode::find_encoding ($obj->{state_2442})) {
      return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP'),
              PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME);
    }
  } elsif (($self->{xml_names} || {})->{'iso-2022-jp'}) {
    $obj->{state_2440} = 'gl-jis-1997-swapped';
    $obj->{state_2442} = 'gl-jis-1997';
    $obj->{state} = 'state_2842';
    eval {
      require Encode::GLJIS1997Swapped;
      require Encode::GLJIS1997;
    };
    if (Encode::find_encoding ($obj->{state_2440}) and
        Encode::find_encoding ($obj->{state_2442})) {
      return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP'),
              PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME);
    }
  }

  ## Generic path: wrap whatever Perl encoding is available.
  my ($e, $e_status) = $self->get_perl_encoding
      (%opt, allow_semiconforming => 1);
  if ($e) {
    $obj->{perl_encoding_name} = $e->name;
    ## A charset not flagged as block-safe is reported with the
    ## fallback bit set in the returned status.
    unless ($self->{category} & CHARSET_CATEGORY_BLOCK_SAFE) {
      $e_status |= FALLBACK_ENCODING_IMPL;
    }
    $obj->{bom_pattern} = $self->{bom_pattern};
    $obj->{fallback} = $self->{fallback};
    return ((bless $obj, 'Whatpm::Charset::DecodeHandle::Encode'), $e_status);
  } else {
    return (undef, 0);
  }
} # get_decode_handle
1196    
## Finds a Perl |Encode| encoding object that implements this charset.
##
## Options (key/value pairs after the invocant):
##   allow_error_reporting - first try implementations flagged
##                           |ERROR_REPORTING_ENCODING_IMPL|.
##   allow_semiconforming  - accept implementations flagged
##                           |SEMICONFORMING_ENCODING_IMPL| in the first
##                           two passes.
##   allow_fallback        - as a last resort, try fallback and
##                           semi-conforming implementations, then the
##                           charset's IANA names themselves.
##
## Returns a two-element list: the encoding object and the status bits
## recorded for the name that matched (|FALLBACK_ENCODING_IMPL| when an
## IANA name matched), or |(undef, 0)| if nothing is available.
##
## Candidate names are tried in sorted order so that the result does not
## depend on Perl's randomised hash iteration order when more than one
## candidate qualifies.
sub get_perl_encoding ($;%) {
  my ($self, %opt) = @_;

  require Encode;

  ## Loads the module that provides a non-core encoding, if the name is
  ## one of those known here.  Other names are left to |Encode| itself.
  my $load_encode = sub {
    my $name = shift;
    if ($name eq 'euc-jp-1997') {
      require Encode::EUCJP1997;
    } elsif ($name eq 'shift-jis-1997') {
      require Encode::ShiftJIS1997;
    } elsif ({'web-latin1' => 1,
              'web-latin1-us-ascii' => 1,
              'web-latin5' => 1}->{$name}) {
      require Whatpm::Charset::WebLatin1;
    } elsif ($name eq 'web-thai' or $name eq 'web-tis-620') {
      require Whatpm::Charset::WebThai;
    }
  }; # $load_encode

  my $perl_names = $self->{perl_names} || {};
  my @perl_names = sort keys %$perl_names;

  ## Pass 1: error-reporting implementations, only on request.
  if ($opt{allow_error_reporting}) {
    for my $perl_name (@perl_names) {
      my $perl_status = $perl_names->{$perl_name};
      next unless $perl_status & ERROR_REPORTING_ENCODING_IMPL;
      next if $perl_status & FALLBACK_ENCODING_IMPL;
      next if $perl_status & SEMICONFORMING_ENCODING_IMPL and
          not $opt{allow_semiconforming};

      $load_encode->($perl_name);
      my $e = Encode::find_encoding ($perl_name);
      if ($e and $e->name eq $perl_name) {
        ## NOTE: Don't return $e unless its name equals $perl_name, since
        ## |find_encoding| resolves e.g. |foobarlatin-1| to |iso-8859-1|,
        ## which might return the wrong encoding object when a dedicated
        ## implementation that is not part of the standard Perl
        ## distribution is desired.
        return ($e, $perl_status);
      }
    }
  }

  ## Pass 2: ordinary implementations (neither error-reporting nor
  ## fallback).
  for my $perl_name (@perl_names) {
    my $perl_status = $perl_names->{$perl_name};
    next if $perl_status & ERROR_REPORTING_ENCODING_IMPL;
    next if $perl_status & FALLBACK_ENCODING_IMPL;
    next if $perl_status & SEMICONFORMING_ENCODING_IMPL and
        not $opt{allow_semiconforming};

    $load_encode->($perl_name);
    my $e = Encode::find_encoding ($perl_name);
    if ($e) {
      return ($e, $perl_status);
    }
  }

  ## Pass 3: fallbacks, only on request.
  if ($opt{allow_fallback}) {
    for my $perl_name (@perl_names) {
      my $perl_status = $perl_names->{$perl_name};
      next unless $perl_status & FALLBACK_ENCODING_IMPL or
          $perl_status & SEMICONFORMING_ENCODING_IMPL;
      ## NOTE: We don't prefer semi-conforming implementations to
      ## non-conforming implementations, since semi-conforming
      ## implementations will never be conforming without the assistance
      ## of the caller, and in such cases the caller should set the
      ## |allow_semiconforming| option upon the invocation of the method
      ## anyway.

      $load_encode->($perl_name);
      my $e = Encode::find_encoding ($perl_name);
      if ($e) {
        return ($e, $perl_status);
      }
    }

    ## Finally, see whether |Encode| knows any of the IANA names.
    for my $iana_name (sort keys %{$self->{iana_names} or {}}) {
      $load_encode->($iana_name);
      my $e = Encode::find_encoding ($iana_name);
      if ($e) {
        return ($e, FALLBACK_ENCODING_IMPL);
      }
    }
  }

  return (undef, 0);
} # get_perl_encoding
1280    
## Returns one IANA name for this charset, chosen in this order of
## preference: a name flagged |PREFERRED_CHARSET_NAME|, then one flagged
## |PRIMARY_CHARSET_NAME|, then a registered name, then any other name.
## Returns a false value if the charset has no names.
##
## Names are examined in sorted order so that the result is stable when
## several names share the same rank; with plain |keys| the choice
## would follow Perl's randomised hash iteration order.
sub get_iana_name ($) {
  my $self = shift;

  my $primary;
  my $other;
  for my $iana_name (sort keys %{$self->{iana_names} or {}}) {
    my $name_status = $self->{iana_names}->{$iana_name};
    if ($name_status & PREFERRED_CHARSET_NAME) {
      return $iana_name;
    } elsif ($name_status & PRIMARY_CHARSET_NAME) {
      $primary = $iana_name;
    } elsif ($name_status & REGISTERED_CHARSET_NAME) {
      ## A registered name replaces any name remembered so far.
      $other = $iana_name;
    } else {
      ## An unregistered name is kept only if nothing else was seen.
      $other ||= $iana_name;
    }
  }

  return $primary || $other;
} # get_iana_name
1301    
1302     ## NOTE: A non-method function
## Tests whether the argument is syntactically acceptable as an IANA
## charset name: 1 to 40 characters, each in the range U+0020..U+007E.
## The result is that of the pattern match itself.
##
## NOTE: According to IANAREG, "The character set names may be up to 40
## characters taken from the printable characters of US-ASCII.  However,
## no distinction is made between use of upper and lower case letters.".
sub is_syntactically_valid_iana_charset_name ($) {
  my ($candidate) = @_;
  return $candidate =~ /\A[\x20-\x7E]{1,40}\z/;
} # is_syntactically_valid_iana_charset_name
1311    
1312     1;
1313 wakaba 1.14 ## $Date: 2008/09/14 06:59:08 $
1314 wakaba 1.1

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24