/[suikacvs]/messaging/manakai/lib/Message/Charset/Info.pm
Suika

Contents of /messaging/manakai/lib/Message/Charset/Info.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (hide annotations) (download)
Wed Sep 10 10:28:57 2008 UTC (16 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.8: +450 -96 lines
++ manakai/lib/Message/Charset/ChangeLog	10 Sep 2008 10:28:40 -0000
2008-09-10  Wakaba  <wakaba@suika.fam.cx>

	* Info.pm: EBCDICness, ASCII compatibility, and MIME text
	suitability are now represented as charset categories.
	Provision for support of HTML's charset name treatment,
	i.e. ignoring ASCII punctuation.  Support of HTML5
	charset name aliases (updated definition).

1 wakaba 1.1 package Message::Charset::Info;
2     use strict;
3 wakaba 1.9 our $VERSION=do{my @r=(q$Revision: 1.8 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.8
5 wakaba 1.9 ## TODO: Certain encodings MUST NOT be implemented [HTML5].
6    
7     ## ISSUE: Should we convert an unassigned code point that has a trivial
8     ## Unicode mapping into U+FFFD? Or should we return that Unicode character
9     ## with an error? (For example, should Windows-1252's 0x81 be converted
10     ## to U+FFFD or to U+0081?)
11 wakaba 1.1
12     sub UNREGISTERED_CHARSET_NAME () { 0b1 }
13 wakaba 1.4 ## Names for non-standard encodings/implementations for Perl encodings
14 wakaba 1.1 sub REGISTERED_CHARSET_NAME () { 0b10 }
15 wakaba 1.4 ## Names for standard encodings for Perl encodings
16     sub PRIMARY_CHARSET_NAME () { 0b100 }
17 wakaba 1.1 ## "Name:" field for IANA names
18 wakaba 1.4 ## Canonical name for Perl encodings
19     sub PREFERRED_CHARSET_NAME () { 0b1000 }
20 wakaba 1.1 ## "preferred MIME name" for IANA names
21    
22 wakaba 1.4 sub FALLBACK_ENCODING_IMPL () { 0b10000 }
23     ## For Perl encodings: Not a name of the encoding, the encoding
24     ## for the name might be useful as a fallback when the correct
25     ## encoding is not supported.
26     sub NONCONFORMING_ENCODING_IMPL () { FALLBACK_ENCODING_IMPL }
27     ## For Perl encodings: Not a conforming implementation of the encoding,
28     ## though apparently intended as one. Same bit as FALLBACK_ENCODING_IMPL.
29 wakaba 1.6 sub SEMICONFORMING_ENCODING_IMPL () { 0b1000000 }
30     ## For Perl encodings: The implementation itself (returned by
31     ## |get_perl_encoding|) is non-conforming. The decode handle
32     ## implementation (returned by |get_decode_handle|) is conforming.
33 wakaba 1.4 sub ERROR_REPORTING_ENCODING_IMPL () { 0b100000 }
34     ## For Perl encodings: Support error reporting via |manakai_onerror|
35 wakaba 1.6 ## handler when the encoding is handled with decode handle.
36 wakaba 1.4
37 wakaba 1.2 ## iana_status
38     sub STATUS_COMMON () { 0b1 }
39     sub STATUS_LIMITED_USE () { 0b10 }
40     sub STATUS_OBSOLETE () { 0b100 }
41    
42 wakaba 1.5 ## category
43     sub CHARSET_CATEGORY_BLOCK_SAFE () { 0b1 }
44     ## NOTE: Stateless
45     sub CHARSET_CATEGORY_EUCJP () { 0b10 }
46     sub CHARSET_CATEGORY_SJIS () { 0b100 }
47 wakaba 1.8 sub CHARSET_CATEGORY_UTF16 () { 0b1000 }
48     ## NOTE: "A UTF-16 encoding" in HTML5.
49 wakaba 1.9 sub CHARSET_CATEGORY_ASCII_COMPAT () { 0b10000 }
50     ## NOTE: "superset of US-ASCII (specifically, ANSI_X3.4-1968)
51     ## for bytes in the range 0x09-0x0A, 0x0C-0x0D, 0x20-0x22, 0x26, 0x27,
52     ## 0x2C-0x3F, 0x41-0x5A, and 0x61-0x7A" [HTML5]
53     sub CHARSET_CATEGORY_EBCDIC () { 0b100000 }
54     ## NOTE: "based on EBCDIC" in HTML5.
55     sub CHARSET_CATEGORY_MIME_TEXT () { 0b1000000 }
56     ## NOTE: Suitable as MIME text.
57 wakaba 1.1
58     ## ISSUE: Is Shift_JIS a superset of US-ASCII? Is ISO-2022-JP?
59     ## ISSUE: Should 0x5F (_) be added to the range?
60    
61 wakaba 1.9 my $Charset; ## TODO: this is obsolete.
62 wakaba 1.1
63     our $IANACharset;
64 wakaba 1.9 ## NOTE: Charset names used where IANA charset names are allowed, either
65     ## registered or not.
66     our $HTMLCharset;
67     ## NOTE: Same as the charset names in $IANACharset, except that all ASCII
68     ## punctuation is dropped and letters/digits-only names are not included.
69 wakaba 1.1
70     $Charset->{'us-ascii'}
71     = $IANACharset->{'ansi_x3.4-1968'}
72     = $IANACharset->{'iso-ir-6'}
73     = $IANACharset->{'ansi_x3.4-1986'}
74     = $IANACharset->{'iso_646.irv:1991'}
75     = $IANACharset->{'ascii'}
76     = $IANACharset->{'iso646-us'}
77     = $IANACharset->{'us-ascii'}
78     = $IANACharset->{'us'}
79     = $IANACharset->{'ibm367'}
80     = $IANACharset->{'cp367'}
81     = $IANACharset->{'csascii'}
82 wakaba 1.9 = $HTMLCharset->{'ansix341968'}
83     = $HTMLCharset->{'isoir6'}
84     = $HTMLCharset->{'ansix341986'}
85     = $HTMLCharset->{'iso646irv1991'}
86     = $HTMLCharset->{'iso646us'}
87     = $HTMLCharset->{'usascii'}
88 wakaba 1.4 = __PACKAGE__->new ({
89 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
90 wakaba 1.1 iana_names => {
91 wakaba 1.4 'ansi_x3.4-1968' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
92 wakaba 1.1 'iso-ir-6' => REGISTERED_CHARSET_NAME,
93     'ansi_x3.4-1986' => REGISTERED_CHARSET_NAME,
94     'iso_646.irv:1991' => REGISTERED_CHARSET_NAME,
95     'ascii' => REGISTERED_CHARSET_NAME,
96     'iso646-us' => REGISTERED_CHARSET_NAME,
97 wakaba 1.4 'us-ascii' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
98 wakaba 1.1 'us' => REGISTERED_CHARSET_NAME,
99     'ibm367' => REGISTERED_CHARSET_NAME,
100     'cp367' => REGISTERED_CHARSET_NAME,
101     'csascii' => REGISTERED_CHARSET_NAME,
102     },
103 wakaba 1.9 perl_names => {
104     'web-latin1-us-ascii' => UNREGISTERED_CHARSET_NAME |
105     SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
106     'cp1252' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
107     },
108     fallback => {
109     "\x80" => "\x{20AC}",
110     "\x81" => undef,
111     "\x82" => "\x{201A}",
112     "\x83" => "\x{0192}",
113     "\x84" => "\x{201E}",
114     "\x85" => "\x{2026}",
115     "\x86" => "\x{2020}",
116     "\x87" => "\x{2021}",
117     "\x88" => "\x{02C6}",
118     "\x89" => "\x{2030}",
119     "\x8A" => "\x{0160}",
120     "\x8B" => "\x{2039}",
121     "\x8C" => "\x{0152}",
122     "\x8D" => undef,
123     "\x8E" => "\x{017D}",
124     "\x8F" => undef,
125     "\x90" => undef,
126     "\x91" => "\x{2018}",
127     "\x92" => "\x{2019}",
128     "\x93" => "\x{201C}",
129     "\x94" => "\x{201D}",
130     "\x95" => "\x{2022}",
131     "\x96" => "\x{2013}",
132     "\x97" => "\x{2014}",
133     "\x98" => "\x{02DC}",
134     "\x99" => "\x{2122}",
135     "\x9A" => "\x{0161}",
136     "\x9B" => "\x{203A}",
137     "\x9C" => "\x{0153}",
138     "\x9D" => undef,
139     "\x9E" => "\x{017E}",
140     "\x9F" => "\x{0178}",
141     "\xA0" => "\xA0", "\xA1" => "\xA1", "\xA2" => "\xA2", "\xA3" => "\xA3",
142     "\xA4" => "\xA4", "\xA5" => "\xA5", "\xA6" => "\xA6", "\xA7" => "\xA7",
143     "\xA8" => "\xA8", "\xA9" => "\xA9", "\xAA" => "\xAA", "\xAB" => "\xAB",
144     "\xAC" => "\xAC", "\xAD" => "\xAD", "\xAE" => "\xAE", "\xAF" => "\xAF",
145     "\xB0" => "\xB0", "\xB1" => "\xB1", "\xB2" => "\xB2", "\xB3" => "\xB3",
146     "\xB4" => "\xB4", "\xB5" => "\xB5", "\xB6" => "\xB6", "\xB7" => "\xB7",
147     "\xB8" => "\xB8", "\xB9" => "\xB9", "\xBA" => "\xBA", "\xBB" => "\xBB",
148     "\xBC" => "\xBC", "\xBD" => "\xBD", "\xBE" => "\xBE", "\xBF" => "\xBF",
149     "\xC0" => "\xC0", "\xC1" => "\xC1", "\xC2" => "\xC2", "\xC3" => "\xC3",
150     "\xC4" => "\xC4", "\xC5" => "\xC5", "\xC6" => "\xC6", "\xC7" => "\xC7",
151     "\xC8" => "\xC8", "\xC9" => "\xC9", "\xCA" => "\xCA", "\xCB" => "\xCB",
152     "\xCC" => "\xCC", "\xCD" => "\xCD", "\xCE" => "\xCE", "\xCF" => "\xCF",
153     "\xD0" => "\xD0", "\xD1" => "\xD1", "\xD2" => "\xD2", "\xD3" => "\xD3",
154     "\xD4" => "\xD4", "\xD5" => "\xD5", "\xD6" => "\xD6", "\xD7" => "\xD7",
155     "\xD8" => "\xD8", "\xD9" => "\xD9", "\xDA" => "\xDA", "\xDB" => "\xDB",
156     "\xDC" => "\xDC", "\xDD" => "\xDD", "\xDE" => "\xDE", "\xDF" => "\xDF",
157     "\xE0" => "\xE0", "\xE1" => "\xE1", "\xE2" => "\xE2", "\xE3" => "\xE3",
158     "\xE4" => "\xE4", "\xE5" => "\xE5", "\xE6" => "\xE6", "\xE7" => "\xE7",
159     "\xE8" => "\xE8", "\xE9" => "\xE9", "\xEA" => "\xEA", "\xEB" => "\xEB",
160     "\xEC" => "\xEC", "\xED" => "\xED", "\xEE" => "\xEE", "\xEF" => "\xEF",
161     "\xF0" => "\xF0", "\xF1" => "\xF1", "\xF2" => "\xF2", "\xF3" => "\xF3",
162     "\xF4" => "\xF4", "\xF5" => "\xF5", "\xF6" => "\xF6", "\xF7" => "\xF7",
163     "\xF8" => "\xF8", "\xF9" => "\xF9", "\xFA" => "\xFA", "\xFB" => "\xFB",
164     "\xFC" => "\xFC", "\xFD" => "\xFD", "\xFE" => "\xFE", "\xFF" => "\xFF",
165     },
166     ## NOTE: Treated as |windows-1252|. Properties of this charset
167     ## should be consistent with those of that charset.
168 wakaba 1.4 });
169 wakaba 1.1
170     $Charset->{'iso-8859-1'}
171     = $IANACharset->{'iso_8859-1:1987'}
172     = $IANACharset->{'iso-ir-100'}
173     = $IANACharset->{'iso_8859-1'}
174     = $IANACharset->{'iso-8859-1'}
175     = $IANACharset->{'latin1'}
176     = $IANACharset->{'l1'}
177     = $IANACharset->{'ibm819'}
178     = $IANACharset->{'cp819'}
179     = $IANACharset->{'csisolatin1'}
180 wakaba 1.9 = $HTMLCharset->{'iso885911987'}
181     = $HTMLCharset->{'isoir100'}
182     = $HTMLCharset->{'iso88591'}
183 wakaba 1.4 = __PACKAGE__->new ({
184 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
185 wakaba 1.1 iana_names => {
186 wakaba 1.4 'iso_8859-1:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
187 wakaba 1.1 'iso-ir-100' => REGISTERED_CHARSET_NAME,
188     'iso_8859-1' => REGISTERED_CHARSET_NAME,
189 wakaba 1.4 'iso-8859-1' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
190 wakaba 1.1 'latin1' => REGISTERED_CHARSET_NAME,
191     'l1' => REGISTERED_CHARSET_NAME,
192     'ibm819' => REGISTERED_CHARSET_NAME,
193     'cp819' => REGISTERED_CHARSET_NAME,
194     'csisolatin1' => REGISTERED_CHARSET_NAME,
195     },
196 wakaba 1.7 perl_names => {
197     'web-latin1' => UNREGISTERED_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
198     ERROR_REPORTING_ENCODING_IMPL,
199 wakaba 1.9 'cp1252' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
200 wakaba 1.7 },
201     fallback => {
202     "\x80" => "\x{20AC}",
203 wakaba 1.9 "\x81" => undef,
204 wakaba 1.7 "\x82" => "\x{201A}",
205     "\x83" => "\x{0192}",
206     "\x84" => "\x{201E}",
207     "\x85" => "\x{2026}",
208     "\x86" => "\x{2020}",
209     "\x87" => "\x{2021}",
210     "\x88" => "\x{02C6}",
211     "\x89" => "\x{2030}",
212     "\x8A" => "\x{0160}",
213     "\x8B" => "\x{2039}",
214     "\x8C" => "\x{0152}",
215 wakaba 1.9 "\x8D" => undef,
216 wakaba 1.7 "\x8E" => "\x{017D}",
217 wakaba 1.9 "\x8F" => undef,
218     "\x90" => undef,
219 wakaba 1.7 "\x91" => "\x{2018}",
220     "\x92" => "\x{2019}",
221     "\x93" => "\x{201C}",
222     "\x94" => "\x{201D}",
223     "\x95" => "\x{2022}",
224     "\x96" => "\x{2013}",
225     "\x97" => "\x{2014}",
226     "\x98" => "\x{02DC}",
227     "\x99" => "\x{2122}",
228     "\x9A" => "\x{0161}",
229     "\x9B" => "\x{203A}",
230     "\x9C" => "\x{0153}",
231 wakaba 1.9 "\x9D" => undef,
232 wakaba 1.7 "\x9E" => "\x{017E}",
233     "\x9F" => "\x{0178}",
234     },
235 wakaba 1.9 ## NOTE: Treated as |windows-1252|. Properties of this charset
236     ## should be consistent with those of that charset.
237 wakaba 1.4 });
238 wakaba 1.1
239 wakaba 1.2 $Charset->{'iso-8859-2'}
240     = $IANACharset->{'iso_8859-2:1987'}
241     = $IANACharset->{'iso-ir-101'}
242     = $IANACharset->{'iso_8859-2'}
243     = $IANACharset->{'iso-8859-2'}
244     = $IANACharset->{'latin2'}
245     = $IANACharset->{'l2'}
246     = $IANACharset->{'csisolatin2'}
247 wakaba 1.9 = $HTMLCharset->{'iso885921987'}
248     = $HTMLCharset->{'isoir101'}
249     = $HTMLCharset->{'iso88592'}
250 wakaba 1.4 = __PACKAGE__->new ({
251 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
252 wakaba 1.2 iana_names => {
253 wakaba 1.4 'iso_8859-2:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
254 wakaba 1.2 'iso-ir-101' => REGISTERED_CHARSET_NAME,
255     'iso_8859-2' => REGISTERED_CHARSET_NAME,
256 wakaba 1.4 'iso-8859-2' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
257 wakaba 1.2 'latin2' => REGISTERED_CHARSET_NAME,
258     'l2' => REGISTERED_CHARSET_NAME,
259     'csisolatin2' => REGISTERED_CHARSET_NAME,
260     },
261 wakaba 1.4 });
262 wakaba 1.2
263     $Charset->{'iso-8859-3'}
264     = $IANACharset->{'iso_8859-3:1988'}
265     = $IANACharset->{'iso-ir-109'}
266     = $IANACharset->{'iso_8859-3'}
267     = $IANACharset->{'iso-8859-3'}
268     = $IANACharset->{'latin3'}
269     = $IANACharset->{'l3'}
270     = $IANACharset->{'csisolatin3'}
271 wakaba 1.9 = $HTMLCharset->{'iso885931988'}
272     = $HTMLCharset->{'isoir109'}
273     = $HTMLCharset->{'iso88593'}
274 wakaba 1.4 = __PACKAGE__->new ({
275 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
276 wakaba 1.2 iana_names => {
277 wakaba 1.4 'iso_8859-3:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
278 wakaba 1.2 'iso-ir-109' => REGISTERED_CHARSET_NAME,
279     'iso_8859-3' => REGISTERED_CHARSET_NAME,
280 wakaba 1.4 'iso-8859-3' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
281 wakaba 1.2 'latin3' => REGISTERED_CHARSET_NAME,
282     'l3' => REGISTERED_CHARSET_NAME,
283     'csisolatin3' => REGISTERED_CHARSET_NAME,
284     },
285 wakaba 1.9 error_level => {
286     'unassigned-code-point-error' => 'iso_shall',
287     ## NOTE: I didn't check whether ISO/IEC 8859-3 prohibits the use of
288     ## unassigned code points, but ECMA-94:1986 (whose content is considered
289     ## equivalent to ISO 8859/1-4) disallows their use.
290     },
291 wakaba 1.4 });
292 wakaba 1.2
293     $Charset->{'iso-8859-4'}
294     = $IANACharset->{'iso_8859-4:1988'}
295     = $IANACharset->{'iso-ir-110'}
296     = $IANACharset->{'iso_8859-4'}
297     = $IANACharset->{'iso-8859-4'}
298     = $IANACharset->{'latin4'}
299     = $IANACharset->{'l4'}
300     = $IANACharset->{'csisolatin4'}
301 wakaba 1.9 = $HTMLCharset->{'iso885941988'}
302     = $HTMLCharset->{'isoir110'}
303     = $HTMLCharset->{'iso88594'}
304 wakaba 1.4 = __PACKAGE__->new ({
305 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
306 wakaba 1.2 iana_names => {
307 wakaba 1.4 'iso_8859-4:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
308 wakaba 1.2 'iso-ir-110' => REGISTERED_CHARSET_NAME,
309     'iso_8859-4' => REGISTERED_CHARSET_NAME,
310 wakaba 1.4 'iso-8859-4' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
311 wakaba 1.2 'latin4' => REGISTERED_CHARSET_NAME,
312     'l4' => REGISTERED_CHARSET_NAME,
313     'csisolatin4' => REGISTERED_CHARSET_NAME,
314     },
315 wakaba 1.9 error_level => {
316     'unassigned-code-point-error' => 'iso_shall',
317     ## NOTE: I didn't check whether ISO/IEC 8859-4 prohibits the use of
318     ## unassigned code points, but ECMA-94:1986 (whose content is considered
319     ## equivalent to ISO 8859/1-4) disallows their use.
320     },
321 wakaba 1.4 });
322 wakaba 1.2
323     $Charset->{'iso-8859-5'}
324     = $IANACharset->{'iso_8859-5:1988'}
325     = $IANACharset->{'iso-ir-144'}
326     = $IANACharset->{'iso_8859-5'}
327     = $IANACharset->{'iso-8859-5'}
328     = $IANACharset->{'cyrillic'}
329     = $IANACharset->{'csisolatincyrillic'}
330 wakaba 1.9 = $HTMLCharset->{'iso885951988'}
331     = $HTMLCharset->{'isoir144'}
332     = $HTMLCharset->{'iso88595'}
333 wakaba 1.4 = __PACKAGE__->new ({
334 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
335 wakaba 1.2 iana_names => {
336 wakaba 1.4 'iso_8859-5:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
337 wakaba 1.2 'iso-ir-144' => REGISTERED_CHARSET_NAME,
338     'iso_8859-5' => REGISTERED_CHARSET_NAME,
339 wakaba 1.4 'iso-8859-5' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
340 wakaba 1.2 'cyrillic' => REGISTERED_CHARSET_NAME,
341     'csisolatincyrillic' => REGISTERED_CHARSET_NAME,
342     },
343 wakaba 1.4 });
344 wakaba 1.2
345     $Charset->{'iso-8859-6'}
346     = $IANACharset->{'iso_8859-6:1987'}
347     = $IANACharset->{'iso-ir-127'}
348     = $IANACharset->{'iso_8859-6'}
349     = $IANACharset->{'iso-8859-6'}
350     = $IANACharset->{'ecma-114'}
351     = $IANACharset->{'asmo-708'}
352     = $IANACharset->{'arabic'}
353     = $IANACharset->{'csisolatinarabic'}
354 wakaba 1.9 = $HTMLCharset->{'iso885961987'}
355     = $HTMLCharset->{'isoir127'}
356     = $HTMLCharset->{'iso88596'}
357     = $HTMLCharset->{'ecma114'}
358     = $HTMLCharset->{'asmo708'}
359 wakaba 1.4 = __PACKAGE__->new ({
360 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
361     ## NOTE: 3/0..3/9 have different semantics from U+0030..0039,
362     ## but have same character names (maybe).
363     ## NOTE: According to RFC 2046, the left-hand half of charset "iso-8859-6"
364     ## is the same as "us-ascii".
365     ## TODO: RFC 1345 def?
366 wakaba 1.2 iana_names => {
367 wakaba 1.4 'iso_8859-6:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
368 wakaba 1.2 'iso-ir-127' => REGISTERED_CHARSET_NAME,
369     'iso_8859-6' => REGISTERED_CHARSET_NAME,
370 wakaba 1.4 'iso-8859-6' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
371 wakaba 1.2 'ecma-114' => REGISTERED_CHARSET_NAME,
372     'asmo-708' => REGISTERED_CHARSET_NAME,
373     'arabic' => REGISTERED_CHARSET_NAME,
374     'csisolatinarabic' => REGISTERED_CHARSET_NAME,
375     },
376 wakaba 1.9 ## TODO: |error_level|
377 wakaba 1.4 });
378 wakaba 1.2
379     $Charset->{'iso-8859-7'}
380     = $IANACharset->{'iso_8859-7:1987'}
381     = $IANACharset->{'iso-ir-126'}
382     = $IANACharset->{'iso_8859-7'}
383     = $IANACharset->{'iso-8859-7'}
384     = $IANACharset->{'elot_928'}
385     = $IANACharset->{'ecma-118'}
386     = $IANACharset->{'greek'}
387     = $IANACharset->{'greek8'}
388     = $IANACharset->{'csisolatingreek'}
389 wakaba 1.9 = $HTMLCharset->{'iso885971987'}
390     = $HTMLCharset->{'isoir126'}
391     = $HTMLCharset->{'iso88597'}
392     = $HTMLCharset->{'elot928'}
393     = $HTMLCharset->{'ecma118'}
394 wakaba 1.4 = __PACKAGE__->new ({
395 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
396 wakaba 1.2 iana_names => {
397 wakaba 1.4 'iso_8859-7:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
398 wakaba 1.2 'iso-ir-126' => REGISTERED_CHARSET_NAME,
399     'iso_8859-7' => REGISTERED_CHARSET_NAME,
400 wakaba 1.4 'iso-8859-7' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
401 wakaba 1.2 'elot_928' => REGISTERED_CHARSET_NAME,
402     'ecma-118' => REGISTERED_CHARSET_NAME,
403     'greek' => REGISTERED_CHARSET_NAME,
404     'greek8' => REGISTERED_CHARSET_NAME,
405     'csisolatingreek' => REGISTERED_CHARSET_NAME,
406     },
407 wakaba 1.9 ## TODO: |error_level|
408 wakaba 1.4 });
409 wakaba 1.2
410     $Charset->{'iso-8859-8'}
411     = $IANACharset->{'iso_8859-8:1988'}
412     = $IANACharset->{'iso-ir-138'}
413     = $IANACharset->{'iso_8859-8'}
414     = $IANACharset->{'iso-8859-8'}
415     = $IANACharset->{'hebrew'}
416     = $IANACharset->{'csisolatinhebrew'}
417 wakaba 1.9 = $HTMLCharset->{'iso885981988'}
418     = $HTMLCharset->{'isoir138'}
419     = $HTMLCharset->{'iso88598'}
420 wakaba 1.4 = __PACKAGE__->new ({
421 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
422 wakaba 1.2 iana_names => {
423 wakaba 1.4 'iso_8859-8:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
424 wakaba 1.2 'iso-ir-138' => REGISTERED_CHARSET_NAME,
425     'iso_8859-8' => REGISTERED_CHARSET_NAME,
426 wakaba 1.4 'iso-8859-8' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
427 wakaba 1.2 'hebrew' => REGISTERED_CHARSET_NAME,
428     'csisolatinhebrew' => REGISTERED_CHARSET_NAME,
429     },
430 wakaba 1.9 ## TODO: |error_level|
431 wakaba 1.4 });
432 wakaba 1.2
433     $Charset->{'iso-8859-9'}
434     = $IANACharset->{'iso_8859-9:1989'}
435     = $IANACharset->{'iso-ir-148'}
436     = $IANACharset->{'iso_8859-9'}
437     = $IANACharset->{'iso-8859-9'}
438     = $IANACharset->{'latin5'}
439     = $IANACharset->{'l5'}
440     = $IANACharset->{'csisolatin5'}
441 wakaba 1.9 = $HTMLCharset->{'iso885991989'}
442     = $HTMLCharset->{'isoir148'}
443     = $HTMLCharset->{'iso88599'}
444 wakaba 1.4 = __PACKAGE__->new ({
445 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
446 wakaba 1.2 iana_names => {
447 wakaba 1.4 'iso_8859-9:1989' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
448 wakaba 1.2 'iso-ir-148' => REGISTERED_CHARSET_NAME,
449     'iso_8859-9' => REGISTERED_CHARSET_NAME,
450 wakaba 1.4 'iso-8859-9' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
451 wakaba 1.2 'latin5' => REGISTERED_CHARSET_NAME,
452     'l5' => REGISTERED_CHARSET_NAME,
453     'csisolatin5' => REGISTERED_CHARSET_NAME,
454     },
455 wakaba 1.9 perl_names => {
456     'web-latin5' => UNREGISTERED_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
457     ERROR_REPORTING_ENCODING_IMPL,
458     'cp1254' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
459     },
460     fallback => {
461     "\x80" => "\x{20AC}",
462     "\x81" => undef,
463     "\x82" => "\x{201A}",
464     "\x83" => "\x{0192}",
465     "\x84" => "\x{201E}",
466     "\x85" => "\x{2026}",
467     "\x86" => "\x{2020}",
468     "\x87" => "\x{2021}",
469     "\x88" => "\x{02C6}",
470     "\x89" => "\x{2030}",
471     "\x8A" => "\x{0160}",
472     "\x8B" => "\x{2039}",
473     "\x8C" => "\x{0152}",
474     "\x8D" => undef,
475     "\x8E" => undef,
476     "\x8F" => undef,
477     "\x90" => undef,
478     "\x91" => "\x{2018}",
479     "\x92" => "\x{2019}",
480     "\x93" => "\x{201C}",
481     "\x94" => "\x{201D}",
482     "\x95" => "\x{2022}",
483     "\x96" => "\x{2013}",
484     "\x97" => "\x{2014}",
485     "\x98" => "\x{02DC}",
486     "\x99" => "\x{2122}",
487     "\x9A" => "\x{0161}",
488     "\x9B" => "\x{203A}",
489     "\x9C" => "\x{0153}",
490     "\x9D" => undef,
491     "\x9E" => undef,
492     "\x9F" => "\x{0178}",
493     },
494     ## NOTE: Treated as |windows-1254|. Properties of this charset
495     ## should be consistent with those of that charset.
496 wakaba 1.4 });
497 wakaba 1.2
498     $Charset->{'iso-8859-10'}
499     = $IANACharset->{'iso-8859-10'}
500     = $IANACharset->{'iso-ir-157'}
501     = $IANACharset->{'l6'}
502     = $IANACharset->{'iso_8859-10:1992'}
503     = $IANACharset->{'csisolatin6'}
504     = $IANACharset->{'latin6'}
505 wakaba 1.9 = $HTMLCharset->{'iso885910'}
506     = $HTMLCharset->{'isoir157'}
507     = $HTMLCharset->{'iso8859101992'}
508 wakaba 1.4 = __PACKAGE__->new ({
509 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
510 wakaba 1.2 iana_names => {
511 wakaba 1.4 'iso-8859-10' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
512 wakaba 1.2 'iso-ir-157' => REGISTERED_CHARSET_NAME,
513     'l6' => REGISTERED_CHARSET_NAME,
514     'iso_8859-10:1992' => REGISTERED_CHARSET_NAME,
515     'csisolatin6' => REGISTERED_CHARSET_NAME,
516     'latin6' => REGISTERED_CHARSET_NAME,
517     },
518 wakaba 1.9 ## TODO: |error_level|
519 wakaba 1.4 });
520 wakaba 1.2
521     $Charset->{'iso_6937-2-add'}
522     = $IANACharset->{'iso_6937-2-add'}
523     = $IANACharset->{'iso-ir-142'}
524     = $IANACharset->{'csisotextcomm'}
525 wakaba 1.9 = $HTMLCharset->{'iso69372add'}
526     = $HTMLCharset->{'isoir142'}
527 wakaba 1.4 = __PACKAGE__->new ({
528 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
529 wakaba 1.2 iana_names => {
530 wakaba 1.4 'iso_6937-2-add' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
531 wakaba 1.2 'iso-ir-142' => REGISTERED_CHARSET_NAME,
532     'csisotextcomm' => REGISTERED_CHARSET_NAME,
533     },
534 wakaba 1.9 ## TODO: |error_level|
535 wakaba 1.4 });
536 wakaba 1.2
537     $Charset->{'jis_x0201'}
538     = $IANACharset->{'jis_x0201'}
539     = $IANACharset->{'x0201'}
540     = $IANACharset->{'cshalfwidthkatakana'}
541 wakaba 1.9 = $HTMLCharset->{'jisx0201'}
542 wakaba 1.4 = __PACKAGE__->new ({
543 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
544 wakaba 1.2 iana_names => {
545 wakaba 1.4 'jis_x0201' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
546 wakaba 1.2 'x0201' => REGISTERED_CHARSET_NAME,
547     'cshalfwidthkatakana' => REGISTERED_CHARSET_NAME,
548     },
549 wakaba 1.9 ## TODO: |error_level|
550 wakaba 1.4 });
551 wakaba 1.2
552     $Charset->{'jis_encoding'}
553     = $IANACharset->{'jis_encoding'}
554     = $IANACharset->{'csjisencoding'}
555 wakaba 1.9 = $HTMLCharset->{'jisencoding'}
556 wakaba 1.4 = __PACKAGE__->new ({
557 wakaba 1.5 category => 0,
558 wakaba 1.2 iana_names => {
559 wakaba 1.4 'jis_encoding' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
560 wakaba 1.2 'csjisencoding' => REGISTERED_CHARSET_NAME,
561     },
562     ## NOTE: What is this?
563 wakaba 1.4 });
564 wakaba 1.1
565     $Charset->{'shift_jis'}
566     = $IANACharset->{'shift_jis'}
567     = $IANACharset->{'ms_kanji'}
568     = $IANACharset->{'csshiftjis'}
569 wakaba 1.9 = $HTMLCharset->{'shiftjis'}
570     = $HTMLCharset->{'mskanji'}
571 wakaba 1.4 = __PACKAGE__->new ({
572 wakaba 1.9 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
573     CHARSET_CATEGORY_MIME_TEXT,
574 wakaba 1.1 iana_names => {
575 wakaba 1.4 'shift_jis' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
576 wakaba 1.1 'ms_kanji' => REGISTERED_CHARSET_NAME,
577     'csshiftjis' => REGISTERED_CHARSET_NAME,
578     },
579 wakaba 1.5 perl_names => {
580 wakaba 1.6 'shift-jis-1997' => UNREGISTERED_CHARSET_NAME |
581 wakaba 1.7 SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
582     shiftjis => PRIMARY_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
583 wakaba 1.6 ERROR_REPORTING_ENCODING_IMPL,
584 wakaba 1.5 ## NOTE: Unicode mapping is wrong.
585     },
586 wakaba 1.9 ## TODO: |error_level|
587 wakaba 1.4 });
588 wakaba 1.1
589 wakaba 1.3 $Charset->{'x-sjis'}
590     = $IANACharset->{'x-sjis'}
591 wakaba 1.9 = $HTMLCharset->{'xsjis'}
592 wakaba 1.4 = __PACKAGE__->new ({
593 wakaba 1.9 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
594     CHARSET_CATEGORY_ASCII_COMPAT,
595 wakaba 1.3 iana_names => {
596     'x-sjis' => UNREGISTERED_CHARSET_NAME,
597     },
598 wakaba 1.6 perl_names => {
599 wakaba 1.7 'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
600 wakaba 1.6 },
601 wakaba 1.9 ## TODO: |error_level|
602 wakaba 1.4 });
603 wakaba 1.3
604 wakaba 1.5 $Charset->{shift_jisx0213}
605     = $IANACharset->{shift_jisx0213}
606 wakaba 1.9 = $HTMLCharset->{shiftjisx0213}
607 wakaba 1.5 = __PACKAGE__->new ({
608 wakaba 1.9 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
609     CHARSET_CATEGORY_MIME_TEXT,
610 wakaba 1.5 iana_names => {
611     shift_jisx0213 => UNREGISTERED_CHARSET_NAME,
612     },
613     perl_names => {
614     #shift_jisx0213 (non-standard - i don't know its conformance)
615 wakaba 1.7 'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
616     'shiftjis' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
617 wakaba 1.5 },
618 wakaba 1.9 ## TODO: |error_level|
619 wakaba 1.5 });
620    
621 wakaba 1.1 $Charset->{'euc-jp'}
622     = $IANACharset->{'extended_unix_code_packed_format_for_japanese'}
623     = $IANACharset->{'cseucpkdfmtjapanese'}
624     = $IANACharset->{'euc-jp'}
625 wakaba 1.9 = $HTMLCharset->{'extendedunixcodepackedformatforjapanese'}
626     = $HTMLCharset->{'cseucpkdfmtjapanese'}
627     = $HTMLCharset->{'eucjp'}
628 wakaba 1.4 = __PACKAGE__->new ({
629 wakaba 1.9 category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE |
630     CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_ASCII_COMPAT,
631 wakaba 1.1 iana_names => {
632 wakaba 1.4 'extended_unix_code_packed_format_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
633 wakaba 1.1 'cseucpkdfmtjapanese' => REGISTERED_CHARSET_NAME,
634 wakaba 1.4 'euc-jp' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
635 wakaba 1.1 },
636 wakaba 1.5 perl_names => {
637 wakaba 1.7 'euc-jp-1997' => UNREGISTERED_CHARSET_NAME |
638     SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
639 wakaba 1.5 ## NOTE: Though the IANA definition references the 1990 version
640     ## of EUC-JP, the 1997 version of the JIS standard claims that it defines
641     ## the same coded character set as the 1990 version, so we
642     ## consider the EUC-JP 1990 version to be the same as the 1997 version.
643 wakaba 1.6 'euc-jp' => PREFERRED_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
644     ERROR_REPORTING_ENCODING_IMPL,
645 wakaba 1.5 ## NOTE: Unicode mapping is wrong.
646     },
647 wakaba 1.9 ## TODO: |error_level|
648 wakaba 1.4 });
649 wakaba 1.3
650     $Charset->{'x-euc-jp'}
651     = $IANACharset->{'x-euc-jp'}
652 wakaba 1.9 = $HTMLCharset->{'xeucjp'}
653 wakaba 1.4 = __PACKAGE__->new ({
654 wakaba 1.9 category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE |
655     CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_ASCII_COMPAT,
656 wakaba 1.3 iana_names => {
657     'x-euc-jp' => UNREGISTERED_CHARSET_NAME,
658     },
659 wakaba 1.6 perl_names => {
660     'euc-jp-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
661     'euc-jp' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
662     },
663 wakaba 1.4 });
664 wakaba 1.1
665 wakaba 1.2 $Charset->{'extended_unix_code_fixed_width_for_japanese'}
666     = $IANACharset->{'extended_unix_code_fixed_width_for_japanese'}
667     = $IANACharset->{'cseucfixwidjapanese'}
668 wakaba 1.9 = $HTMLCharset->{'extendedunixcodefixedwidthforjapanese'}
669 wakaba 1.4 = __PACKAGE__->new ({
670 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
671 wakaba 1.2 iana_names => {
672 wakaba 1.4 'extended_unix_code_fixed_width_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
673 wakaba 1.2 'cseucfixwidjapanese' => REGISTERED_CHARSET_NAME,
674     },
675 wakaba 1.9 ## TODO: |error_level|
676 wakaba 1.4 });
677 wakaba 1.2
678 wakaba 1.1 ## TODO: ...
679    
680 wakaba 1.2 $Charset->{'euc-kr'}
681     = $IANACharset->{'euc-kr'}
682     = $IANACharset->{'cseuckr'}
683 wakaba 1.9 = $HTMLCharset->{'euckr'}
684 wakaba 1.4 = __PACKAGE__->new ({
685 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
686 wakaba 1.2 iana_names => {
687 wakaba 1.4 'euc-kr' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
688 wakaba 1.2 'cseuckr' => REGISTERED_CHARSET_NAME,
689     },
690 wakaba 1.9 perl_names => {
691     ## TODO: We need a parse error generating wrapper for the decoder.
692     'cp949' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
693     },
694     ## NOTE: |euc-kr| is handled as |windows-949|, so its properties
695     ## should be consistent with that encoding's properties.
696 wakaba 1.4 });
697 wakaba 1.2
698 wakaba 1.1 $Charset->{'iso-2022-jp'}
699     = $IANACharset->{'iso-2022-jp'}
700     = $IANACharset->{'csiso2022jp'}
701 wakaba 1.3 = $IANACharset->{'iso2022jp'}
702     = $IANACharset->{'junet-code'}
703 wakaba 1.9 = $HTMLCharset->{'iso2022jp'}
704     = $HTMLCharset->{'junetcode'}
705 wakaba 1.4 = __PACKAGE__->new ({
706 wakaba 1.9 category => CHARSET_CATEGORY_MIME_TEXT,
707 wakaba 1.1 iana_names => {
708 wakaba 1.4 'iso-2022-jp' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
709 wakaba 1.1 'csiso2022jp' => REGISTERED_CHARSET_NAME,
710 wakaba 1.3 'iso2022jp' => UNREGISTERED_CHARSET_NAME,
711     'junet-code' => UNREGISTERED_CHARSET_NAME,
712 wakaba 1.1 },
713 wakaba 1.9 ## TODO: |error_level|
714 wakaba 1.4 });
715 wakaba 1.2
716     $Charset->{'iso-2022-jp-2'}
717     = $IANACharset->{'iso-2022-jp-2'}
718     = $IANACharset->{'csiso2022jp2'}
719 wakaba 1.9 = $HTMLCharset->{'iso2022jp2'}
720 wakaba 1.4 = __PACKAGE__->new ({
721 wakaba 1.9 category => CHARSET_CATEGORY_MIME_TEXT,
722 wakaba 1.2 iana_names => {
723 wakaba 1.4 'iso-2022-jp-2' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
724 wakaba 1.2 'csiso2022jp2' => REGISTERED_CHARSET_NAME,
725     },
726 wakaba 1.9 ## TODO: |error_level|
727     });
728    
729     ## TODO: ...
730    
## GB_2312-80 (ISO-IR 58): registered under every IANA alias listed in
## |iana_names| below, plus the punctuation-stripped HTML lookup keys.
$IANACharset->{'gb_2312-80'}
= $IANACharset->{'iso-ir-58'}
= $IANACharset->{chinese}
= $IANACharset->{csiso58gb231280}
= $HTMLCharset->{gb231280}
= $HTMLCharset->{isoir58}
= __PACKAGE__->new ({
  ## NOTE: What is represented by this charset is unclear... I don't
  ## understand what RFC 1945 describes...
  ## NOTE(review): RFC 1945 is HTTP/1.0; the IANA registry entry for
  ## |GB_2312-80| cites RFC 1345, so that is presumably the document
  ## meant above -- confirm.
  category => 0,
  iana_names => {
    'gb_2312-80' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'iso-ir-58' => REGISTERED_CHARSET_NAME,
    'chinese' => REGISTERED_CHARSET_NAME,
    'csiso58gb231280' => REGISTERED_CHARSET_NAME,
  },
  perl_names => {
    ## TODO: GB2312->GBK Parse Error wrapper
    'cp936' => FALLBACK_ENCODING_IMPL,
  },
  ## NOTE: |gb2312| is handled as |gbk|, such that properties should be
  ## consistent.
});
753 wakaba 1.1
754     ## TODO: ...
755    
756     $Charset->{'utf-8'}
757     = $IANACharset->{'utf-8'}
758 wakaba 1.3 = $IANACharset->{'x-utf-8'}
759 wakaba 1.9 = $HTMLCharset->{'utf8'}
760     = $HTMLCharset->{'xutf8'}
761 wakaba 1.4 = __PACKAGE__->new ({
762 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT |
763     CHARSET_CATEGORY_MIME_TEXT,
764 wakaba 1.1 iana_names => {
765 wakaba 1.4 'utf-8' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
766 wakaba 1.6 ## NOTE: IANA name "utf-8" references RFC 3629. According to the RFC,
767     ## the definitive definition is one specified in the Unicode Standard.
768 wakaba 1.3 'x-utf-8' => UNREGISTERED_CHARSET_NAME,
769 wakaba 1.9 ## NOTE: We treat |x-utf-8| as an alias of |utf-8|, since unlike
770     ## other charsets such as |x-sjis| or |x-euc-jp|, there is no major
771     ## variant of the UTF-8 encoding.
772     ## TODO: We might have to reconsider this policy, since
773     ## there are in fact UTF-8 variants, such as
774     ## Unicode's UTF-8, ISO/IEC 10646's UTF-8, UTF-8n, and so
775     ## on.
776 wakaba 1.1 },
777 wakaba 1.6 perl_names => {
778     'utf-8-strict' => PRIMARY_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
779     ERROR_REPORTING_ENCODING_IMPL,
780     ## NOTE: It does not support non-Unicode UCS characters (conforming).
781     ## It does detect illegal sequences (conforming).
782     ## It does not support surrogate pairs (conforming).
783     ## It does not support BOMs (non-conforming).
784     },
785 wakaba 1.9 ## TODO: |error_level|
786 wakaba 1.6 bom_pattern => qr/\xEF\xBB\xBF/,
787 wakaba 1.4 });
788 wakaba 1.3
## UTF-8N (UTF-8 without BOM).  The HTML lookup key is the
## punctuation-stripped form |utf8n|, like every other |$HTMLCharset| key.
$Charset->{'utf-8n'}
= $IANACharset->{'utf-8n'}
= $HTMLCharset->{'utf8n'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT |
      CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'utf-8n' => UNREGISTERED_CHARSET_NAME,
    ## NOTE: Is there any normative definition for the charset?
    ## What variant of UTF-8 should we use for the charset?
  },
  perl_names => {
    'utf-8-strict' => PRIMARY_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
  },
  ## TODO: |error_level|
});
805 wakaba 1.1
806     ## TODO: ...
807    
808 wakaba 1.2 $Charset->{'gbk'}
809     = $IANACharset->{'gbk'}
810     = $IANACharset->{'cp936'}
811     = $IANACharset->{'ms936'}
812     = $IANACharset->{'windows-936'}
813 wakaba 1.9 = $HTMLCharset->{'windows936'}
814 wakaba 1.4 = __PACKAGE__->new ({
815 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
816 wakaba 1.2 iana_names => {
817 wakaba 1.4 'gbk' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
818 wakaba 1.2 'cp936' => REGISTERED_CHARSET_NAME,
819     'ms936' => REGISTERED_CHARSET_NAME,
820     'windows-936' => REGISTERED_CHARSET_NAME,
821     },
822 wakaba 1.9 ## TODO: |error_level|
823 wakaba 1.2 iana_status => STATUS_COMMON | STATUS_OBSOLETE,
824 wakaba 1.4 });
825 wakaba 1.2
826     $Charset->{'gb18030'}
827     = $IANACharset->{'gb18030'}
828 wakaba 1.4 = __PACKAGE__->new ({
829 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
830 wakaba 1.2 iana_names => {
831 wakaba 1.4 'gb18030' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
832 wakaba 1.2 },
833     iana_status => STATUS_COMMON,
834     mime_text_suitable => 1,
835 wakaba 1.4 });
836 wakaba 1.2
837     ## TODO: ...
838    
839 wakaba 1.1 $Charset->{'utf-16be'}
840     = $IANACharset->{'utf-16be'}
841 wakaba 1.9 = $HTMLCharset->{'utf16be'}
842 wakaba 1.4 = __PACKAGE__->new ({
843 wakaba 1.8 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
844 wakaba 1.1 iana_names => {
845 wakaba 1.4 'utf-16be' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
846 wakaba 1.1 },
847 wakaba 1.9 ## TODO: |error_level|
848 wakaba 1.4 });
849 wakaba 1.1
850     $Charset->{'utf-16le'}
851     = $IANACharset->{'utf-16le'}
852 wakaba 1.9 = $HTMLCharset->{'utf16le'}
853 wakaba 1.4 = __PACKAGE__->new ({
854 wakaba 1.8 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
855 wakaba 1.1 iana_names => {
856 wakaba 1.4 'utf-16le' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
857 wakaba 1.1 },
858 wakaba 1.9 ## TODO: |error_level|
859 wakaba 1.4 });
860 wakaba 1.1
861     $Charset->{'utf-16'}
862     = $IANACharset->{'utf-16'}
863 wakaba 1.9 = $HTMLCharset->{'utf16'}
864 wakaba 1.4 = __PACKAGE__->new ({
865 wakaba 1.8 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
866 wakaba 1.1 iana_names => {
867 wakaba 1.4 'utf-16' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
868 wakaba 1.1 },
869 wakaba 1.9 ## TODO: |error_level|
870 wakaba 1.4 });
871 wakaba 1.1
872     ## TODO: ...
873    
874 wakaba 1.2 $Charset->{'windows-31j'}
875     = $IANACharset->{'windows-31j'}
876     = $IANACharset->{'cswindows31j'}
877 wakaba 1.9 = $HTMLCharset->{'windows31j'}
878 wakaba 1.4 = __PACKAGE__->new ({
879 wakaba 1.9 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
880     CHARSET_CATEGORY_MIME_TEXT,
881 wakaba 1.2 iana_names => {
882 wakaba 1.4 'windows-31j' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
883 wakaba 1.2 'cswindows31j' => REGISTERED_CHARSET_NAME,
884     },
885     iana_status => STATUS_LIMITED_USE, # maybe
886 wakaba 1.9 ## TODO: |error_level|
887 wakaba 1.4 });
888 wakaba 1.2
889     $Charset->{'gb2312'}
890     = $IANACharset->{'gb2312'}
891     = $IANACharset->{'csgb2312'}
892 wakaba 1.4 = __PACKAGE__->new ({
893 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT |
894     CHARSET_CATEGORY_ASCII_COMPAT,
895 wakaba 1.2 iana_names => {
896 wakaba 1.4 'gb2312' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
897 wakaba 1.2 'csgb2312' => REGISTERED_CHARSET_NAME,
898     },
899 wakaba 1.9 perl_names => {
900     ## TODO: GB2312->GBK Parse Error wrapper
901     'cp936' => FALLBACK_ENCODING_IMPL,
902     },
903     ## NOTE: |gb2312| is handled as |gbk|, such that properties should be
904     ## consistent.
905 wakaba 1.4 });
906 wakaba 1.2
907     $Charset->{'big5'}
908     = $IANACharset->{'big5'}
909     = $IANACharset->{'csbig5'}
910 wakaba 1.9 = $IANACharset->{'x-x-big5'}
911     = $HTMLCharset->{xxbig5}
912 wakaba 1.4 = __PACKAGE__->new ({
913 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
914 wakaba 1.2 iana_names => {
915 wakaba 1.4 'big5' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
916 wakaba 1.2 'csbig5' => REGISTERED_CHARSET_NAME,
917 wakaba 1.9 'x-x-big5' => UNREGISTERED_CHARSET_NAME,
918     ## NOTE: In HTML5, |x-x-big5| is defined as an alias of |big5|.
919     ## According to that spec, if there is any difference between
920     ## input and replacement encodings, the result is parse error.
921     ## However, since there is no formal definition for |x-x-big5|
922     ## charset, we cannot raise such errors.
923 wakaba 1.2 },
924 wakaba 1.9 ## TODO: |error_level|
925 wakaba 1.4 });
926 wakaba 1.2
927     ## TODO: ...
928    
929     $Charset->{'big5-hkscs'}
930     = $IANACharset->{'big5-hkscs'}
931 wakaba 1.9 = $HTMLCharset->{'big5hkscs'}
932 wakaba 1.4 = __PACKAGE__->new ({
933 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
934 wakaba 1.2 iana_names => {
935 wakaba 1.4 'big5-hkscs' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
936 wakaba 1.2 },
937 wakaba 1.9 ## TODO: |error_level|
938 wakaba 1.4 });
939 wakaba 1.2
940     ## TODO: ...
941    
942 wakaba 1.1 $Charset->{'windows-1252'}
943     = $IANACharset->{'windows-1252'}
944 wakaba 1.9 = $HTMLCharset->{'windows1252'}
945 wakaba 1.4 = __PACKAGE__->new ({
946 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT |
947     CHARSET_CATEGORY_MIME_TEXT,
948 wakaba 1.1 iana_names => {
949 wakaba 1.4 'windows-1252' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
950 wakaba 1.1 },
951 wakaba 1.9 ## TODO: Check whether use of 0x81 is conforming or not...
952     });
953    
954     $Charset->{'windows-1253'}
955     = $IANACharset->{'windows-1253'}
956     = $HTMLCharset->{'windows1253'}
957     = __PACKAGE__->new ({
958     category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT |
959     CHARSET_CATEGORY_MIME_TEXT,
960     iana_names => {
961     'windows-1253' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
962     },
963     ## TODO: Check whether use of 0x81 is conforming or not...
964     });
965    
966     $Charset->{'windows-1254'}
967     = $IANACharset->{'windows-1254'}
968     = $HTMLCharset->{'windows1254'}
969     = __PACKAGE__->new ({
970     category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT |
971     CHARSET_CATEGORY_MIME_TEXT,
972     iana_names => {
973     'windows-1254' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
974     },
975     ## TODO: Check whether use of 0x81 is conforming or not...
976 wakaba 1.4 });
977 wakaba 1.1
978     ## TODO: ...
979    
980 wakaba 1.7 $Charset->{'tis-620'}
981     = $IANACharset->{'tis-620'}
982 wakaba 1.9 = $HTMLCharset->{'tis620'}
983 wakaba 1.7 = __PACKAGE__->new ({
984 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
985 wakaba 1.7 iana_names => {
986     'tis-620' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
987     },
988     perl_names => {
989 wakaba 1.9 'web-tis-620' => UNREGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
990     'windows-874' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
991 wakaba 1.7 },
992 wakaba 1.9 fallback => {
993     "\x80" => "\x{20AC}",
994     "\x81" => undef, "\x82" => undef, "\x83" => undef, "\x84" => undef,
995     "\x85" => "\x{2026}",
996     "\x86" => undef, "\x87" => undef, "\x88" => undef, "\x89" => undef,
997     "\x8A" => undef, "\x8B" => undef, "\x8C" => undef, "\x8D" => undef,
998     "\x8E" => undef, "\x8F" => undef, "\x90" => undef,
999     "\x91" => "\x{2018}",
1000     "\x92" => "\x{2019}",
1001     "\x93" => "\x{201C}",
1002     "\x94" => "\x{201D}",
1003     "\x95" => "\x{2022}",
1004     "\x96" => "\x{2013}",
1005     "\x97" => "\x{2014}",
1006     "\x98" => undef, "\x99" => undef, "\x9A" => undef, "\x9B" => undef,
1007     "\x9C" => undef, "\x9D" => undef, "\x9E" => undef, "\x9F" => undef,
1008     "\xA0" => "\xA0",
1009     },
1010     ## NOTE: |tis-620| is treated as |windows-874|, so ensure that
1011     ## they are consistent.
1012 wakaba 1.7 });
1013    
1014     $Charset->{'iso-8859-11'}
1015     = $IANACharset->{'iso-8859-11'}
1016 wakaba 1.9 = $HTMLCharset->{'iso885911'}
1017 wakaba 1.7 = __PACKAGE__->new ({
1018 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
1019 wakaba 1.7 iana_names => {
1020     'iso-8859-11' => UNREGISTERED_CHARSET_NAME,
1021     ## NOTE: The Web Thai encoding, i.e. windows-874.
1022     },
1023     perl_names => {
1024 wakaba 1.9 'web-thai' => UNREGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
1025 wakaba 1.7 'windows-874' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
1026     },
1027     fallback => {
1028     "\x80" => "\x{20AC}",
1029 wakaba 1.9 "\x81" => undef, "\x82" => undef, "\x83" => undef, "\x84" => undef,
1030 wakaba 1.7 "\x85" => "\x{2026}",
1031 wakaba 1.9 "\x86" => undef, "\x87" => undef, "\x88" => undef, "\x89" => undef,
1032     "\x8A" => undef, "\x8B" => undef, "\x8C" => undef, "\x8D" => undef,
1033     "\x8E" => undef, "\x8F" => undef, "\x90" => undef,
1034 wakaba 1.7 "\x91" => "\x{2018}",
1035     "\x92" => "\x{2019}",
1036     "\x93" => "\x{201C}",
1037     "\x94" => "\x{201D}",
1038     "\x95" => "\x{2022}",
1039     "\x96" => "\x{2013}",
1040     "\x97" => "\x{2014}",
1041 wakaba 1.9 "\x98" => undef, "\x99" => undef, "\x9A" => undef, "\x9B" => undef,
1042     "\x9C" => undef, "\x9D" => undef, "\x9E" => undef, "\x9F" => undef,
1043 wakaba 1.7 },
1044 wakaba 1.9 ## NOTE: |iso-8859-11| is treated as |windows-874|, so ensure that
1045     ## they are consistent.
1046 wakaba 1.7 });
1047    
1048     $Charset->{'windows-874'}
1049     = $IANACharset->{'windows-874'}
1050 wakaba 1.9 = $HTMLCharset->{'windows874'}
1051 wakaba 1.7 = __PACKAGE__->new ({
1052 wakaba 1.9 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
1053 wakaba 1.7 iana_names => {
1054     'windows-874' => UNREGISTERED_CHARSET_NAME,
1055     },
1056     perl_names => {
1057     'windows-874' => REGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
1058     },
1059 wakaba 1.9 ## TODO: |error_level|
1060     });
1061    
1062     $IANACharset->{'windows-949'}
1063     = $HTMLCharset->{windows949}
1064     = __PACKAGE__->new ({
1065     category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
1066     iana_names => {
1067     'windows-949' => UNREGISTERED_CHARSET_NAME,
1068     },
1069     perl_names => {
1070     'cp949' => PREFERRED_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
1071     ERROR_REPORTING_ENCODING_IMPL,
1072     ## TODO: Is this implementation conforming?
1073     },
1074     ## NOTE: |error_level| is same as default, since we can't find any formal
1075     ## definition for this charset.
1076 wakaba 1.7 });
1077    
## Constructor.  Blesses the given hash reference itself (no copy is made)
## into the invoking class and returns it.
sub new ($$) {
  my ($class, $properties) = @_;
  return bless $properties, $class;
} # new
1081    
## NOTE: A class method.
##
## Returns the charset object registered in |$IANACharset| for the given
## name, compared ASCII case-insensitively.  If no object is registered,
## a new one carrying the lowercased name as an unregistered name is
## created, stored in |$IANACharset|, and returned.
sub get_by_iana_name ($$) {
  my (undef, $charset_name) = @_;
  $charset_name =~ tr/A-Z/a-z/; ## ASCII case-insensitive
  return $IANACharset->{$charset_name} ||= __PACKAGE__->new ({
    iana_names => {
      $charset_name => UNREGISTERED_CHARSET_NAME,
    },
  });
} # get_by_iana_name
1095    
## Creates a decode handle object that reads from |$byte_stream| using
## this charset.
##
## Options:
##   byte_buffer - Reference to a scalar; its current value is copied as
##                 the initial byte buffer of the handle.
##   onerror     - Error callback (defaults to a no-op).
##   level       - Error level table (a default table is used otherwise).
##   Any other options are forwarded to |get_perl_encoding|, with
##   |allow_semiconforming| always enabled.
##
## Returns a two-element list: the handle object and the status flags of
## the implementation chosen, or |(undef, 0)| if no implementation is
## available.
sub get_decode_handle ($$;%) {
  my $self = shift;
  my $byte_stream = shift;
  my %opt = @_;

  my $obj = {
    character_queue => [],
    filehandle => $byte_stream,
    charset => '', ## TODO: We set a charset name for input_encoding (when we get identify-by-URI nonsense away)
    byte_buffer => $opt{byte_buffer} ? ${$opt{byte_buffer}} : '', ## TODO: ref, instead of value, should be used
    onerror => $opt{onerror} || sub {},
    level => $opt{level} || {
      must => 'm',
      charset_variant => 'm',
      charset_fact => 'm',
      iso_shall => 'm',
    },
    error_level => $self->{error_level} || {
      ## HTML5 charset name aliases
      ## NOTE: Use of code points in the variant whose definition differs
      ## from the original charset is a parse error in HTML5.  However,
      ## it does not affect the document conformance; the HTML5 spec
      ## does not define the conformance of the input stream against the
      ## charset in use.
      'fallback-char-error' => 'charset_variant',
      #'fallback-illegal-error' => 'charset_variant',
      'fallback-unassigned-error' => 'charset_variant',
      ## NOTE: An appropriate error level should be set for each charset
      ## (many charset prohibits use of unassigned code points).

      'illegal-octets-error' => 'charset_fact',
      'unassigned-code-point-error' => 'charset_fact',
      'invalid-state-error' => 'charset_fact',
    },
  };

  ## |Encode::find_encoding| is called below before |get_perl_encoding|
  ## (which is what loads |Encode|) has run, so load it explicitly here.
  require Encode;
  require Whatpm::Charset::DecodeHandle;
  if ($self->{iana_names}->{'iso-2022-jp'}) {
    $obj->{state_2440} = 'gl-jis-1978';
    $obj->{state_2442} = 'gl-jis-1983';
    $obj->{state} = 'state_2842';
    ## The modules are optional; their absence is detected by the
    ## |find_encoding| checks below, after which we fall through.
    eval {
      require Encode::GLJIS1978;
      require Encode::GLJIS1983;
    };
    if (Encode::find_encoding ($obj->{state_2440}) and
        Encode::find_encoding ($obj->{state_2442})) {
      return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP'),
              PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME);
    }
  } elsif ($self->{xml_names}->{'iso-2022-jp'}) {
    $obj->{state_2440} = 'gl-jis-1997-swapped';
    $obj->{state_2442} = 'gl-jis-1997';
    $obj->{state} = 'state_2842';
    eval {
      require Encode::GLJIS1997Swapped;
      require Encode::GLJIS1997;
    };
    if (Encode::find_encoding ($obj->{state_2440}) and
        Encode::find_encoding ($obj->{state_2442})) {
      return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP'),
              PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME);
    }
  }

  my ($e, $e_status) = $self->get_perl_encoding
      (%opt, allow_semiconforming => 1);
  unless ($e) {
    ## TODO: no encoding error(?)
    return (undef, 0);
  }

  $obj->{perl_encoding_name} = $e->name;

  ## Objects created on the fly by |get_by_iana_name| have no |category|;
  ## treat them as belonging to no category.
  my $category = $self->{category} || 0;
  if ($category & CHARSET_CATEGORY_EUCJP) {
    return ((bless $obj, 'Whatpm::Charset::DecodeHandle::EUCJP'),
            $e_status);
  } elsif ($category & CHARSET_CATEGORY_SJIS) {
    return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ShiftJIS'),
            $e_status);
  } else {
    ## The generic handle is used for every other charset; when the
    ## charset is not in the block-safe category the result is flagged
    ## as a fallback implementation.
    $e_status |= FALLBACK_ENCODING_IMPL
        unless $category & CHARSET_CATEGORY_BLOCK_SAFE;
    $obj->{bom_pattern} = $self->{bom_pattern};
    $obj->{fallback} = $self->{fallback};
    return ((bless $obj, 'Whatpm::Charset::DecodeHandle::Encode'),
            $e_status);
  }
} # get_decode_handle
1188    
## Looks up a Perl |Encode| encoding object implementing this charset.
##
## Options (booleans):
##   allow_error_reporting - Try error-reporting implementations first.
##   allow_semiconforming  - Accept semi-conforming implementations.
##   allow_fallback        - As a last resort, accept fallback and
##                           semi-conforming implementations, then
##                           whatever |Encode| resolves the IANA names to.
##
## Returns a two-element list: the encoding object and its status flags,
## or |(undef, 0)| if nothing suitable is found.
sub get_perl_encoding ($;%) {
  my ($self, %opt) = @_;

  require Encode;

  ## Loads, on demand, the module defining an encoding that is not
  ## available from |Encode| by default.
  my $ensure_loaded = sub {
    my ($encoding_name) = @_;
    if ($encoding_name eq 'euc-jp-1997') {
      require Encode::EUCJP1997;
    } elsif ($encoding_name eq 'shift-jis-1997') {
      require Encode::ShiftJIS1997;
    } elsif ($encoding_name eq 'web-latin1' or
             $encoding_name eq 'web-latin1-us-ascii' or
             $encoding_name eq 'web-latin5') {
      require Whatpm::Charset::WebLatin1;
    } elsif ($encoding_name eq 'web-thai' or
             $encoding_name eq 'web-tis-620') {
      require Whatpm::Charset::WebThai;
    }
  }; # $ensure_loaded

  ## True unless the implementation is a fallback one, or is a
  ## semi-conforming one while those have not been allowed.
  my $is_eligible = sub {
    my ($status) = @_;
    return 0 if ($status & FALLBACK_ENCODING_IMPL);
    return 0 if ($status & SEMICONFORMING_ENCODING_IMPL)
        and not $opt{allow_semiconforming};
    return 1;
  }; # $is_eligible

  my $status_of = $self->{perl_names} || {};
  my @candidates = keys %$status_of;

  ## Pass 1: error-reporting implementations, if requested.
  if ($opt{allow_error_reporting}) {
    for my $candidate (@candidates) {
      my $status = $status_of->{$candidate};
      next unless ($status & ERROR_REPORTING_ENCODING_IMPL)
          and $is_eligible->($status);

      $ensure_loaded->($candidate);
      my $encoding = Encode::find_encoding ($candidate);
      ## NOTE: The name must match exactly, since |find_encoding|
      ## resolves e.g. |foobarlatin-1| to |iso-8859-1|, which might
      ## return a wrong encoding object when a dedicated implementation
      ## not part of the standard Perl distribution is desired.
      return ($encoding, $status)
          if $encoding and $encoding->name eq $candidate;
    }
  }

  ## Pass 2: ordinary (non-error-reporting) implementations.
  for my $candidate (@candidates) {
    my $status = $status_of->{$candidate};
    next if ($status & ERROR_REPORTING_ENCODING_IMPL)
        or not $is_eligible->($status);

    $ensure_loaded->($candidate);
    my $encoding = Encode::find_encoding ($candidate);
    return ($encoding, $status) if $encoding;
  }

  return (undef, 0) unless $opt{allow_fallback};

  ## Pass 3: fallback and semi-conforming implementations.
  ## NOTE: Semi-conforming implementations are not preferred to
  ## non-conforming ones here, since they can never be conforming without
  ## the assistance of the caller, and in such cases the caller should
  ## set the |allow_semiconforming| option anyway.
  for my $candidate (@candidates) {
    my $status = $status_of->{$candidate};
    next unless ($status & FALLBACK_ENCODING_IMPL)
        or ($status & SEMICONFORMING_ENCODING_IMPL);

    $ensure_loaded->($candidate);
    my $encoding = Encode::find_encoding ($candidate);
    return ($encoding, $status) if $encoding;
  }

  ## Pass 4: whatever |Encode| makes of the IANA names themselves.
  for my $iana_name (keys %{$self->{iana_names} or {}}) {
    $ensure_loaded->($iana_name);
    my $encoding = Encode::find_encoding ($iana_name);
    return ($encoding, FALLBACK_ENCODING_IMPL) if $encoding;
  }

  return (undef, 0);
} # get_perl_encoding
1272    
## Returns one IANA name of the charset.  A name flagged as preferred is
## returned as soon as it is seen; otherwise a primary name is used; failing
## that, a registered name, or else some unregistered name.  Returns a false
## value if the charset has no IANA name at all.
sub get_iana_name ($) {
  my ($self) = @_;

  my $names = $self->{iana_names} || {};
  my ($primary_name, $fallback_name);
  for my $candidate (keys %$names) {
    my $flags = $names->{$candidate};
    return $candidate if ($flags & PREFERRED_CHARSET_NAME);

    if ($flags & PRIMARY_CHARSET_NAME) {
      $primary_name = $candidate;
      next;
    }
    if ($flags & REGISTERED_CHARSET_NAME) {
      ## A registered name always replaces a previously seen candidate.
      $fallback_name = $candidate;
      next;
    }
    ## An unregistered name is kept only if nothing has been seen yet.
    $fallback_name ||= $candidate;
  }

  return $primary_name || $fallback_name;
} # get_iana_name
1293    
## NOTE: A non-method function.
##
## Returns true if the argument consists of 1 to 40 characters, all in
## the range U+0020..U+007E.
##
## NOTE: According to IANAREG, "The character set names may be up to 40
## characters taken from the printable characters of US-ASCII.  However,
## no distinction is made between use of upper and lower case letters.".
sub is_syntactically_valid_iana_charset_name ($) {
  my ($candidate) = @_;
  return $candidate =~ /\A[\x20-\x7E]{1,40}\z/;
} # is_syntactically_valid_iana_charset_name
1303    
1304     1;
1305 wakaba 1.9 ## $Date: 2008/05/25 08:54:15 $
1306 wakaba 1.1

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24