/[suikacvs]/messaging/manakai/lib/Message/Charset/Info.pm
Suika

Contents of /messaging/manakai/lib/Message/Charset/Info.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (hide annotations) (download)
Sun May 25 08:54:15 2008 UTC (16 years, 6 months ago) by wakaba
Branch: MAIN
Changes since 1.7: +9 -5 lines
++ manakai/lib/Message/Charset/ChangeLog	25 May 2008 08:36:43 -0000
2008-05-25  Wakaba  <wakaba@suika.fam.cx>

	* Info.pm (CHARSET_CATEGORY_UTF16): New category.

1 wakaba 1.1 package Message::Charset::Info;
2     use strict;
3 wakaba 1.8 our $VERSION=do{my @r=(q$Revision: 1.7 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5     ## TODO: More drastic charset name aliasing is now part of HTML5...
6 wakaba 1.1
7     sub UNREGISTERED_CHARSET_NAME () { 0b1 }
8 wakaba 1.4 ## Names for non-standard encodings/implementations for Perl encodings
9 wakaba 1.1 sub REGISTERED_CHARSET_NAME () { 0b10 }
10 wakaba 1.4 ## Names for standard encodings for Perl encodings
11     sub PRIMARY_CHARSET_NAME () { 0b100 }
12 wakaba 1.1 ## "Name:" field for IANA names
13 wakaba 1.4 ## Canonical name for Perl encodings
14     sub PREFERRED_CHARSET_NAME () { 0b1000 }
15 wakaba 1.1 ## "preferred MIME name" for IANA names
16    
17 wakaba 1.4 sub FALLBACK_ENCODING_IMPL () { 0b10000 }
18     ## For Perl encodings: Not a name of the encoding, the encoding
19     ## for the name might be useful as a fallback when the correct
20     ## encoding is not supported.
21     sub NONCONFORMING_ENCODING_IMPL () { FALLBACK_ENCODING_IMPL }
22     ## For Perl encodings: Not a conforming implementation of the encoding,
23     ## though it seems that the intention was to implement that encoding.
24 wakaba 1.6 sub SEMICONFORMING_ENCODING_IMPL () { 0b1000000 }
25     ## For Perl encodings: The implementation itself (returned by
26     ## |get_perl_encoding|) is non-conforming. The decode handle
27     ## implementation (returned by |get_decode_handle|) is conforming.
28 wakaba 1.4 sub ERROR_REPORTING_ENCODING_IMPL () { 0b100000 }
29     ## For Perl encodings: Support error reporting via |manakai_onerror|
30 wakaba 1.6 ## handler when the encoding is handled with decode handle.
31 wakaba 1.4
32 wakaba 1.2 ## iana_status
33     sub STATUS_COMMON () { 0b1 }
34     sub STATUS_LIMITED_USE () { 0b10 }
35     sub STATUS_OBSOLETE () { 0b100 }
36    
37 wakaba 1.5 ## category
38     sub CHARSET_CATEGORY_BLOCK_SAFE () { 0b1 }
39     ## NOTE: Stateless
40     sub CHARSET_CATEGORY_EUCJP () { 0b10 }
41     sub CHARSET_CATEGORY_SJIS () { 0b100 }
42 wakaba 1.8 sub CHARSET_CATEGORY_UTF16 () { 0b1000 }
43     ## NOTE: "A UTF-16 encoding" in HTML5.
44 wakaba 1.5
45 wakaba 1.1 ## iana_names
46 wakaba 1.5
47 wakaba 1.1 ## is_html_ascii_superset: "superset of US-ASCII (specifically, ANSI_X3.4-1968)
48     ## for bytes in the range 0x09 - 0x0D, 0x20, 0x21, 0x22, 0x26, 0x27,
49     ## 0x2C - 0x3F, 0x41 - 0x5A, and 0x61 - 0x7A" [HTML5]
50     ## is_ebcdic_based
51 wakaba 1.5 ## TODO: These flags are obsolete - should be replaced by category
52 wakaba 1.1
53     ## ISSUE: Shift_JIS is a superset of US-ASCII? ISO-2022-JP is?
54     ## ISSUE: 0x5F (_) should be added to the range?
55    
56     my $Charset;
57    
58     our $IANACharset;
59    
60     $Charset->{'us-ascii'}
61     = $IANACharset->{'ansi_x3.4-1968'}
62     = $IANACharset->{'iso-ir-6'}
63     = $IANACharset->{'ansi_x3.4-1986'}
64     = $IANACharset->{'iso_646.irv:1991'}
65     = $IANACharset->{'ascii'}
66     = $IANACharset->{'iso646-us'}
67     = $IANACharset->{'us-ascii'}
68     = $IANACharset->{'us'}
69     = $IANACharset->{'ibm367'}
70     = $IANACharset->{'cp367'}
71     = $IANACharset->{'csascii'}
72 wakaba 1.4 = __PACKAGE__->new ({
73 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
74 wakaba 1.1 iana_names => {
75 wakaba 1.4 'ansi_x3.4-1968' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
76 wakaba 1.1 'iso-ir-6' => REGISTERED_CHARSET_NAME,
77     'ansi_x3.4-1986' => REGISTERED_CHARSET_NAME,
78     'iso_646.irv:1991' => REGISTERED_CHARSET_NAME,
79     'ascii' => REGISTERED_CHARSET_NAME,
80     'iso646-us' => REGISTERED_CHARSET_NAME,
81 wakaba 1.4 'us-ascii' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
82 wakaba 1.1 'us' => REGISTERED_CHARSET_NAME,
83     'ibm367' => REGISTERED_CHARSET_NAME,
84     'cp367' => REGISTERED_CHARSET_NAME,
85     'csascii' => REGISTERED_CHARSET_NAME,
86     },
87     is_html_ascii_superset => 1,
88 wakaba 1.4 });
89 wakaba 1.1
90     $Charset->{'iso-8859-1'}
91     = $IANACharset->{'iso_8859-1:1987'}
92     = $IANACharset->{'iso-ir-100'}
93     = $IANACharset->{'iso_8859-1'}
94     = $IANACharset->{'iso-8859-1'}
95     = $IANACharset->{'latin1'}
96     = $IANACharset->{'l1'}
97     = $IANACharset->{'ibm819'}
98     = $IANACharset->{'cp819'}
99     = $IANACharset->{'csisolatin1'}
100 wakaba 1.4 = __PACKAGE__->new ({
101 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
102 wakaba 1.1 iana_names => {
103 wakaba 1.4 'iso_8859-1:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
104 wakaba 1.1 'iso-ir-100' => REGISTERED_CHARSET_NAME,
105     'iso_8859-1' => REGISTERED_CHARSET_NAME,
106 wakaba 1.4 'iso-8859-1' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
107 wakaba 1.1 'latin1' => REGISTERED_CHARSET_NAME,
108     'l1' => REGISTERED_CHARSET_NAME,
109     'ibm819' => REGISTERED_CHARSET_NAME,
110     'cp819' => REGISTERED_CHARSET_NAME,
111     'csisolatin1' => REGISTERED_CHARSET_NAME,
112     },
113 wakaba 1.7 perl_names => {
114     'web-latin1' => UNREGISTERED_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
115     ERROR_REPORTING_ENCODING_IMPL,
116     'iso-8859-1' => FALLBACK_ENCODING_IMPL,
117     },
118     fallback => {
119     "\x80" => "\x{20AC}",
120     "\x82" => "\x{201A}",
121     "\x83" => "\x{0192}",
122     "\x84" => "\x{201E}",
123     "\x85" => "\x{2026}",
124     "\x86" => "\x{2020}",
125     "\x87" => "\x{2021}",
126     "\x88" => "\x{02C6}",
127     "\x89" => "\x{2030}",
128     "\x8A" => "\x{0160}",
129     "\x8B" => "\x{2039}",
130     "\x8C" => "\x{0152}",
131     "\x8E" => "\x{017D}",
132     "\x91" => "\x{2018}",
133     "\x92" => "\x{2019}",
134     "\x93" => "\x{201C}",
135     "\x94" => "\x{201D}",
136     "\x95" => "\x{2022}",
137     "\x96" => "\x{2013}",
138     "\x97" => "\x{2014}",
139     "\x98" => "\x{02DC}",
140     "\x99" => "\x{2122}",
141     "\x9A" => "\x{0161}",
142     "\x9B" => "\x{203A}",
143     "\x9C" => "\x{0153}",
144     "\x9E" => "\x{017E}",
145     "\x9F" => "\x{0178}",
146     },
147 wakaba 1.1 is_html_ascii_superset => 1,
148 wakaba 1.4 });
149 wakaba 1.1
150 wakaba 1.2 $Charset->{'iso-8859-2'}
151     = $IANACharset->{'iso_8859-2:1987'}
152     = $IANACharset->{'iso-ir-101'}
153     = $IANACharset->{'iso_8859-2'}
154     = $IANACharset->{'iso-8859-2'}
155     = $IANACharset->{'latin2'}
156     = $IANACharset->{'l2'}
157     = $IANACharset->{'csisolatin2'}
158 wakaba 1.4 = __PACKAGE__->new ({
159 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
160 wakaba 1.2 iana_names => {
161 wakaba 1.4 'iso_8859-2:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
162 wakaba 1.2 'iso-ir-101' => REGISTERED_CHARSET_NAME,
163     'iso_8859-2' => REGISTERED_CHARSET_NAME,
164 wakaba 1.4 'iso-8859-2' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
165 wakaba 1.2 'latin2' => REGISTERED_CHARSET_NAME,
166     'l2' => REGISTERED_CHARSET_NAME,
167     'csisolatin2' => REGISTERED_CHARSET_NAME,
168     },
169     is_html_ascii_superset => 1,
170 wakaba 1.4 });
171 wakaba 1.2
172     $Charset->{'iso-8859-3'}
173     = $IANACharset->{'iso_8859-3:1988'}
174     = $IANACharset->{'iso-ir-109'}
175     = $IANACharset->{'iso_8859-3'}
176     = $IANACharset->{'iso-8859-3'}
177     = $IANACharset->{'latin3'}
178     = $IANACharset->{'l3'}
179     = $IANACharset->{'csisolatin3'}
180 wakaba 1.4 = __PACKAGE__->new ({
181 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
182 wakaba 1.2 iana_names => {
183 wakaba 1.4 'iso_8859-3:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
184 wakaba 1.2 'iso-ir-109' => REGISTERED_CHARSET_NAME,
185     'iso_8859-3' => REGISTERED_CHARSET_NAME,
186 wakaba 1.4 'iso-8859-3' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
187 wakaba 1.2 'latin3' => REGISTERED_CHARSET_NAME,
188     'l3' => REGISTERED_CHARSET_NAME,
189     'csisolatin3' => REGISTERED_CHARSET_NAME,
190     },
191     is_html_ascii_superset => 1,
192 wakaba 1.4 });
193 wakaba 1.2
194     $Charset->{'iso-8859-4'}
195     = $IANACharset->{'iso_8859-4:1988'}
196     = $IANACharset->{'iso-ir-110'}
197     = $IANACharset->{'iso_8859-4'}
198     = $IANACharset->{'iso-8859-4'}
199     = $IANACharset->{'latin4'}
200     = $IANACharset->{'l4'}
201     = $IANACharset->{'csisolatin4'}
202 wakaba 1.4 = __PACKAGE__->new ({
203 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
204 wakaba 1.2 iana_names => {
205 wakaba 1.4 'iso_8859-4:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
206 wakaba 1.2 'iso-ir-110' => REGISTERED_CHARSET_NAME,
207     'iso_8859-4' => REGISTERED_CHARSET_NAME,
208 wakaba 1.4 'iso-8859-4' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
209 wakaba 1.2 'latin4' => REGISTERED_CHARSET_NAME,
210     'l4' => REGISTERED_CHARSET_NAME,
211     'csisolatin4' => REGISTERED_CHARSET_NAME,
212     },
213     is_html_ascii_superset => 1,
214 wakaba 1.4 });
215 wakaba 1.2
216     $Charset->{'iso-8859-5'}
217     = $IANACharset->{'iso_8859-5:1988'}
218     = $IANACharset->{'iso-ir-144'}
219     = $IANACharset->{'iso_8859-5'}
220     = $IANACharset->{'iso-8859-5'}
221     = $IANACharset->{'cyrillic'}
222     = $IANACharset->{'csisolatincyrillic'}
223 wakaba 1.4 = __PACKAGE__->new ({
224 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
225 wakaba 1.2 iana_names => {
226 wakaba 1.4 'iso_8859-5:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
227 wakaba 1.2 'iso-ir-144' => REGISTERED_CHARSET_NAME,
228     'iso_8859-5' => REGISTERED_CHARSET_NAME,
229 wakaba 1.4 'iso-8859-5' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
230 wakaba 1.2 'cyrillic' => REGISTERED_CHARSET_NAME,
231     'csisolatincyrillic' => REGISTERED_CHARSET_NAME,
232     },
233     is_html_ascii_superset => 1,
234 wakaba 1.4 });
235 wakaba 1.2
236     $Charset->{'iso-8859-6'}
237     = $IANACharset->{'iso_8859-6:1987'}
238     = $IANACharset->{'iso-ir-127'}
239     = $IANACharset->{'iso_8859-6'}
240     = $IANACharset->{'iso-8859-6'}
241     = $IANACharset->{'ecma-114'}
242     = $IANACharset->{'asmo-708'}
243     = $IANACharset->{'arabic'}
244     = $IANACharset->{'csisolatinarabic'}
245 wakaba 1.4 = __PACKAGE__->new ({
246 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
247 wakaba 1.2 iana_names => {
248 wakaba 1.4 'iso_8859-6:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
249 wakaba 1.2 'iso-ir-127' => REGISTERED_CHARSET_NAME,
250     'iso_8859-6' => REGISTERED_CHARSET_NAME,
251 wakaba 1.4 'iso-8859-6' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
252 wakaba 1.2 'ecma-114' => REGISTERED_CHARSET_NAME,
253     'asmo-708' => REGISTERED_CHARSET_NAME,
254     'arabic' => REGISTERED_CHARSET_NAME,
255     'csisolatinarabic' => REGISTERED_CHARSET_NAME,
256     },
257     is_html_ascii_superset => 1,
258     ## NOTE: 3/0..3/9 have different semantics from U+0030..0039,
259     ## but have same character names (maybe).
260 wakaba 1.3 ## NOTE: According to RFC 2046, charset left-hand half of "iso-8859-6"
261     ## is same as "us-ascii".
262 wakaba 1.4 });
263 wakaba 1.2
264     $Charset->{'iso-8859-7'}
265     = $IANACharset->{'iso_8859-7:1987'}
266     = $IANACharset->{'iso-ir-126'}
267     = $IANACharset->{'iso_8859-7'}
268     = $IANACharset->{'iso-8859-7'}
269     = $IANACharset->{'elot_928'}
270     = $IANACharset->{'ecma-118'}
271     = $IANACharset->{'greek'}
272     = $IANACharset->{'greek8'}
273     = $IANACharset->{'csisolatingreek'}
274 wakaba 1.4 = __PACKAGE__->new ({
275 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
276 wakaba 1.2 iana_names => {
277 wakaba 1.4 'iso_8859-7:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
278 wakaba 1.2 'iso-ir-126' => REGISTERED_CHARSET_NAME,
279     'iso_8859-7' => REGISTERED_CHARSET_NAME,
280 wakaba 1.4 'iso-8859-7' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
281 wakaba 1.2 'elot_928' => REGISTERED_CHARSET_NAME,
282     'ecma-118' => REGISTERED_CHARSET_NAME,
283     'greek' => REGISTERED_CHARSET_NAME,
284     'greek8' => REGISTERED_CHARSET_NAME,
285     'csisolatingreek' => REGISTERED_CHARSET_NAME,
286     },
287     is_html_ascii_superset => 1,
288 wakaba 1.4 });
289 wakaba 1.2
290     $Charset->{'iso-8859-8'}
291     = $IANACharset->{'iso_8859-8:1988'}
292     = $IANACharset->{'iso-ir-138'}
293     = $IANACharset->{'iso_8859-8'}
294     = $IANACharset->{'iso-8859-8'}
295     = $IANACharset->{'hebrew'}
296     = $IANACharset->{'csisolatinhebrew'}
297 wakaba 1.4 = __PACKAGE__->new ({
298 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
299 wakaba 1.2 iana_names => {
300 wakaba 1.4 'iso_8859-8:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
301 wakaba 1.2 'iso-ir-138' => REGISTERED_CHARSET_NAME,
302     'iso_8859-8' => REGISTERED_CHARSET_NAME,
303 wakaba 1.4 'iso-8859-8' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
304 wakaba 1.2 'hebrew' => REGISTERED_CHARSET_NAME,
305     'csisolatinhebrew' => REGISTERED_CHARSET_NAME,
306     },
307     is_html_ascii_superset => 1,
308 wakaba 1.4 });
309 wakaba 1.2
310     $Charset->{'iso-8859-9'}
311     = $IANACharset->{'iso_8859-9:1989'}
312     = $IANACharset->{'iso-ir-148'}
313     = $IANACharset->{'iso_8859-9'}
314     = $IANACharset->{'iso-8859-9'}
315     = $IANACharset->{'latin5'}
316     = $IANACharset->{'l5'}
317     = $IANACharset->{'csisolatin5'}
318 wakaba 1.4 = __PACKAGE__->new ({
319 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
320 wakaba 1.2 iana_names => {
321 wakaba 1.4 'iso_8859-9:1989' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
322 wakaba 1.2 'iso-ir-148' => REGISTERED_CHARSET_NAME,
323     'iso_8859-9' => REGISTERED_CHARSET_NAME,
324 wakaba 1.4 'iso-8859-9' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
325 wakaba 1.2 'latin5' => REGISTERED_CHARSET_NAME,
326     'l5' => REGISTERED_CHARSET_NAME,
327     'csisolatin5' => REGISTERED_CHARSET_NAME,
328     },
329     is_html_ascii_superset => 1,
330 wakaba 1.4 });
331 wakaba 1.2
332     $Charset->{'iso-8859-10'}
333     = $IANACharset->{'iso-8859-10'}
334     = $IANACharset->{'iso-ir-157'}
335     = $IANACharset->{'l6'}
336     = $IANACharset->{'iso_8859-10:1992'}
337     = $IANACharset->{'csisolatin6'}
338     = $IANACharset->{'latin6'}
339 wakaba 1.4 = __PACKAGE__->new ({
340 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
341 wakaba 1.2 iana_names => {
342 wakaba 1.4 'iso-8859-10' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
343 wakaba 1.2 'iso-ir-157' => REGISTERED_CHARSET_NAME,
344     'l6' => REGISTERED_CHARSET_NAME,
345     'iso_8859-10:1992' => REGISTERED_CHARSET_NAME,
346     'csisolatin6' => REGISTERED_CHARSET_NAME,
347     'latin6' => REGISTERED_CHARSET_NAME,
348     },
349     is_html_ascii_superset => 1,
350 wakaba 1.4 });
351 wakaba 1.2
352     $Charset->{'iso_6937-2-add'}
353     = $IANACharset->{'iso_6937-2-add'}
354     = $IANACharset->{'iso-ir-142'}
355     = $IANACharset->{'csisotextcomm'}
356 wakaba 1.4 = __PACKAGE__->new ({
357 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
358 wakaba 1.2 iana_names => {
359 wakaba 1.4 'iso_6937-2-add' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
360 wakaba 1.2 'iso-ir-142' => REGISTERED_CHARSET_NAME,
361     'csisotextcomm' => REGISTERED_CHARSET_NAME,
362     },
363     is_html_ascii_superset => 1,
364 wakaba 1.4 });
365 wakaba 1.2
366     $Charset->{'jis_x0201'}
367     = $IANACharset->{'jis_x0201'}
368     = $IANACharset->{'x0201'}
369     = $IANACharset->{'cshalfwidthkatakana'}
370 wakaba 1.4 = __PACKAGE__->new ({
371 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
372 wakaba 1.2 iana_names => {
373 wakaba 1.4 'jis_x0201' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
374 wakaba 1.2 'x0201' => REGISTERED_CHARSET_NAME,
375     'cshalfwidthkatakana' => REGISTERED_CHARSET_NAME,
376     },
377     is_html_ascii_superset => 1,
378 wakaba 1.4 });
379 wakaba 1.2
380     $Charset->{'jis_encoding'}
381     = $IANACharset->{'jis_encoding'}
382     = $IANACharset->{'csjisencoding'}
383 wakaba 1.4 = __PACKAGE__->new ({
384 wakaba 1.5 category => 0,
385 wakaba 1.2 iana_names => {
386 wakaba 1.4 'jis_encoding' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
387 wakaba 1.2 'csjisencoding' => REGISTERED_CHARSET_NAME,
388     },
389     ## NOTE: What is this?
390 wakaba 1.4 });
391 wakaba 1.1
392     $Charset->{'shift_jis'}
393     = $IANACharset->{'shift_jis'}
394     = $IANACharset->{'ms_kanji'}
395     = $IANACharset->{'csshiftjis'}
396 wakaba 1.4 = __PACKAGE__->new ({
397 wakaba 1.5 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,
398 wakaba 1.1 iana_names => {
399 wakaba 1.4 'shift_jis' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
400 wakaba 1.1 'ms_kanji' => REGISTERED_CHARSET_NAME,
401     'csshiftjis' => REGISTERED_CHARSET_NAME,
402     },
403 wakaba 1.5 perl_names => {
404 wakaba 1.6 'shift-jis-1997' => UNREGISTERED_CHARSET_NAME |
405 wakaba 1.7 SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
406     shiftjis => PRIMARY_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
407 wakaba 1.6 ERROR_REPORTING_ENCODING_IMPL,
408 wakaba 1.5 ## NOTE: Unicode mapping is wrong.
409     },
410 wakaba 1.2 mime_text_suitable => 1,
411 wakaba 1.4 });
412 wakaba 1.1
413 wakaba 1.3 $Charset->{'x-sjis'}
414     = $IANACharset->{'x-sjis'}
415 wakaba 1.4 = __PACKAGE__->new ({
416 wakaba 1.5 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,
417 wakaba 1.3 iana_names => {
418     'x-sjis' => UNREGISTERED_CHARSET_NAME,
419     },
420 wakaba 1.6 perl_names => {
421 wakaba 1.7 'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
422 wakaba 1.6 },
423 wakaba 1.3 mime_text_suitable => 1,
424 wakaba 1.4 });
425 wakaba 1.3
426 wakaba 1.5 $Charset->{shift_jisx0213}
427     = $IANACharset->{shift_jisx0213}
428     = __PACKAGE__->new ({
429     category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,
430     iana_names => {
431     shift_jisx0213 => UNREGISTERED_CHARSET_NAME,
432     },
433     perl_names => {
434     #shift_jisx0213 (non-standard - i don't know its conformance)
435 wakaba 1.7 'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
436     'shiftjis' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
437 wakaba 1.5 },
438     mime_text_suitable => 1,
439     });
440    
441 wakaba 1.1 $Charset->{'euc-jp'}
442     = $IANACharset->{'extended_unix_code_packed_format_for_japanese'}
443     = $IANACharset->{'cseucpkdfmtjapanese'}
444     = $IANACharset->{'euc-jp'}
445 wakaba 1.3 = $IANACharset->{'x-euc-jp'}
446 wakaba 1.4 = __PACKAGE__->new ({
447 wakaba 1.5 category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE,
448 wakaba 1.1 iana_names => {
449 wakaba 1.4 'extended_unix_code_packed_format_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
450 wakaba 1.1 'cseucpkdfmtjapanese' => REGISTERED_CHARSET_NAME,
451 wakaba 1.4 'euc-jp' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
452 wakaba 1.1 },
453 wakaba 1.5 perl_names => {
454 wakaba 1.7 'euc-jp-1997' => UNREGISTERED_CHARSET_NAME |
455     SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
456 wakaba 1.5 ## NOTE: Though the IANA definition references the 1990 version
457     ## of EUC-JP, the 1997 version of JIS standard claims that the version
458     ## is same coded character set as the 1990 version, such that we
459     ## consider the EUC-JP 1990 version is same as the 1997 version.
460 wakaba 1.6 'euc-jp' => PREFERRED_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
461     ERROR_REPORTING_ENCODING_IMPL,
462 wakaba 1.5 ## NOTE: Unicode mapping is wrong.
463     },
464 wakaba 1.1 is_html_ascii_superset => 1,
465 wakaba 1.3 mime_text_suitable => 1,
466 wakaba 1.4 });
467 wakaba 1.3
468     $Charset->{'x-euc-jp'}
469     = $IANACharset->{'x-euc-jp'}
470 wakaba 1.4 = __PACKAGE__->new ({
471 wakaba 1.5 category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE,
472 wakaba 1.3 iana_names => {
473     'x-euc-jp' => UNREGISTERED_CHARSET_NAME,
474     },
475 wakaba 1.6 perl_names => {
476     'euc-jp-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
477     'euc-jp' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
478     },
479     is_html_ascii_superset => 1, is_html_ascii_superset => 1,
480 wakaba 1.3 mime_text_suitable => 1,
481 wakaba 1.4 });
482 wakaba 1.1
483 wakaba 1.2 $Charset->{'extended_unix_code_fixed_width_for_japanese'}
484     = $IANACharset->{'extended_unix_code_fixed_width_for_japanese'}
485     = $IANACharset->{'cseucfixwidjapanese'}
486 wakaba 1.4 = __PACKAGE__->new ({
487 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
488 wakaba 1.2 iana_names => {
489 wakaba 1.4 'extended_unix_code_fixed_width_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
490 wakaba 1.2 'cseucfixwidjapanese' => REGISTERED_CHARSET_NAME,
491     },
492 wakaba 1.4 });
493 wakaba 1.2
494 wakaba 1.1 ## TODO: ...
495    
496 wakaba 1.2 $Charset->{'euc-kr'}
497     = $IANACharset->{'euc-kr'}
498     = $IANACharset->{'cseuckr'}
499 wakaba 1.4 = __PACKAGE__->new ({
500 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
501 wakaba 1.2 iana_names => {
502 wakaba 1.4 'euc-kr' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
503 wakaba 1.2 'cseuckr' => REGISTERED_CHARSET_NAME,
504     },
505     is_html_ascii_superset => 1,
506 wakaba 1.4 });
507 wakaba 1.2
508 wakaba 1.1 $Charset->{'iso-2022-jp'}
509     = $IANACharset->{'iso-2022-jp'}
510     = $IANACharset->{'csiso2022jp'}
511 wakaba 1.3 = $IANACharset->{'iso2022jp'}
512     = $IANACharset->{'junet-code'}
513 wakaba 1.4 = __PACKAGE__->new ({
514 wakaba 1.5 category => 0,
515 wakaba 1.1 iana_names => {
516 wakaba 1.4 'iso-2022-jp' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
517 wakaba 1.1 'csiso2022jp' => REGISTERED_CHARSET_NAME,
518 wakaba 1.3 'iso2022jp' => UNREGISTERED_CHARSET_NAME,
519     'junet-code' => UNREGISTERED_CHARSET_NAME,
520 wakaba 1.1 },
521 wakaba 1.2 mime_text_suitable => 1,
522 wakaba 1.4 });
523 wakaba 1.2
524     $Charset->{'iso-2022-jp-2'}
525     = $IANACharset->{'iso-2022-jp-2'}
526     = $IANACharset->{'csiso2022jp2'}
527 wakaba 1.4 = __PACKAGE__->new ({
528 wakaba 1.5 category => 0,
529 wakaba 1.2 iana_names => {
530 wakaba 1.4 'iso-2022-jp-2' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
531 wakaba 1.2 'csiso2022jp2' => REGISTERED_CHARSET_NAME,
532     },
533     mime_text_suitable => 1,
534 wakaba 1.4 });
535 wakaba 1.1
536     ## TODO: ...
537    
538     $Charset->{'utf-8'}
539     = $IANACharset->{'utf-8'}
540 wakaba 1.3 = $IANACharset->{'x-utf-8'}
541 wakaba 1.4 = __PACKAGE__->new ({
542 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
543 wakaba 1.1 iana_names => {
544 wakaba 1.4 'utf-8' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
545 wakaba 1.6 ## NOTE: IANA name "utf-8" references RFC 3629. According to the RFC,
546     ## the definitive definition is one specified in the Unicode Standard.
547 wakaba 1.3 'x-utf-8' => UNREGISTERED_CHARSET_NAME,
548 wakaba 1.1 },
549 wakaba 1.6 perl_names => {
550     'utf-8-strict' => PRIMARY_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
551     ERROR_REPORTING_ENCODING_IMPL,
552     ## NOTE: It does not support non-Unicode UCS characters (conforming).
553     ## It does detect illegal sequences (conforming).
554     ## It does not support surrpgate pairs (conforming).
555     ## It does not support BOMs (non-conforming).
556     },
557     bom_pattern => qr/\xEF\xBB\xBF/,
558 wakaba 1.1 is_html_ascii_superset => 1,
559 wakaba 1.3 mime_text_suitable => 1,
560 wakaba 1.4 });
561 wakaba 1.3
562     $Charset->{'utf-8n'}
563     = $IANACharset->{'utf-8n'}
564 wakaba 1.4 = __PACKAGE__->new ({
565 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
566 wakaba 1.3 iana_names => {
567     'utf-8n' => UNREGISTERED_CHARSET_NAME,
568 wakaba 1.6 ## NOTE: Is there any normative definition for the charset?
569     ## What variant of UTF-8 should we use for the charset?
570     },
571     perl_names => {
572     'utf-8-strict' => PRIMARY_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
573 wakaba 1.3 },
574     is_html_ascii_superset => 1,
575     mime_text_suitable => 1,
576 wakaba 1.4 });
577 wakaba 1.1
578     ## TODO: ...
579    
580 wakaba 1.2 $Charset->{'gbk'}
581     = $IANACharset->{'gbk'}
582     = $IANACharset->{'cp936'}
583     = $IANACharset->{'ms936'}
584     = $IANACharset->{'windows-936'}
585 wakaba 1.4 = __PACKAGE__->new ({
586 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
587 wakaba 1.2 iana_names => {
588 wakaba 1.4 'gbk' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
589 wakaba 1.2 'cp936' => REGISTERED_CHARSET_NAME,
590     'ms936' => REGISTERED_CHARSET_NAME,
591     'windows-936' => REGISTERED_CHARSET_NAME,
592     },
593     iana_status => STATUS_COMMON | STATUS_OBSOLETE,
594     mime_text_suitable => 1,
595 wakaba 1.4 });
596 wakaba 1.2
597     $Charset->{'gb18030'}
598     = $IANACharset->{'gb18030'}
599 wakaba 1.4 = __PACKAGE__->new ({
600 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
601 wakaba 1.2 iana_names => {
602 wakaba 1.4 'gb18030' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
603 wakaba 1.2 },
604     iana_status => STATUS_COMMON,
605     mime_text_suitable => 1,
606 wakaba 1.4 });
607 wakaba 1.2
608     ## TODO: ...
609    
610 wakaba 1.1 $Charset->{'utf-16be'}
611     = $IANACharset->{'utf-16be'}
612 wakaba 1.4 = __PACKAGE__->new ({
613 wakaba 1.8 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
614 wakaba 1.1 iana_names => {
615 wakaba 1.4 'utf-16be' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
616 wakaba 1.1 },
617 wakaba 1.4 });
618 wakaba 1.1
619     $Charset->{'utf-16le'}
620     = $IANACharset->{'utf-16le'}
621 wakaba 1.4 = __PACKAGE__->new ({
622 wakaba 1.8 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
623 wakaba 1.1 iana_names => {
624 wakaba 1.4 'utf-16le' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
625 wakaba 1.1 },
626 wakaba 1.4 });
627 wakaba 1.1
628     $Charset->{'utf-16'}
629     = $IANACharset->{'utf-16'}
630 wakaba 1.4 = __PACKAGE__->new ({
631 wakaba 1.8 category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
632 wakaba 1.1 iana_names => {
633 wakaba 1.4 'utf-16' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
634 wakaba 1.1 },
635 wakaba 1.4 });
636 wakaba 1.1
637     ## TODO: ...
638    
639 wakaba 1.2 $Charset->{'windows-31j'}
640     = $IANACharset->{'windows-31j'}
641     = $IANACharset->{'cswindows31j'}
642 wakaba 1.4 = __PACKAGE__->new ({
643 wakaba 1.5 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,
644 wakaba 1.2 iana_names => {
645 wakaba 1.4 'windows-31j' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
646 wakaba 1.2 'cswindows31j' => REGISTERED_CHARSET_NAME,
647     },
648     iana_status => STATUS_LIMITED_USE, # maybe
649     mime_text_suitable => 1,
650 wakaba 1.4 });
651 wakaba 1.2
652     $Charset->{'gb2312'}
653     = $IANACharset->{'gb2312'}
654     = $IANACharset->{'csgb2312'}
655 wakaba 1.4 = __PACKAGE__->new ({
656 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
657 wakaba 1.2 iana_names => {
658 wakaba 1.4 'gb2312' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
659 wakaba 1.2 'csgb2312' => REGISTERED_CHARSET_NAME,
660     },
661     is_html_ascii_superset => 1,
662     mime_text_suitable => 1,
663 wakaba 1.4 });
664 wakaba 1.2
665     $Charset->{'big5'}
666     = $IANACharset->{'big5'}
667     = $IANACharset->{'csbig5'}
668 wakaba 1.4 = __PACKAGE__->new ({
669 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
670 wakaba 1.2 iana_names => {
671 wakaba 1.4 'big5' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
672 wakaba 1.2 'csbig5' => REGISTERED_CHARSET_NAME,
673     },
674     mime_text_suitable => 1,
675 wakaba 1.4 });
676 wakaba 1.2
677     ## TODO: ...
678    
679     $Charset->{'big5-hkscs'}
680     = $IANACharset->{'big5-hkscs'}
681 wakaba 1.4 = __PACKAGE__->new ({
682 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
683 wakaba 1.2 iana_names => {
684 wakaba 1.4 'big5-hkscs' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
685 wakaba 1.2 },
686     mime_text_suitable => 1,
687 wakaba 1.4 });
688 wakaba 1.2
689     ## TODO: ...
690    
691 wakaba 1.1 $Charset->{'windows-1252'}
692     = $IANACharset->{'windows-1252'}
693 wakaba 1.4 = __PACKAGE__->new ({
694 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
695 wakaba 1.1 iana_names => {
696 wakaba 1.4 'windows-1252' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
697 wakaba 1.1 },
698     is_html_ascii_superset => 1,
699 wakaba 1.4 });
700 wakaba 1.1
701     ## TODO: ...
702    
703 wakaba 1.7 $Charset->{'tis-620'}
704     = $IANACharset->{'tis-620'}
705     = __PACKAGE__->new ({
706     category => CHARSET_CATEGORY_BLOCK_SAFE,
707     iana_names => {
708     'tis-620' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
709     },
710     perl_names => {
711     'tis-620' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
712     ## NOTE: An alias of |iso-8859-11|.
713     },
714     is_html_ascii_superset => 1,
715     });
716    
717     $Charset->{'iso-8859-11'}
718     = $IANACharset->{'iso-8859-11'}
719     = __PACKAGE__->new ({
720     category => CHARSET_CATEGORY_BLOCK_SAFE,
721     iana_names => {
722     'iso-8859-11' => UNREGISTERED_CHARSET_NAME,
723     ## NOTE: The Web Thai encoding, i.e. windows-874.
724     },
725     perl_names => {
726     'windows-874' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
727     'web-thai' => UNREGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
728     },
729     fallback => {
730     "\x80" => "\x{20AC}",
731     "\x85" => "\x{2026}",
732     "\x91" => "\x{2018}",
733     "\x92" => "\x{2019}",
734     "\x93" => "\x{201C}",
735     "\x94" => "\x{201D}",
736     "\x95" => "\x{2022}",
737     "\x96" => "\x{2013}",
738     "\x97" => "\x{2014}",
739     },
740     is_html_ascii_superset => 1,
741     });
742    
743     $Charset->{'windows-874'}
744     = $IANACharset->{'windows-874'}
745     = __PACKAGE__->new ({
746     category => CHARSET_CATEGORY_BLOCK_SAFE,
747     iana_names => {
748     'windows-874' => UNREGISTERED_CHARSET_NAME,
749     },
750     perl_names => {
751     'windows-874' => REGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
752     },
753     is_html_ascii_superset => 1,
754     });
755    
756 wakaba 1.4 sub new ($$) {
757     return bless $_[1], $_[0];
758     } # new
759    
760     ## NOTE: A class method
761     sub get_by_iana_name ($$) {
762     my $name = $_[1];
763     $name =~ tr/A-Z/a-z/; ## ASCII case-insensitive
764     unless ($IANACharset->{$name}) {
765     $IANACharset->{$name} = __PACKAGE__->new ({
766     iana_names => {
767     $name => UNREGISTERED_CHARSET_NAME,
768     },
769     });
770     }
771     return $IANACharset->{$name};
772     } # get_by_iana_name
773    
774 wakaba 1.5 sub get_decode_handle ($$;%) {
775     my $self = shift;
776     my $byte_stream = shift;
777     my %opt = @_;
778    
779     my $obj = {
780     character_queue => [],
781     filehandle => $byte_stream,
782     charset => '', ## TODO: We set a charset name for input_encoding (when we get identify-by-URI nonsense away)
783     byte_buffer => $opt{byte_buffer} ? ${$opt{byte_buffer}} : '', ## TODO: ref, instead of value, should be used
784     onerror => $opt{onerror} || sub {},
785 wakaba 1.7 must_level => 'm',
786     fact_level => 'm',
787 wakaba 1.5 };
788    
789     require Whatpm::Charset::DecodeHandle;
790     if ($self->{iana_names}->{'iso-2022-jp'}) {
791     $obj->{state_2440} = 'gl-jis-1978';
792     $obj->{state_2442} = 'gl-jis-1983';
793     $obj->{state} = 'state_2842';
794     eval {
795     require Encode::GLJIS1978;
796     require Encode::GLJIS1983;
797     };
798     if (Encode::find_encoding ($obj->{state_2440}) and
799     Encode::find_encoding ($obj->{state_2442})) {
800     return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP'),
801     PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME);
802     }
803     } elsif ($self->{xml_names}->{'iso-2022-jp'}) {
804     $obj->{state_2440} = 'gl-jis-1997-swapped';
805     $obj->{state_2442} = 'gl-jis-1997';
806     $obj->{state} = 'state_2842';
807     eval {
808     require Encode::GLJIS1997Swapped;
809     require Encode::GLJIS1997;
810     };
811     if (Encode::find_encoding ($obj->{state_2440}) and
812     Encode::find_encoding ($obj->{state_2442})) {
813     return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP'),
814     PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME);
815     }
816     }
817    
818 wakaba 1.6 my ($e, $e_status) = $self->get_perl_encoding
819     (%opt, allow_semiconforming => 1);
820 wakaba 1.5 if ($e) {
821     $obj->{perl_encoding_name} = $e->name;
822     if ($self->{category} & CHARSET_CATEGORY_EUCJP) {
823     return ((bless $obj, 'Whatpm::Charset::DecodeHandle::EUCJP'),
824     $e_status);
825     } elsif ($self->{category} & CHARSET_CATEGORY_SJIS) {
826     return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ShiftJIS'),
827     $e_status);
828 wakaba 1.7 #} elsif ($self->{category} & CHARSET_CATEGORY_BLOCK_SAFE) {
829     } else {
830     $e_status |= FALLBACK_ENCODING_IMPL
831     unless $self->{category} & CHARSET_CATEGORY_BLOCK_SAFE;
832 wakaba 1.6 $obj->{bom_pattern} = $self->{bom_pattern};
833 wakaba 1.7 $obj->{fallback} = $self->{fallback};
834 wakaba 1.5 return ((bless $obj, 'Whatpm::Charset::DecodeHandle::Encode'),
835     $e_status);
836 wakaba 1.7 #} else {
837     # ## TODO: no encoding error (?)
838     # return (undef, 0);
839 wakaba 1.5 }
840     } else {
841     ## TODO: no encoding error(?)
842     return (undef, 0);
843     }
844     } # get_decode_handle
845    
846 wakaba 1.4 sub get_perl_encoding ($;%) {
847     my ($self, %opt) = @_;
848    
849     require Encode;
850 wakaba 1.5 my $load_encode = sub {
851     my $name = shift;
852     if ($name eq 'euc-jp-1997') {
853     require Encode::EUCJP1997;
854     } elsif ($name eq 'shift-jis-1997') {
855     require Encode::ShiftJIS1997;
856 wakaba 1.7 } elsif ($name eq 'web-latin1') {
857     require Whatpm::Charset::WebLatin1;
858     } elsif ($name eq 'web-thai') {
859     require Whatpm::Charset::WebThai;
860 wakaba 1.5 }
861     }; # $load_encode
862 wakaba 1.4
863     if ($opt{allow_error_reporting}) {
864     for my $perl_name (keys %{$self->{perl_names} or {}}) {
865     my $perl_status = $self->{perl_names}->{$perl_name};
866     next unless $perl_status & ERROR_REPORTING_ENCODING_IMPL;
867 wakaba 1.5 next if $perl_status & FALLBACK_ENCODING_IMPL;
868 wakaba 1.6 next if $perl_status & SEMICONFORMING_ENCODING_IMPL and
869     not $opt{allow_semiconforming};
870 wakaba 1.4
871 wakaba 1.5 $load_encode->($perl_name);
872 wakaba 1.4 my $e = Encode::find_encoding ($perl_name);
873     if ($e) {
874     return ($e, $perl_status);
875     }
876     }
877     }
878    
879     for my $perl_name (keys %{$self->{perl_names} or {}}) {
880     my $perl_status = $self->{perl_names}->{$perl_name};
881     next if $perl_status & ERROR_REPORTING_ENCODING_IMPL;
882     next if $perl_status & FALLBACK_ENCODING_IMPL;
883 wakaba 1.6 next if $perl_status & SEMICONFORMING_ENCODING_IMPL and
884     not $opt{allow_semiconforming};
885 wakaba 1.5
886     $load_encode->($perl_name);
887 wakaba 1.4 my $e = Encode::find_encoding ($perl_name);
888     if ($e) {
889     return ($e, $perl_status);
890     }
891     }
892    
893     if ($opt{allow_fallback}) {
894     for my $perl_name (keys %{$self->{perl_names} or {}}) {
895     my $perl_status = $self->{perl_names}->{$perl_name};
896 wakaba 1.6 next unless $perl_status & FALLBACK_ENCODING_IMPL or
897     $perl_status & SEMICONFORMING_ENCODING_IMPL;
898     ## NOTE: We don't prefer semi-conforming implementations to
899     ## non-conforming implementations, since semi-conforming implementations
900     ## will never be conforming without assist of the callee, and in such
901     ## cases the callee should set the |allow_semiconforming| option upon
902     ## the invocation of the method anyway.
903    
904 wakaba 1.5 $load_encode->($perl_name);
905 wakaba 1.4 my $e = Encode::find_encoding ($perl_name);
906     if ($e) {
907     return ($e, $perl_status);
908     }
909     }
910    
911     for my $iana_name (keys %{$self->{iana_names} or {}}) {
912 wakaba 1.5 $load_encode->($iana_name);
913 wakaba 1.4 my $e = Encode::find_encoding ($iana_name);
914     if ($e) {
915     return ($e, FALLBACK_ENCODING_IMPL);
916     }
917     }
918     }
919    
920     return (undef, 0);
921     } # get_perl_encoding
922    
923     sub get_iana_name ($) {
924     my $self = shift;
925    
926     my $primary;
927     my $other;
928     for my $iana_name (keys %{$self->{iana_names} or {}}) {
929     my $name_status = $self->{iana_names}->{$iana_name};
930     if ($name_status & PREFERRED_CHARSET_NAME) {
931     return $iana_name;
932     } elsif ($name_status & PRIMARY_CHARSET_NAME) {
933     $primary = $iana_name;
934     } elsif ($name_status & REGISTERED_CHARSET_NAME) {
935     $other = $iana_name;
936     } else {
937     $other ||= $iana_name;
938     }
939     }
940    
941     return $primary || $other;
942     } # get_iana_name
943    
944     ## NOTE: A non-method function
945 wakaba 1.3 sub is_syntactically_valid_iana_charset_name ($) {
946 wakaba 1.1 my $name = shift;
947     return $name =~ /\A[\x20-\x7E]{1,40}\z/;
948     } # is_suntactically_valid_iana_charset_name
949    
950     1;
951 wakaba 1.8 ## $Date: 2008/05/18 06:09:50 $
952 wakaba 1.1

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24