/[suikacvs]/messaging/manakai/lib/Message/Charset/Info.pm
Suika

Contents of /messaging/manakai/lib/Message/Charset/Info.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (hide annotations) (download)
Sun May 18 06:09:50 2008 UTC (16 years, 6 months ago) by wakaba
Branch: MAIN
Changes since 1.6: +110 -11 lines
++ manakai/lib/Message/Charset/ChangeLog	18 May 2008 06:09:39 -0000
	* Info.pm: Support for WebLatin encoding.  Support for Thai
	encodings.  Support for non-block-safe encodings.  However,
	for non-block-safe encodings we cannot assume that it works
	well on our decode handle framework, therefore they are marked
	as "fallback" mode (i.e. its result is unknown).

2008-05-18  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Message::Charset::Info;
2     use strict;
3 wakaba 1.7 our $VERSION=do{my @r=(q$Revision: 1.6 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.1
5     sub UNREGISTERED_CHARSET_NAME () { 0b1 }
6 wakaba 1.4 ## Names for non-standard encodings/implementations for Perl encodings
7 wakaba 1.1 sub REGISTERED_CHARSET_NAME () { 0b10 }
8 wakaba 1.4 ## Names for standard encodings for Perl encodings
9     sub PRIMARY_CHARSET_NAME () { 0b100 }
10 wakaba 1.1 ## "Name:" field for IANA names
11 wakaba 1.4 ## Canonical name for Perl encodings
12     sub PREFERRED_CHARSET_NAME () { 0b1000 }
13 wakaba 1.1 ## "preferred MIME name" for IANA names
14    
15 wakaba 1.4 sub FALLBACK_ENCODING_IMPL () { 0b10000 }
16     ## For Perl encodings: Not a name of the encoding itself, but the
17     ## encoding for the name might be useful as a fallback when the correct
18     ## encoding is not supported.
19     sub NONCONFORMING_ENCODING_IMPL () { FALLBACK_ENCODING_IMPL }
20     ## For Perl encodings: Not a conforming implementation of the encoding,
21     ## though it seems that the intention was to implement that encoding.
22 wakaba 1.6 sub SEMICONFORMING_ENCODING_IMPL () { 0b1000000 }
23     ## For Perl encodings: The implementation itself (returned by
24     ## |get_perl_encoding|) is non-conforming. The decode handle
25     ## implementation (returned by |get_decode_handle|) is conforming.
26 wakaba 1.4 sub ERROR_REPORTING_ENCODING_IMPL () { 0b100000 }
27     ## For Perl encodings: Support error reporting via |manakai_onerror|
28 wakaba 1.6 ## handler when the encoding is handled with decode handle.
29 wakaba 1.4
30 wakaba 1.2 ## iana_status
31     sub STATUS_COMMON () { 0b1 }
32     sub STATUS_LIMITED_USE () { 0b10 }
33     sub STATUS_OBSOLETE () { 0b100 }
34    
35 wakaba 1.5 ## category
36     sub CHARSET_CATEGORY_BLOCK_SAFE () { 0b1 }
37     ## NOTE: Stateless
38     sub CHARSET_CATEGORY_EUCJP () { 0b10 }
39     sub CHARSET_CATEGORY_SJIS () { 0b100 }
40    
41 wakaba 1.1 ## iana_names
42 wakaba 1.5
43 wakaba 1.1 ## is_html_ascii_superset: "superset of US-ASCII (specifically, ANSI_X3.4-1968)
44     ## for bytes in the range 0x09 - 0x0D, 0x20, 0x21, 0x22, 0x26, 0x27,
45     ## 0x2C - 0x3F, 0x41 - 0x5A, and 0x61 - 0x7A" [HTML5]
46     ## is_ebcdic_based
47 wakaba 1.5 ## TODO: These flags are obsolete - should be replaced by category
48 wakaba 1.1
49     ## ISSUE: Is Shift_JIS a superset of US-ASCII? Is ISO-2022-JP?
50     ## ISSUE: Should 0x5F (_) be added to the range?
51    
52     my $Charset;
53    
54     our $IANACharset;
55    
56     $Charset->{'us-ascii'}
57     = $IANACharset->{'ansi_x3.4-1968'}
58     = $IANACharset->{'iso-ir-6'}
59     = $IANACharset->{'ansi_x3.4-1986'}
60     = $IANACharset->{'iso_646.irv:1991'}
61     = $IANACharset->{'ascii'}
62     = $IANACharset->{'iso646-us'}
63     = $IANACharset->{'us-ascii'}
64     = $IANACharset->{'us'}
65     = $IANACharset->{'ibm367'}
66     = $IANACharset->{'cp367'}
67     = $IANACharset->{'csascii'}
68 wakaba 1.4 = __PACKAGE__->new ({
69 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
70 wakaba 1.1 iana_names => {
71 wakaba 1.4 'ansi_x3.4-1968' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
72 wakaba 1.1 'iso-ir-6' => REGISTERED_CHARSET_NAME,
73     'ansi_x3.4-1986' => REGISTERED_CHARSET_NAME,
74     'iso_646.irv:1991' => REGISTERED_CHARSET_NAME,
75     'ascii' => REGISTERED_CHARSET_NAME,
76     'iso646-us' => REGISTERED_CHARSET_NAME,
77 wakaba 1.4 'us-ascii' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
78 wakaba 1.1 'us' => REGISTERED_CHARSET_NAME,
79     'ibm367' => REGISTERED_CHARSET_NAME,
80     'cp367' => REGISTERED_CHARSET_NAME,
81     'csascii' => REGISTERED_CHARSET_NAME,
82     },
83     is_html_ascii_superset => 1,
84 wakaba 1.4 });
85 wakaba 1.1
86     $Charset->{'iso-8859-1'}
87     = $IANACharset->{'iso_8859-1:1987'}
88     = $IANACharset->{'iso-ir-100'}
89     = $IANACharset->{'iso_8859-1'}
90     = $IANACharset->{'iso-8859-1'}
91     = $IANACharset->{'latin1'}
92     = $IANACharset->{'l1'}
93     = $IANACharset->{'ibm819'}
94     = $IANACharset->{'cp819'}
95     = $IANACharset->{'csisolatin1'}
96 wakaba 1.4 = __PACKAGE__->new ({
97 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
98 wakaba 1.1 iana_names => {
99 wakaba 1.4 'iso_8859-1:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
100 wakaba 1.1 'iso-ir-100' => REGISTERED_CHARSET_NAME,
101     'iso_8859-1' => REGISTERED_CHARSET_NAME,
102 wakaba 1.4 'iso-8859-1' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
103 wakaba 1.1 'latin1' => REGISTERED_CHARSET_NAME,
104     'l1' => REGISTERED_CHARSET_NAME,
105     'ibm819' => REGISTERED_CHARSET_NAME,
106     'cp819' => REGISTERED_CHARSET_NAME,
107     'csisolatin1' => REGISTERED_CHARSET_NAME,
108     },
109 wakaba 1.7 perl_names => {
110     'web-latin1' => UNREGISTERED_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
111     ERROR_REPORTING_ENCODING_IMPL,
112     'iso-8859-1' => FALLBACK_ENCODING_IMPL,
113     },
114     fallback => {
115     "\x80" => "\x{20AC}",
116     "\x82" => "\x{201A}",
117     "\x83" => "\x{0192}",
118     "\x84" => "\x{201E}",
119     "\x85" => "\x{2026}",
120     "\x86" => "\x{2020}",
121     "\x87" => "\x{2021}",
122     "\x88" => "\x{02C6}",
123     "\x89" => "\x{2030}",
124     "\x8A" => "\x{0160}",
125     "\x8B" => "\x{2039}",
126     "\x8C" => "\x{0152}",
127     "\x8E" => "\x{017D}",
128     "\x91" => "\x{2018}",
129     "\x92" => "\x{2019}",
130     "\x93" => "\x{201C}",
131     "\x94" => "\x{201D}",
132     "\x95" => "\x{2022}",
133     "\x96" => "\x{2013}",
134     "\x97" => "\x{2014}",
135     "\x98" => "\x{02DC}",
136     "\x99" => "\x{2122}",
137     "\x9A" => "\x{0161}",
138     "\x9B" => "\x{203A}",
139     "\x9C" => "\x{0153}",
140     "\x9E" => "\x{017E}",
141     "\x9F" => "\x{0178}",
142     },
143 wakaba 1.1 is_html_ascii_superset => 1,
144 wakaba 1.4 });
145 wakaba 1.1
146 wakaba 1.2 $Charset->{'iso-8859-2'}
147     = $IANACharset->{'iso_8859-2:1987'}
148     = $IANACharset->{'iso-ir-101'}
149     = $IANACharset->{'iso_8859-2'}
150     = $IANACharset->{'iso-8859-2'}
151     = $IANACharset->{'latin2'}
152     = $IANACharset->{'l2'}
153     = $IANACharset->{'csisolatin2'}
154 wakaba 1.4 = __PACKAGE__->new ({
155 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
156 wakaba 1.2 iana_names => {
157 wakaba 1.4 'iso_8859-2:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
158 wakaba 1.2 'iso-ir-101' => REGISTERED_CHARSET_NAME,
159     'iso_8859-2' => REGISTERED_CHARSET_NAME,
160 wakaba 1.4 'iso-8859-2' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
161 wakaba 1.2 'latin2' => REGISTERED_CHARSET_NAME,
162     'l2' => REGISTERED_CHARSET_NAME,
163     'csisolatin2' => REGISTERED_CHARSET_NAME,
164     },
165     is_html_ascii_superset => 1,
166 wakaba 1.4 });
167 wakaba 1.2
168     $Charset->{'iso-8859-3'}
169     = $IANACharset->{'iso_8859-3:1988'}
170     = $IANACharset->{'iso-ir-109'}
171     = $IANACharset->{'iso_8859-3'}
172     = $IANACharset->{'iso-8859-3'}
173     = $IANACharset->{'latin3'}
174     = $IANACharset->{'l3'}
175     = $IANACharset->{'csisolatin3'}
176 wakaba 1.4 = __PACKAGE__->new ({
177 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
178 wakaba 1.2 iana_names => {
179 wakaba 1.4 'iso_8859-3:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
180 wakaba 1.2 'iso-ir-109' => REGISTERED_CHARSET_NAME,
181     'iso_8859-3' => REGISTERED_CHARSET_NAME,
182 wakaba 1.4 'iso-8859-3' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
183 wakaba 1.2 'latin3' => REGISTERED_CHARSET_NAME,
184     'l3' => REGISTERED_CHARSET_NAME,
185     'csisolatin3' => REGISTERED_CHARSET_NAME,
186     },
187     is_html_ascii_superset => 1,
188 wakaba 1.4 });
189 wakaba 1.2
190     $Charset->{'iso-8859-4'}
191     = $IANACharset->{'iso_8859-4:1988'}
192     = $IANACharset->{'iso-ir-110'}
193     = $IANACharset->{'iso_8859-4'}
194     = $IANACharset->{'iso-8859-4'}
195     = $IANACharset->{'latin4'}
196     = $IANACharset->{'l4'}
197     = $IANACharset->{'csisolatin4'}
198 wakaba 1.4 = __PACKAGE__->new ({
199 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
200 wakaba 1.2 iana_names => {
201 wakaba 1.4 'iso_8859-4:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
202 wakaba 1.2 'iso-ir-110' => REGISTERED_CHARSET_NAME,
203     'iso_8859-4' => REGISTERED_CHARSET_NAME,
204 wakaba 1.4 'iso-8859-4' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
205 wakaba 1.2 'latin4' => REGISTERED_CHARSET_NAME,
206     'l4' => REGISTERED_CHARSET_NAME,
207     'csisolatin4' => REGISTERED_CHARSET_NAME,
208     },
209     is_html_ascii_superset => 1,
210 wakaba 1.4 });
211 wakaba 1.2
212     $Charset->{'iso-8859-5'}
213     = $IANACharset->{'iso_8859-5:1988'}
214     = $IANACharset->{'iso-ir-144'}
215     = $IANACharset->{'iso_8859-5'}
216     = $IANACharset->{'iso-8859-5'}
217     = $IANACharset->{'cyrillic'}
218     = $IANACharset->{'csisolatincyrillic'}
219 wakaba 1.4 = __PACKAGE__->new ({
220 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
221 wakaba 1.2 iana_names => {
222 wakaba 1.4 'iso_8859-5:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
223 wakaba 1.2 'iso-ir-144' => REGISTERED_CHARSET_NAME,
224     'iso_8859-5' => REGISTERED_CHARSET_NAME,
225 wakaba 1.4 'iso-8859-5' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
226 wakaba 1.2 'cyrillic' => REGISTERED_CHARSET_NAME,
227     'csisolatincyrillic' => REGISTERED_CHARSET_NAME,
228     },
229     is_html_ascii_superset => 1,
230 wakaba 1.4 });
231 wakaba 1.2
232     $Charset->{'iso-8859-6'}
233     = $IANACharset->{'iso_8859-6:1987'}
234     = $IANACharset->{'iso-ir-127'}
235     = $IANACharset->{'iso_8859-6'}
236     = $IANACharset->{'iso-8859-6'}
237     = $IANACharset->{'ecma-114'}
238     = $IANACharset->{'asmo-708'}
239     = $IANACharset->{'arabic'}
240     = $IANACharset->{'csisolatinarabic'}
241 wakaba 1.4 = __PACKAGE__->new ({
242 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
243 wakaba 1.2 iana_names => {
244 wakaba 1.4 'iso_8859-6:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
245 wakaba 1.2 'iso-ir-127' => REGISTERED_CHARSET_NAME,
246     'iso_8859-6' => REGISTERED_CHARSET_NAME,
247 wakaba 1.4 'iso-8859-6' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
248 wakaba 1.2 'ecma-114' => REGISTERED_CHARSET_NAME,
249     'asmo-708' => REGISTERED_CHARSET_NAME,
250     'arabic' => REGISTERED_CHARSET_NAME,
251     'csisolatinarabic' => REGISTERED_CHARSET_NAME,
252     },
253     is_html_ascii_superset => 1,
254     ## NOTE: 3/0..3/9 have different semantics from U+0030..0039,
255     ## but have same character names (maybe).
256 wakaba 1.3 ## NOTE: According to RFC 2046, the left-hand half of charset
257     ## "iso-8859-6" is the same as "us-ascii".
258 wakaba 1.4 });
259 wakaba 1.2
260     $Charset->{'iso-8859-7'}
261     = $IANACharset->{'iso_8859-7:1987'}
262     = $IANACharset->{'iso-ir-126'}
263     = $IANACharset->{'iso_8859-7'}
264     = $IANACharset->{'iso-8859-7'}
265     = $IANACharset->{'elot_928'}
266     = $IANACharset->{'ecma-118'}
267     = $IANACharset->{'greek'}
268     = $IANACharset->{'greek8'}
269     = $IANACharset->{'csisolatingreek'}
270 wakaba 1.4 = __PACKAGE__->new ({
271 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
272 wakaba 1.2 iana_names => {
273 wakaba 1.4 'iso_8859-7:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
274 wakaba 1.2 'iso-ir-126' => REGISTERED_CHARSET_NAME,
275     'iso_8859-7' => REGISTERED_CHARSET_NAME,
276 wakaba 1.4 'iso-8859-7' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
277 wakaba 1.2 'elot_928' => REGISTERED_CHARSET_NAME,
278     'ecma-118' => REGISTERED_CHARSET_NAME,
279     'greek' => REGISTERED_CHARSET_NAME,
280     'greek8' => REGISTERED_CHARSET_NAME,
281     'csisolatingreek' => REGISTERED_CHARSET_NAME,
282     },
283     is_html_ascii_superset => 1,
284 wakaba 1.4 });
285 wakaba 1.2
286     $Charset->{'iso-8859-8'}
287     = $IANACharset->{'iso_8859-8:1988'}
288     = $IANACharset->{'iso-ir-138'}
289     = $IANACharset->{'iso_8859-8'}
290     = $IANACharset->{'iso-8859-8'}
291     = $IANACharset->{'hebrew'}
292     = $IANACharset->{'csisolatinhebrew'}
293 wakaba 1.4 = __PACKAGE__->new ({
294 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
295 wakaba 1.2 iana_names => {
296 wakaba 1.4 'iso_8859-8:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
297 wakaba 1.2 'iso-ir-138' => REGISTERED_CHARSET_NAME,
298     'iso_8859-8' => REGISTERED_CHARSET_NAME,
299 wakaba 1.4 'iso-8859-8' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
300 wakaba 1.2 'hebrew' => REGISTERED_CHARSET_NAME,
301     'csisolatinhebrew' => REGISTERED_CHARSET_NAME,
302     },
303     is_html_ascii_superset => 1,
304 wakaba 1.4 });
305 wakaba 1.2
306     $Charset->{'iso-8859-9'}
307     = $IANACharset->{'iso_8859-9:1989'}
308     = $IANACharset->{'iso-ir-148'}
309     = $IANACharset->{'iso_8859-9'}
310     = $IANACharset->{'iso-8859-9'}
311     = $IANACharset->{'latin5'}
312     = $IANACharset->{'l5'}
313     = $IANACharset->{'csisolatin5'}
314 wakaba 1.4 = __PACKAGE__->new ({
315 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
316 wakaba 1.2 iana_names => {
317 wakaba 1.4 'iso_8859-9:1989' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
318 wakaba 1.2 'iso-ir-148' => REGISTERED_CHARSET_NAME,
319     'iso_8859-9' => REGISTERED_CHARSET_NAME,
320 wakaba 1.4 'iso-8859-9' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
321 wakaba 1.2 'latin5' => REGISTERED_CHARSET_NAME,
322     'l5' => REGISTERED_CHARSET_NAME,
323     'csisolatin5' => REGISTERED_CHARSET_NAME,
324     },
325     is_html_ascii_superset => 1,
326 wakaba 1.4 });
327 wakaba 1.2
328     $Charset->{'iso-8859-10'}
329     = $IANACharset->{'iso-8859-10'}
330     = $IANACharset->{'iso-ir-157'}
331     = $IANACharset->{'l6'}
332     = $IANACharset->{'iso_8859-10:1992'}
333     = $IANACharset->{'csisolatin6'}
334     = $IANACharset->{'latin6'}
335 wakaba 1.4 = __PACKAGE__->new ({
336 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
337 wakaba 1.2 iana_names => {
338 wakaba 1.4 'iso-8859-10' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
339 wakaba 1.2 'iso-ir-157' => REGISTERED_CHARSET_NAME,
340     'l6' => REGISTERED_CHARSET_NAME,
341     'iso_8859-10:1992' => REGISTERED_CHARSET_NAME,
342     'csisolatin6' => REGISTERED_CHARSET_NAME,
343     'latin6' => REGISTERED_CHARSET_NAME,
344     },
345     is_html_ascii_superset => 1,
346 wakaba 1.4 });
347 wakaba 1.2
348     $Charset->{'iso_6937-2-add'}
349     = $IANACharset->{'iso_6937-2-add'}
350     = $IANACharset->{'iso-ir-142'}
351     = $IANACharset->{'csisotextcomm'}
352 wakaba 1.4 = __PACKAGE__->new ({
353 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
354 wakaba 1.2 iana_names => {
355 wakaba 1.4 'iso_6937-2-add' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
356 wakaba 1.2 'iso-ir-142' => REGISTERED_CHARSET_NAME,
357     'csisotextcomm' => REGISTERED_CHARSET_NAME,
358     },
359     is_html_ascii_superset => 1,
360 wakaba 1.4 });
361 wakaba 1.2
362     $Charset->{'jis_x0201'}
363     = $IANACharset->{'jis_x0201'}
364     = $IANACharset->{'x0201'}
365     = $IANACharset->{'cshalfwidthkatakana'}
366 wakaba 1.4 = __PACKAGE__->new ({
367 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
368 wakaba 1.2 iana_names => {
369 wakaba 1.4 'jis_x0201' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
370 wakaba 1.2 'x0201' => REGISTERED_CHARSET_NAME,
371     'cshalfwidthkatakana' => REGISTERED_CHARSET_NAME,
372     },
373     is_html_ascii_superset => 1,
374 wakaba 1.4 });
375 wakaba 1.2
376     $Charset->{'jis_encoding'}
377     = $IANACharset->{'jis_encoding'}
378     = $IANACharset->{'csjisencoding'}
379 wakaba 1.4 = __PACKAGE__->new ({
380 wakaba 1.5 category => 0,
381 wakaba 1.2 iana_names => {
382 wakaba 1.4 'jis_encoding' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
383 wakaba 1.2 'csjisencoding' => REGISTERED_CHARSET_NAME,
384     },
385     ## NOTE: What is this?
386 wakaba 1.4 });
387 wakaba 1.1
388     $Charset->{'shift_jis'}
389     = $IANACharset->{'shift_jis'}
390     = $IANACharset->{'ms_kanji'}
391     = $IANACharset->{'csshiftjis'}
392 wakaba 1.4 = __PACKAGE__->new ({
393 wakaba 1.5 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,
394 wakaba 1.1 iana_names => {
395 wakaba 1.4 'shift_jis' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
396 wakaba 1.1 'ms_kanji' => REGISTERED_CHARSET_NAME,
397     'csshiftjis' => REGISTERED_CHARSET_NAME,
398     },
399 wakaba 1.5 perl_names => {
400 wakaba 1.6 'shift-jis-1997' => UNREGISTERED_CHARSET_NAME |
401 wakaba 1.7 SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
402     shiftjis => PRIMARY_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
403 wakaba 1.6 ERROR_REPORTING_ENCODING_IMPL,
404 wakaba 1.5 ## NOTE: Unicode mapping is wrong.
405     },
406 wakaba 1.2 mime_text_suitable => 1,
407 wakaba 1.4 });
408 wakaba 1.1
409 wakaba 1.3 $Charset->{'x-sjis'}
410     = $IANACharset->{'x-sjis'}
411 wakaba 1.4 = __PACKAGE__->new ({
412 wakaba 1.5 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,
413 wakaba 1.3 iana_names => {
414     'x-sjis' => UNREGISTERED_CHARSET_NAME,
415     },
416 wakaba 1.6 perl_names => {
417 wakaba 1.7 'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
418 wakaba 1.6 },
419 wakaba 1.3 mime_text_suitable => 1,
420 wakaba 1.4 });
421 wakaba 1.3
422 wakaba 1.5 $Charset->{shift_jisx0213}
423     = $IANACharset->{shift_jisx0213}
424     = __PACKAGE__->new ({
425     category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,
426     iana_names => {
427     shift_jisx0213 => UNREGISTERED_CHARSET_NAME,
428     },
429     perl_names => {
430     #shift_jisx0213 (non-standard - i don't know its conformance)
431 wakaba 1.7 'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
432     'shiftjis' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
433 wakaba 1.5 },
434     mime_text_suitable => 1,
435     });
436    
## EUC-JP.  One shared charset object is stored under the internal key
## 'euc-jp' and under each of its IANA names.
## NOTE: "x-euc-jp" is intentionally not aliased here: it has its own
## entry in this file, which assigns |$IANACharset->{'x-euc-jp'}|
## unconditionally, so an alias set at this point would only be
## overwritten.
$Charset->{'euc-jp'}
= $IANACharset->{'extended_unix_code_packed_format_for_japanese'}
= $IANACharset->{'cseucpkdfmtjapanese'}
= $IANACharset->{'euc-jp'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE,
  iana_names => {
    'extended_unix_code_packed_format_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'cseucpkdfmtjapanese' => REGISTERED_CHARSET_NAME,
    'euc-jp' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
  },
  perl_names => {
    'euc-jp-1997' => UNREGISTERED_CHARSET_NAME |
        SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
    ## NOTE: Though the IANA definition references the 1990 version
    ## of EUC-JP, the 1997 version of the JIS standard claims that it
    ## is the same coded character set as the 1990 version, so we
    ## consider the EUC-JP 1990 version to be the same as the 1997 version.
    'euc-jp' => PREFERRED_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
        ERROR_REPORTING_ENCODING_IMPL,
    ## NOTE: Unicode mapping is wrong.
  },
  is_html_ascii_superset => 1,
  mime_text_suitable => 1,
});
463 wakaba 1.3
## "x-euc-jp": an unregistered label with its own charset object, stored
## under both the internal key and the IANA-name lookup key.  Both Perl
## encodings listed are flagged as fallback implementations only.
$Charset->{'x-euc-jp'}
= $IANACharset->{'x-euc-jp'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE,
  iana_names => {
    'x-euc-jp' => UNREGISTERED_CHARSET_NAME,
  },
  perl_names => {
    'euc-jp-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
    'euc-jp' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
  },
  ## The original listed this key twice on one line; once is sufficient.
  is_html_ascii_superset => 1,
  mime_text_suitable => 1,
});
478 wakaba 1.1
479 wakaba 1.2 $Charset->{'extended_unix_code_fixed_width_for_japanese'}
480     = $IANACharset->{'extended_unix_code_fixed_width_for_japanese'}
481     = $IANACharset->{'cseucfixwidjapanese'}
482 wakaba 1.4 = __PACKAGE__->new ({
483 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
484 wakaba 1.2 iana_names => {
485 wakaba 1.4 'extended_unix_code_fixed_width_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
486 wakaba 1.2 'cseucfixwidjapanese' => REGISTERED_CHARSET_NAME,
487     },
488 wakaba 1.4 });
489 wakaba 1.2
490 wakaba 1.1 ## TODO: ...
491    
492 wakaba 1.2 $Charset->{'euc-kr'}
493     = $IANACharset->{'euc-kr'}
494     = $IANACharset->{'cseuckr'}
495 wakaba 1.4 = __PACKAGE__->new ({
496 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
497 wakaba 1.2 iana_names => {
498 wakaba 1.4 'euc-kr' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
499 wakaba 1.2 'cseuckr' => REGISTERED_CHARSET_NAME,
500     },
501     is_html_ascii_superset => 1,
502 wakaba 1.4 });
503 wakaba 1.2
504 wakaba 1.1 $Charset->{'iso-2022-jp'}
505     = $IANACharset->{'iso-2022-jp'}
506     = $IANACharset->{'csiso2022jp'}
507 wakaba 1.3 = $IANACharset->{'iso2022jp'}
508     = $IANACharset->{'junet-code'}
509 wakaba 1.4 = __PACKAGE__->new ({
510 wakaba 1.5 category => 0,
511 wakaba 1.1 iana_names => {
512 wakaba 1.4 'iso-2022-jp' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
513 wakaba 1.1 'csiso2022jp' => REGISTERED_CHARSET_NAME,
514 wakaba 1.3 'iso2022jp' => UNREGISTERED_CHARSET_NAME,
515     'junet-code' => UNREGISTERED_CHARSET_NAME,
516 wakaba 1.1 },
517 wakaba 1.2 mime_text_suitable => 1,
518 wakaba 1.4 });
519 wakaba 1.2
520     $Charset->{'iso-2022-jp-2'}
521     = $IANACharset->{'iso-2022-jp-2'}
522     = $IANACharset->{'csiso2022jp2'}
523 wakaba 1.4 = __PACKAGE__->new ({
524 wakaba 1.5 category => 0,
525 wakaba 1.2 iana_names => {
526 wakaba 1.4 'iso-2022-jp-2' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
527 wakaba 1.2 'csiso2022jp2' => REGISTERED_CHARSET_NAME,
528     },
529     mime_text_suitable => 1,
530 wakaba 1.4 });
531 wakaba 1.1
532     ## TODO: ...
533    
534     $Charset->{'utf-8'}
535     = $IANACharset->{'utf-8'}
536 wakaba 1.3 = $IANACharset->{'x-utf-8'}
537 wakaba 1.4 = __PACKAGE__->new ({
538 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
539 wakaba 1.1 iana_names => {
540 wakaba 1.4 'utf-8' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
541 wakaba 1.6 ## NOTE: IANA name "utf-8" references RFC 3629. According to the RFC,
542     ## the definitive definition is one specified in the Unicode Standard.
543 wakaba 1.3 'x-utf-8' => UNREGISTERED_CHARSET_NAME,
544 wakaba 1.1 },
545 wakaba 1.6 perl_names => {
546     'utf-8-strict' => PRIMARY_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
547     ERROR_REPORTING_ENCODING_IMPL,
548     ## NOTE: It does not support non-Unicode UCS characters (conforming).
549     ## It does detect illegal sequences (conforming).
550     ## It does not support surrogate pairs (conforming).
551     ## It does not support BOMs (non-conforming).
552     },
553     bom_pattern => qr/\xEF\xBB\xBF/,
554 wakaba 1.1 is_html_ascii_superset => 1,
555 wakaba 1.3 mime_text_suitable => 1,
556 wakaba 1.4 });
557 wakaba 1.3
558     $Charset->{'utf-8n'}
559     = $IANACharset->{'utf-8n'}
560 wakaba 1.4 = __PACKAGE__->new ({
561 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
562 wakaba 1.3 iana_names => {
563     'utf-8n' => UNREGISTERED_CHARSET_NAME,
564 wakaba 1.6 ## NOTE: Is there any normative definition for the charset?
565     ## What variant of UTF-8 should we use for the charset?
566     },
567     perl_names => {
568     'utf-8-strict' => PRIMARY_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
569 wakaba 1.3 },
570     is_html_ascii_superset => 1,
571     mime_text_suitable => 1,
572 wakaba 1.4 });
573 wakaba 1.1
574     ## TODO: ...
575    
576 wakaba 1.2 $Charset->{'gbk'}
577     = $IANACharset->{'gbk'}
578     = $IANACharset->{'cp936'}
579     = $IANACharset->{'ms936'}
580     = $IANACharset->{'windows-936'}
581 wakaba 1.4 = __PACKAGE__->new ({
582 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
583 wakaba 1.2 iana_names => {
584 wakaba 1.4 'gbk' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
585 wakaba 1.2 'cp936' => REGISTERED_CHARSET_NAME,
586     'ms936' => REGISTERED_CHARSET_NAME,
587     'windows-936' => REGISTERED_CHARSET_NAME,
588     },
589     iana_status => STATUS_COMMON | STATUS_OBSOLETE,
590     mime_text_suitable => 1,
591 wakaba 1.4 });
592 wakaba 1.2
593     $Charset->{'gb18030'}
594     = $IANACharset->{'gb18030'}
595 wakaba 1.4 = __PACKAGE__->new ({
596 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
597 wakaba 1.2 iana_names => {
598 wakaba 1.4 'gb18030' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
599 wakaba 1.2 },
600     iana_status => STATUS_COMMON,
601     mime_text_suitable => 1,
602 wakaba 1.4 });
603 wakaba 1.2
604     ## TODO: ...
605    
606 wakaba 1.1 $Charset->{'utf-16be'}
607     = $IANACharset->{'utf-16be'}
608 wakaba 1.4 = __PACKAGE__->new ({
609 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
610 wakaba 1.1 iana_names => {
611 wakaba 1.4 'utf-16be' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
612 wakaba 1.1 },
613 wakaba 1.4 });
614 wakaba 1.1
615     $Charset->{'utf-16le'}
616     = $IANACharset->{'utf-16le'}
617 wakaba 1.4 = __PACKAGE__->new ({
618 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
619 wakaba 1.1 iana_names => {
620 wakaba 1.4 'utf-16le' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
621 wakaba 1.1 },
622 wakaba 1.4 });
623 wakaba 1.1
624     $Charset->{'utf-16'}
625     = $IANACharset->{'utf-16'}
626 wakaba 1.4 = __PACKAGE__->new ({
627 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
628 wakaba 1.1 iana_names => {
629 wakaba 1.4 'utf-16' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
630 wakaba 1.1 },
631 wakaba 1.4 });
632 wakaba 1.1
633     ## TODO: ...
634    
635 wakaba 1.2 $Charset->{'windows-31j'}
636     = $IANACharset->{'windows-31j'}
637     = $IANACharset->{'cswindows31j'}
638 wakaba 1.4 = __PACKAGE__->new ({
639 wakaba 1.5 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,
640 wakaba 1.2 iana_names => {
641 wakaba 1.4 'windows-31j' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
642 wakaba 1.2 'cswindows31j' => REGISTERED_CHARSET_NAME,
643     },
644     iana_status => STATUS_LIMITED_USE, # maybe
645     mime_text_suitable => 1,
646 wakaba 1.4 });
647 wakaba 1.2
648     $Charset->{'gb2312'}
649     = $IANACharset->{'gb2312'}
650     = $IANACharset->{'csgb2312'}
651 wakaba 1.4 = __PACKAGE__->new ({
652 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
653 wakaba 1.2 iana_names => {
654 wakaba 1.4 'gb2312' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
655 wakaba 1.2 'csgb2312' => REGISTERED_CHARSET_NAME,
656     },
657     is_html_ascii_superset => 1,
658     mime_text_suitable => 1,
659 wakaba 1.4 });
660 wakaba 1.2
661     $Charset->{'big5'}
662     = $IANACharset->{'big5'}
663     = $IANACharset->{'csbig5'}
664 wakaba 1.4 = __PACKAGE__->new ({
665 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
666 wakaba 1.2 iana_names => {
667 wakaba 1.4 'big5' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
668 wakaba 1.2 'csbig5' => REGISTERED_CHARSET_NAME,
669     },
670     mime_text_suitable => 1,
671 wakaba 1.4 });
672 wakaba 1.2
673     ## TODO: ...
674    
675     $Charset->{'big5-hkscs'}
676     = $IANACharset->{'big5-hkscs'}
677 wakaba 1.4 = __PACKAGE__->new ({
678 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
679 wakaba 1.2 iana_names => {
680 wakaba 1.4 'big5-hkscs' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
681 wakaba 1.2 },
682     mime_text_suitable => 1,
683 wakaba 1.4 });
684 wakaba 1.2
685     ## TODO: ...
686    
687 wakaba 1.1 $Charset->{'windows-1252'}
688     = $IANACharset->{'windows-1252'}
689 wakaba 1.4 = __PACKAGE__->new ({
690 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
691 wakaba 1.1 iana_names => {
692 wakaba 1.4 'windows-1252' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
693 wakaba 1.1 },
694     is_html_ascii_superset => 1,
695 wakaba 1.4 });
696 wakaba 1.1
697     ## TODO: ...
698    
699 wakaba 1.7 $Charset->{'tis-620'}
700     = $IANACharset->{'tis-620'}
701     = __PACKAGE__->new ({
702     category => CHARSET_CATEGORY_BLOCK_SAFE,
703     iana_names => {
704     'tis-620' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
705     },
706     perl_names => {
707     'tis-620' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
708     ## NOTE: An alias of |iso-8859-11|.
709     },
710     is_html_ascii_superset => 1,
711     });
712    
713     $Charset->{'iso-8859-11'}
714     = $IANACharset->{'iso-8859-11'}
715     = __PACKAGE__->new ({
716     category => CHARSET_CATEGORY_BLOCK_SAFE,
717     iana_names => {
718     'iso-8859-11' => UNREGISTERED_CHARSET_NAME,
719     ## NOTE: The Web Thai encoding, i.e. windows-874.
720     },
721     perl_names => {
722     'windows-874' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
723     'web-thai' => UNREGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
724     },
725     fallback => {
726     "\x80" => "\x{20AC}",
727     "\x85" => "\x{2026}",
728     "\x91" => "\x{2018}",
729     "\x92" => "\x{2019}",
730     "\x93" => "\x{201C}",
731     "\x94" => "\x{201D}",
732     "\x95" => "\x{2022}",
733     "\x96" => "\x{2013}",
734     "\x97" => "\x{2014}",
735     },
736     is_html_ascii_superset => 1,
737     });
738    
739     $Charset->{'windows-874'}
740     = $IANACharset->{'windows-874'}
741     = __PACKAGE__->new ({
742     category => CHARSET_CATEGORY_BLOCK_SAFE,
743     iana_names => {
744     'windows-874' => UNREGISTERED_CHARSET_NAME,
745     },
746     perl_names => {
747     'windows-874' => REGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
748     },
749     is_html_ascii_superset => 1,
750     });
751    
## Constructor.  Blesses the given hash reference into the invocant
## class and returns it.  The hash is used as-is (it is not copied), so
## the caller's reference and the returned object are the same thing.
sub new ($$) {
  my ($class, $self) = @_;
  return bless $self, $class;
} # new
755    
756     ## NOTE: A class method
## Class method.  Returns the charset object registered in |$IANACharset|
## for the given name, compared ASCII case-insensitively.  If no object
## is registered yet, a new one is created that knows the name only as
## an unregistered charset name; it is stored in |$IANACharset| so that
## later lookups of the same name return the same object.
sub get_by_iana_name ($$) {
  my (undef, $label) = @_;

  ## Fold A-Z only (ASCII case-insensitive); other characters are kept.
  (my $key = $label) =~ tr/A-Z/a-z/;

  return $IANACharset->{$key} ||= __PACKAGE__->new ({
    iana_names => {
      $key => UNREGISTERED_CHARSET_NAME,
    },
  });
} # get_by_iana_name
769    
## Creates a decode handle object for this charset.
##
## Arguments:
##   $byte_stream - Stored as the |filehandle| member of the handle.
##   %opt         - |byte_buffer|: reference to a scalar whose current
##                  value is copied into the handle's byte buffer.
##                  |onerror|: code reference stored in the handle
##                  (defaults to a no-op).
##                  All options are also passed to |get_perl_encoding|,
##                  with |allow_semiconforming| forced on.
##
## Returns a two-item list: the handle object and a status bit set.
## |(undef, 0)| is returned when no Perl encoding can be found.
sub get_decode_handle ($$;%) {
  my ($self, $byte_stream, %opt) = @_;

  my $obj = {
    character_queue => [],
    filehandle => $byte_stream,
    charset => '', ## TODO: We set a charset name for input_encoding (when we get identify-by-URI nonsense away)
    byte_buffer => $opt{byte_buffer} ? ${$opt{byte_buffer}} : '', ## TODO: ref, instead of value, should be used
    onerror => $opt{onerror} || sub {},
    must_level => 'm',
    fact_level => 'm',
  };

  require Whatpm::Charset::DecodeHandle;
  ## |Encode::find_encoding| is called directly below, so load |Encode|
  ## here instead of relying on some other module having loaded it.
  require Encode;

  if ($self->{iana_names}->{'iso-2022-jp'}) {
    $obj->{state_2440} = 'gl-jis-1978';
    $obj->{state_2442} = 'gl-jis-1983';
    $obj->{state} = 'state_2842';
    ## The implementation modules are optional; whether they are
    ## available is determined by |find_encoding| below.
    eval {
      require Encode::GLJIS1978;
      require Encode::GLJIS1983;
    };
    if (Encode::find_encoding ($obj->{state_2440}) and
        Encode::find_encoding ($obj->{state_2442})) {
      return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP'),
              PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME);
    }
  } elsif ($self->{xml_names}->{'iso-2022-jp'}) {
    $obj->{state_2440} = 'gl-jis-1997-swapped';
    $obj->{state_2442} = 'gl-jis-1997';
    $obj->{state} = 'state_2842';
    eval {
      require Encode::GLJIS1997Swapped;
      require Encode::GLJIS1997;
    };
    if (Encode::find_encoding ($obj->{state_2440}) and
        Encode::find_encoding ($obj->{state_2442})) {
      return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP'),
              PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME);
    }
  }
  ## If the ISO-2022-JP component encodings are not available, fall
  ## through to the generic lookup below.

  my ($e, $e_status) = $self->get_perl_encoding
      (%opt, allow_semiconforming => 1);
  unless ($e) {
    ## TODO: no encoding error(?)
    return (undef, 0);
  }

  $obj->{perl_encoding_name} = $e->name;

  ## Objects created without a |category| are treated as having no
  ## category bit set.
  my $category = $self->{category} || 0;

  if ($category & CHARSET_CATEGORY_EUCJP) {
    return ((bless $obj, 'Whatpm::Charset::DecodeHandle::EUCJP'),
            $e_status);
  } elsif ($category & CHARSET_CATEGORY_SJIS) {
    return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ShiftJIS'),
            $e_status);
  }

  ## Any other category is served by the generic handle.  An encoding
  ## that is not block-safe is additionally flagged as a fallback.
  $e_status |= FALLBACK_ENCODING_IMPL
      unless $category & CHARSET_CATEGORY_BLOCK_SAFE;
  $obj->{bom_pattern} = $self->{bom_pattern};
  $obj->{fallback} = $self->{fallback};
  return ((bless $obj, 'Whatpm::Charset::DecodeHandle::Encode'),
          $e_status);
} # get_decode_handle
841    
## Finds a Perl |Encode| encoding object that implements this charset.
##
## Options (%opt):
##   allow_error_reporting - Try names flagged as error-reporting
##                           implementations first.
##   allow_semiconforming  - Accept names flagged as semi-conforming
##                           implementations in the first two steps.
##   allow_fallback        - As a last resort, try names flagged as
##                           fallback or semi-conforming, then the
##                           charset's IANA names.
##
## Returns a two-item list: the encoding object and its status bit set
## (|FALLBACK_ENCODING_IMPL| when found via an IANA name).  |(undef, 0)|
## is returned when nothing is found.
sub get_perl_encoding ($;%) {
  my ($self, %opt) = @_;

  require Encode;

  ## Loads the module that defines the encoding, if it is one of those
  ## known here.  The modules are optional: a failure to load must not
  ## abort the search, since another candidate (or a fallback) might
  ## still be usable, so the error is deliberately discarded and
  ## |find_encoding| decides whether the encoding is available.
  my $load_encode = sub {
    my $name = shift;
    local $@;
    eval {
      if ($name eq 'euc-jp-1997') {
        require Encode::EUCJP1997;
      } elsif ($name eq 'shift-jis-1997') {
        require Encode::ShiftJIS1997;
      } elsif ($name eq 'web-latin1') {
        require Whatpm::Charset::WebLatin1;
      } elsif ($name eq 'web-thai') {
        require Whatpm::Charset::WebThai;
      }
      1;
    };
    return;
  }; # $load_encode

  my $perl_names = $self->{perl_names} || {};

  ## Returns |($encoding, $status)| for the first Perl name whose status
  ## is accepted by the predicate and whose encoding is available, or an
  ## empty list.
  my $find_first = sub {
    my $accept = shift;
    for my $perl_name (keys %$perl_names) {
      my $perl_status = $perl_names->{$perl_name};
      next unless $accept->($perl_status);

      $load_encode->($perl_name);
      my $e = Encode::find_encoding ($perl_name);
      return ($e, $perl_status) if $e;
    }
    return;
  }; # $find_first

  ## True if the status is acceptable without |allow_fallback|.
  my $is_usable = sub {
    my $status = shift;
    return 0 if $status & FALLBACK_ENCODING_IMPL;
    return 0 if ($status & SEMICONFORMING_ENCODING_IMPL) and
        not $opt{allow_semiconforming};
    return 1;
  }; # $is_usable

  my @found;

  if ($opt{allow_error_reporting}) {
    @found = $find_first->(sub {
      ($_[0] & ERROR_REPORTING_ENCODING_IMPL) && $is_usable->($_[0]);
    });
    return @found if @found;
  }

  @found = $find_first->(sub {
    !($_[0] & ERROR_REPORTING_ENCODING_IMPL) && $is_usable->($_[0]);
  });
  return @found if @found;

  if ($opt{allow_fallback}) {
    ## NOTE: Semi-conforming implementations are not preferred to
    ## non-conforming ones here, since they can never be conforming
    ## without assistance from the caller, and a caller that provides
    ## it should set the |allow_semiconforming| option anyway.
    @found = $find_first->(sub {
      ($_[0] & FALLBACK_ENCODING_IMPL) ||
          ($_[0] & SEMICONFORMING_ENCODING_IMPL);
    });
    return @found if @found;

    for my $iana_name (keys %{$self->{iana_names} or {}}) {
      $load_encode->($iana_name);
      my $e = Encode::find_encoding ($iana_name);
      return ($e, FALLBACK_ENCODING_IMPL) if $e;
    }
  }

  return (undef, 0);
} # get_perl_encoding
918    
## Chooses one of the IANA names of this charset.
##
## A name flagged as preferred is returned as soon as it is seen.
## Otherwise a name flagged as primary wins; failing that, a name
## flagged as registered is taken over one with none of these flags.
## Returns |undef| if the charset has no IANA names.
sub get_iana_name ($) {
  my ($self) = @_;

  my $names = $self->{iana_names} || {};
  my ($primary_name, $other_name);

  for my $candidate (keys %$names) {
    my $flags = $names->{$candidate};

    return $candidate if $flags & PREFERRED_CHARSET_NAME;

    if ($flags & PRIMARY_CHARSET_NAME) {
      $primary_name = $candidate;
      next;
    }

    if ($flags & REGISTERED_CHARSET_NAME) {
      ## A registered name replaces whatever was remembered so far.
      $other_name = $candidate;
      next;
    }

    ## A name without any of the flags is kept only if nothing else has
    ## been remembered.
    $other_name ||= $candidate;
  }

  return $primary_name || $other_name;
} # get_iana_name
939    
## NOTE: A non-method function.
##
## Tests whether the argument consists of 1 to 40 characters, each in
## the range U+0020..U+007E.  Returns the result of the pattern match.
sub is_syntactically_valid_iana_charset_name ($) {
  my ($name) = @_;
  return $name =~ /\A[\x20-\x7E]{1,40}\z/;
} # is_syntactically_valid_iana_charset_name
945    
946     1;
947 wakaba 1.7 ## $Date: 2008/05/18 03:49:36 $
948 wakaba 1.1

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24