/[suikacvs]/messaging/manakai/lib/Message/Charset/Info.pm
Suika

Contents of /messaging/manakai/lib/Message/Charset/Info.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (hide annotations) (download)
Sun May 18 03:49:36 2008 UTC (16 years, 6 months ago) by wakaba
Branch: MAIN
Changes since 1.5: +51 -9 lines
++ manakai/lib/Message/Charset/ChangeLog	18 May 2008 03:49:07 -0000
2008-05-18  Wakaba  <wakaba@suika.fam.cx>

	* Info.pm (SEMICONFORMING_ENCODING_IMPL): New.  Distinguish full
	non-conformance from semi-conformance, where we can something
	to make the decoder conformant.

1 wakaba 1.1 package Message::Charset::Info;
2     use strict;
3 wakaba 1.6 our $VERSION=do{my @r=(q$Revision: 1.5 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.1
5     sub UNREGISTERED_CHARSET_NAME () { 0b1 }
6 wakaba 1.4 ## Names for non-standard encodings/implementations for Perl encodings
7 wakaba 1.1 sub REGISTERED_CHARSET_NAME () { 0b10 }
8 wakaba 1.4 ## Names for standard encodings for Perl encodings
9     sub PRIMARY_CHARSET_NAME () { 0b100 }
10 wakaba 1.1 ## "Name:" field for IANA names
11 wakaba 1.4 ## Canonical name for Perl encodings
12     sub PREFERRED_CHARSET_NAME () { 0b1000 }
13 wakaba 1.1 ## "preferred MIME name" for IANA names
14    
15 wakaba 1.4 sub FALLBACK_ENCODING_IMPL () { 0b10000 }
16     ## For Perl encodings: Not a name of the encoding, the encoding
17     ## for the name might be useful as a fallback when the correct
18     ## encoding is not supported.
19     sub NONCONFORMING_ENCODING_IMPL () { FALLBACK_ENCODING_IMPL }
20     ## For Perl encodings: Not a conforming implementation of the encoding,
21     ## though it seems that the intention was to implement that encoding.
22 wakaba 1.6 sub SEMICONFORMING_ENCODING_IMPL () { 0b1000000 }
23     ## For Perl encodings: The implementation itself (returned by
24     ## |get_perl_encoding|) is non-conforming. The decode handle
25     ## implementation (returned by |get_decode_handle|) is conforming.
26 wakaba 1.4 sub ERROR_REPORTING_ENCODING_IMPL () { 0b100000 }
27     ## For Perl encodings: Support error reporting via |manakai_onerror|
28 wakaba 1.6 ## handler when the encoding is handled with decode handle.
29 wakaba 1.4
30 wakaba 1.2 ## iana_status
31     sub STATUS_COMMON () { 0b1 }
32     sub STATUS_LIMITED_USE () { 0b10 }
33     sub STATUS_OBSOLETE () { 0b100 }
34    
35 wakaba 1.5 ## category
36     sub CHARSET_CATEGORY_BLOCK_SAFE () { 0b1 }
37     ## NOTE: Stateless
38     sub CHARSET_CATEGORY_EUCJP () { 0b10 }
39     sub CHARSET_CATEGORY_SJIS () { 0b100 }
40    
41 wakaba 1.1 ## iana_names
42 wakaba 1.5
43 wakaba 1.1 ## is_html_ascii_superset: "superset of US-ASCII (specifically, ANSI_X3.4-1968)
44     ## for bytes in the range 0x09 - 0x0D, 0x20, 0x21, 0x22, 0x26, 0x27,
45     ## 0x2C - 0x3F, 0x41 - 0x5A, and 0x61 - 0x7A" [HTML5]
46     ## is_ebcdic_based
47 wakaba 1.5 ## TODO: These flags are obsolete - should be replaced by category
48 wakaba 1.1
49     ## ISSUE: Shift_JIS is a superset of US-ASCII? ISO-2022-JP is?
50     ## ISSUE: 0x5F (_) should be added to the range?
51    
52     my $Charset;
53    
54     our $IANACharset;
55    
56     $Charset->{'us-ascii'}
57     = $IANACharset->{'ansi_x3.4-1968'}
58     = $IANACharset->{'iso-ir-6'}
59     = $IANACharset->{'ansi_x3.4-1986'}
60     = $IANACharset->{'iso_646.irv:1991'}
61     = $IANACharset->{'ascii'}
62     = $IANACharset->{'iso646-us'}
63     = $IANACharset->{'us-ascii'}
64     = $IANACharset->{'us'}
65     = $IANACharset->{'ibm367'}
66     = $IANACharset->{'cp367'}
67     = $IANACharset->{'csascii'}
68 wakaba 1.4 = __PACKAGE__->new ({
69 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
70 wakaba 1.1 iana_names => {
71 wakaba 1.4 'ansi_x3.4-1968' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
72 wakaba 1.1 'iso-ir-6' => REGISTERED_CHARSET_NAME,
73     'ansi_x3.4-1986' => REGISTERED_CHARSET_NAME,
74     'iso_646.irv:1991' => REGISTERED_CHARSET_NAME,
75     'ascii' => REGISTERED_CHARSET_NAME,
76     'iso646-us' => REGISTERED_CHARSET_NAME,
77 wakaba 1.4 'us-ascii' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
78 wakaba 1.1 'us' => REGISTERED_CHARSET_NAME,
79     'ibm367' => REGISTERED_CHARSET_NAME,
80     'cp367' => REGISTERED_CHARSET_NAME,
81     'csascii' => REGISTERED_CHARSET_NAME,
82     },
83     is_html_ascii_superset => 1,
84 wakaba 1.4 });
85 wakaba 1.1
86     $Charset->{'iso-8859-1'}
87     = $IANACharset->{'iso_8859-1:1987'}
88     = $IANACharset->{'iso-ir-100'}
89     = $IANACharset->{'iso_8859-1'}
90     = $IANACharset->{'iso-8859-1'}
91     = $IANACharset->{'latin1'}
92     = $IANACharset->{'l1'}
93     = $IANACharset->{'ibm819'}
94     = $IANACharset->{'cp819'}
95     = $IANACharset->{'csisolatin1'}
96 wakaba 1.4 = __PACKAGE__->new ({
97 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
98 wakaba 1.1 iana_names => {
99 wakaba 1.4 'iso_8859-1:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
100 wakaba 1.1 'iso-ir-100' => REGISTERED_CHARSET_NAME,
101     'iso_8859-1' => REGISTERED_CHARSET_NAME,
102 wakaba 1.4 'iso-8859-1' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
103 wakaba 1.1 'latin1' => REGISTERED_CHARSET_NAME,
104     'l1' => REGISTERED_CHARSET_NAME,
105     'ibm819' => REGISTERED_CHARSET_NAME,
106     'cp819' => REGISTERED_CHARSET_NAME,
107     'csisolatin1' => REGISTERED_CHARSET_NAME,
108     },
109     is_html_ascii_superset => 1,
110 wakaba 1.4 });
111 wakaba 1.1
112 wakaba 1.2 $Charset->{'iso-8859-2'}
113     = $IANACharset->{'iso_8859-2:1987'}
114     = $IANACharset->{'iso-ir-101'}
115     = $IANACharset->{'iso_8859-2'}
116     = $IANACharset->{'iso-8859-2'}
117     = $IANACharset->{'latin2'}
118     = $IANACharset->{'l2'}
119     = $IANACharset->{'csisolatin2'}
120 wakaba 1.4 = __PACKAGE__->new ({
121 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
122 wakaba 1.2 iana_names => {
123 wakaba 1.4 'iso_8859-2:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
124 wakaba 1.2 'iso-ir-101' => REGISTERED_CHARSET_NAME,
125     'iso_8859-2' => REGISTERED_CHARSET_NAME,
126 wakaba 1.4 'iso-8859-2' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
127 wakaba 1.2 'latin2' => REGISTERED_CHARSET_NAME,
128     'l2' => REGISTERED_CHARSET_NAME,
129     'csisolatin2' => REGISTERED_CHARSET_NAME,
130     },
131     is_html_ascii_superset => 1,
132 wakaba 1.4 });
133 wakaba 1.2
134     $Charset->{'iso-8859-3'}
135     = $IANACharset->{'iso_8859-3:1988'}
136     = $IANACharset->{'iso-ir-109'}
137     = $IANACharset->{'iso_8859-3'}
138     = $IANACharset->{'iso-8859-3'}
139     = $IANACharset->{'latin3'}
140     = $IANACharset->{'l3'}
141     = $IANACharset->{'csisolatin3'}
142 wakaba 1.4 = __PACKAGE__->new ({
143 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
144 wakaba 1.2 iana_names => {
145 wakaba 1.4 'iso_8859-3:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
146 wakaba 1.2 'iso-ir-109' => REGISTERED_CHARSET_NAME,
147     'iso_8859-3' => REGISTERED_CHARSET_NAME,
148 wakaba 1.4 'iso-8859-3' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
149 wakaba 1.2 'latin3' => REGISTERED_CHARSET_NAME,
150     'l3' => REGISTERED_CHARSET_NAME,
151     'csisolatin3' => REGISTERED_CHARSET_NAME,
152     },
153     is_html_ascii_superset => 1,
154 wakaba 1.4 });
155 wakaba 1.2
156     $Charset->{'iso-8859-4'}
157     = $IANACharset->{'iso_8859-4:1988'}
158     = $IANACharset->{'iso-ir-110'}
159     = $IANACharset->{'iso_8859-4'}
160     = $IANACharset->{'iso-8859-4'}
161     = $IANACharset->{'latin4'}
162     = $IANACharset->{'l4'}
163     = $IANACharset->{'csisolatin4'}
164 wakaba 1.4 = __PACKAGE__->new ({
165 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
166 wakaba 1.2 iana_names => {
167 wakaba 1.4 'iso_8859-4:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
168 wakaba 1.2 'iso-ir-110' => REGISTERED_CHARSET_NAME,
169     'iso_8859-4' => REGISTERED_CHARSET_NAME,
170 wakaba 1.4 'iso-8859-4' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
171 wakaba 1.2 'latin4' => REGISTERED_CHARSET_NAME,
172     'l4' => REGISTERED_CHARSET_NAME,
173     'csisolatin4' => REGISTERED_CHARSET_NAME,
174     },
175     is_html_ascii_superset => 1,
176 wakaba 1.4 });
177 wakaba 1.2
178     $Charset->{'iso-8859-5'}
179     = $IANACharset->{'iso_8859-5:1988'}
180     = $IANACharset->{'iso-ir-144'}
181     = $IANACharset->{'iso_8859-5'}
182     = $IANACharset->{'iso-8859-5'}
183     = $IANACharset->{'cyrillic'}
184     = $IANACharset->{'csisolatincyrillic'}
185 wakaba 1.4 = __PACKAGE__->new ({
186 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
187 wakaba 1.2 iana_names => {
188 wakaba 1.4 'iso_8859-5:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
189 wakaba 1.2 'iso-ir-144' => REGISTERED_CHARSET_NAME,
190     'iso_8859-5' => REGISTERED_CHARSET_NAME,
191 wakaba 1.4 'iso-8859-5' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
192 wakaba 1.2 'cyrillic' => REGISTERED_CHARSET_NAME,
193     'csisolatincyrillic' => REGISTERED_CHARSET_NAME,
194     },
195     is_html_ascii_superset => 1,
196 wakaba 1.4 });
197 wakaba 1.2
198     $Charset->{'iso-8859-6'}
199     = $IANACharset->{'iso_8859-6:1987'}
200     = $IANACharset->{'iso-ir-127'}
201     = $IANACharset->{'iso_8859-6'}
202     = $IANACharset->{'iso-8859-6'}
203     = $IANACharset->{'ecma-114'}
204     = $IANACharset->{'asmo-708'}
205     = $IANACharset->{'arabic'}
206     = $IANACharset->{'csisolatinarabic'}
207 wakaba 1.4 = __PACKAGE__->new ({
208 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
209 wakaba 1.2 iana_names => {
210 wakaba 1.4 'iso_8859-6:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
211 wakaba 1.2 'iso-ir-127' => REGISTERED_CHARSET_NAME,
212     'iso_8859-6' => REGISTERED_CHARSET_NAME,
213 wakaba 1.4 'iso-8859-6' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
214 wakaba 1.2 'ecma-114' => REGISTERED_CHARSET_NAME,
215     'asmo-708' => REGISTERED_CHARSET_NAME,
216     'arabic' => REGISTERED_CHARSET_NAME,
217     'csisolatinarabic' => REGISTERED_CHARSET_NAME,
218     },
219     is_html_ascii_superset => 1,
220     ## NOTE: 3/0..3/9 have different semantics from U+0030..0039,
221     ## but have same character names (maybe).
222 wakaba 1.3 ## NOTE: According to RFC 2046, charset left-hand half of "iso-8859-6"
223     ## is same as "us-ascii".
224 wakaba 1.4 });
225 wakaba 1.2
226     $Charset->{'iso-8859-7'}
227     = $IANACharset->{'iso_8859-7:1987'}
228     = $IANACharset->{'iso-ir-126'}
229     = $IANACharset->{'iso_8859-7'}
230     = $IANACharset->{'iso-8859-7'}
231     = $IANACharset->{'elot_928'}
232     = $IANACharset->{'ecma-118'}
233     = $IANACharset->{'greek'}
234     = $IANACharset->{'greek8'}
235     = $IANACharset->{'csisolatingreek'}
236 wakaba 1.4 = __PACKAGE__->new ({
237 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
238 wakaba 1.2 iana_names => {
239 wakaba 1.4 'iso_8859-7:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
240 wakaba 1.2 'iso-ir-126' => REGISTERED_CHARSET_NAME,
241     'iso_8859-7' => REGISTERED_CHARSET_NAME,
242 wakaba 1.4 'iso-8859-7' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
243 wakaba 1.2 'elot_928' => REGISTERED_CHARSET_NAME,
244     'ecma-118' => REGISTERED_CHARSET_NAME,
245     'greek' => REGISTERED_CHARSET_NAME,
246     'greek8' => REGISTERED_CHARSET_NAME,
247     'csisolatingreek' => REGISTERED_CHARSET_NAME,
248     },
249     is_html_ascii_superset => 1,
250 wakaba 1.4 });
251 wakaba 1.2
252     $Charset->{'iso-8859-8'}
253     = $IANACharset->{'iso_8859-8:1988'}
254     = $IANACharset->{'iso-ir-138'}
255     = $IANACharset->{'iso_8859-8'}
256     = $IANACharset->{'iso-8859-8'}
257     = $IANACharset->{'hebrew'}
258     = $IANACharset->{'csisolatinhebrew'}
259 wakaba 1.4 = __PACKAGE__->new ({
260 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
261 wakaba 1.2 iana_names => {
262 wakaba 1.4 'iso_8859-8:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
263 wakaba 1.2 'iso-ir-138' => REGISTERED_CHARSET_NAME,
264     'iso_8859-8' => REGISTERED_CHARSET_NAME,
265 wakaba 1.4 'iso-8859-8' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
266 wakaba 1.2 'hebrew' => REGISTERED_CHARSET_NAME,
267     'csisolatinhebrew' => REGISTERED_CHARSET_NAME,
268     },
269     is_html_ascii_superset => 1,
270 wakaba 1.4 });
271 wakaba 1.2
272     $Charset->{'iso-8859-9'}
273     = $IANACharset->{'iso_8859-9:1989'}
274     = $IANACharset->{'iso-ir-148'}
275     = $IANACharset->{'iso_8859-9'}
276     = $IANACharset->{'iso-8859-9'}
277     = $IANACharset->{'latin5'}
278     = $IANACharset->{'l5'}
279     = $IANACharset->{'csisolatin5'}
280 wakaba 1.4 = __PACKAGE__->new ({
281 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
282 wakaba 1.2 iana_names => {
283 wakaba 1.4 'iso_8859-9:1989' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
284 wakaba 1.2 'iso-ir-148' => REGISTERED_CHARSET_NAME,
285     'iso_8859-9' => REGISTERED_CHARSET_NAME,
286 wakaba 1.4 'iso-8859-9' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
287 wakaba 1.2 'latin5' => REGISTERED_CHARSET_NAME,
288     'l5' => REGISTERED_CHARSET_NAME,
289     'csisolatin5' => REGISTERED_CHARSET_NAME,
290     },
291     is_html_ascii_superset => 1,
292 wakaba 1.4 });
293 wakaba 1.2
294     $Charset->{'iso-8859-10'}
295     = $IANACharset->{'iso-8859-10'}
296     = $IANACharset->{'iso-ir-157'}
297     = $IANACharset->{'l6'}
298     = $IANACharset->{'iso_8859-10:1992'}
299     = $IANACharset->{'csisolatin6'}
300     = $IANACharset->{'latin6'}
301 wakaba 1.4 = __PACKAGE__->new ({
302 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
303 wakaba 1.2 iana_names => {
304 wakaba 1.4 'iso-8859-10' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
305 wakaba 1.2 'iso-ir-157' => REGISTERED_CHARSET_NAME,
306     'l6' => REGISTERED_CHARSET_NAME,
307     'iso_8859-10:1992' => REGISTERED_CHARSET_NAME,
308     'csisolatin6' => REGISTERED_CHARSET_NAME,
309     'latin6' => REGISTERED_CHARSET_NAME,
310     },
311     is_html_ascii_superset => 1,
312 wakaba 1.4 });
313 wakaba 1.2
314     $Charset->{'iso_6937-2-add'}
315     = $IANACharset->{'iso_6937-2-add'}
316     = $IANACharset->{'iso-ir-142'}
317     = $IANACharset->{'csisotextcomm'}
318 wakaba 1.4 = __PACKAGE__->new ({
319 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
320 wakaba 1.2 iana_names => {
321 wakaba 1.4 'iso_6937-2-add' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
322 wakaba 1.2 'iso-ir-142' => REGISTERED_CHARSET_NAME,
323     'csisotextcomm' => REGISTERED_CHARSET_NAME,
324     },
325     is_html_ascii_superset => 1,
326 wakaba 1.4 });
327 wakaba 1.2
328     $Charset->{'jis_x0201'}
329     = $IANACharset->{'jis_x0201'}
330     = $IANACharset->{'x0201'}
331     = $IANACharset->{'cshalfwidthkatakana'}
332 wakaba 1.4 = __PACKAGE__->new ({
333 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
334 wakaba 1.2 iana_names => {
335 wakaba 1.4 'jis_x0201' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
336 wakaba 1.2 'x0201' => REGISTERED_CHARSET_NAME,
337     'cshalfwidthkatakana' => REGISTERED_CHARSET_NAME,
338     },
339     is_html_ascii_superset => 1,
340 wakaba 1.4 });
341 wakaba 1.2
342     $Charset->{'jis_encoding'}
343     = $IANACharset->{'jis_encoding'}
344     = $IANACharset->{'csjisencoding'}
345 wakaba 1.4 = __PACKAGE__->new ({
346 wakaba 1.5 category => 0,
347 wakaba 1.2 iana_names => {
348 wakaba 1.4 'jis_encoding' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
349 wakaba 1.2 'csjisencoding' => REGISTERED_CHARSET_NAME,
350     },
351     ## NOTE: What is this?
352 wakaba 1.4 });
353 wakaba 1.1
354     $Charset->{'shift_jis'}
355     = $IANACharset->{'shift_jis'}
356     = $IANACharset->{'ms_kanji'}
357     = $IANACharset->{'csshiftjis'}
358 wakaba 1.4 = __PACKAGE__->new ({
359 wakaba 1.5 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,
360 wakaba 1.1 iana_names => {
361 wakaba 1.4 'shift_jis' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
362 wakaba 1.1 'ms_kanji' => REGISTERED_CHARSET_NAME,
363     'csshiftjis' => REGISTERED_CHARSET_NAME,
364     },
365 wakaba 1.5 perl_names => {
366 wakaba 1.6 'shift-jis-1997' => UNREGISTERED_CHARSET_NAME |
367     ERROR_REPORTING_ENCODING_IMPL,
368 wakaba 1.5 shiftjis => PRIMARY_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL,
369     ## NOTE: Unicode mapping is wrong.
370     },
371 wakaba 1.2 mime_text_suitable => 1,
372 wakaba 1.4 });
373 wakaba 1.1
374 wakaba 1.3 $Charset->{'x-sjis'}
375     = $IANACharset->{'x-sjis'}
376 wakaba 1.4 = __PACKAGE__->new ({
377 wakaba 1.5 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,
378 wakaba 1.3 iana_names => {
379     'x-sjis' => UNREGISTERED_CHARSET_NAME,
380     },
381 wakaba 1.6 perl_names => {
382     'shift-jis-1997' => UNREGISTERED_CHARSET_NAME | FALLBACK_ENCODING_IMPL |
383     ERROR_REPORTING_ENCODING_IMPL,
384     },
385 wakaba 1.3 mime_text_suitable => 1,
386 wakaba 1.4 });
387 wakaba 1.3
388 wakaba 1.5 $Charset->{shift_jisx0213}
389     = $IANACharset->{shift_jisx0213}
390     = __PACKAGE__->new ({
391     category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,
392     iana_names => {
393     shift_jisx0213 => UNREGISTERED_CHARSET_NAME,
394     },
395     perl_names => {
396     #shift_jisx0213 (non-standard - i don't know its conformance)
397     'shift-jis-1997' => FALLBACK_ENCODING_IMPL,
398     },
399     mime_text_suitable => 1,
400     });
401    
402 wakaba 1.1 $Charset->{'euc-jp'}
403     = $IANACharset->{'extended_unix_code_packed_format_for_japanese'}
404     = $IANACharset->{'cseucpkdfmtjapanese'}
405     = $IANACharset->{'euc-jp'}
406 wakaba 1.3 = $IANACharset->{'x-euc-jp'}
407 wakaba 1.4 = __PACKAGE__->new ({
408 wakaba 1.5 category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE,
409 wakaba 1.1 iana_names => {
410 wakaba 1.4 'extended_unix_code_packed_format_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
411 wakaba 1.1 'cseucpkdfmtjapanese' => REGISTERED_CHARSET_NAME,
412 wakaba 1.4 'euc-jp' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
413 wakaba 1.1 },
414 wakaba 1.5 perl_names => {
415     'euc-jp-1997' => UNREGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
416     ## NOTE: Though the IANA definition references the 1990 version
417     ## of EUC-JP, the 1997 version of JIS standard claims that the version
418     ## is same coded character set as the 1990 version, such that we
419     ## consider the EUC-JP 1990 version is same as the 1997 version.
420 wakaba 1.6 'euc-jp' => PREFERRED_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
421     ERROR_REPORTING_ENCODING_IMPL,
422 wakaba 1.5 ## NOTE: Unicode mapping is wrong.
423     },
424 wakaba 1.1 is_html_ascii_superset => 1,
425 wakaba 1.3 mime_text_suitable => 1,
426 wakaba 1.4 });
427 wakaba 1.3
428     $Charset->{'x-euc-jp'}
429     = $IANACharset->{'x-euc-jp'}
430 wakaba 1.4 = __PACKAGE__->new ({
431 wakaba 1.5 category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE,
432 wakaba 1.3 iana_names => {
433     'x-euc-jp' => UNREGISTERED_CHARSET_NAME,
434     },
435 wakaba 1.6 perl_names => {
436     'euc-jp-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
437     'euc-jp' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
438     },
439     is_html_ascii_superset => 1, is_html_ascii_superset => 1,
440 wakaba 1.3 mime_text_suitable => 1,
441 wakaba 1.4 });
442 wakaba 1.1
443 wakaba 1.2 $Charset->{'extended_unix_code_fixed_width_for_japanese'}
444     = $IANACharset->{'extended_unix_code_fixed_width_for_japanese'}
445     = $IANACharset->{'cseucfixwidjapanese'}
446 wakaba 1.4 = __PACKAGE__->new ({
447 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
448 wakaba 1.2 iana_names => {
449 wakaba 1.4 'extended_unix_code_fixed_width_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
450 wakaba 1.2 'cseucfixwidjapanese' => REGISTERED_CHARSET_NAME,
451     },
452 wakaba 1.4 });
453 wakaba 1.2
454 wakaba 1.1 ## TODO: ...
455    
456 wakaba 1.2 $Charset->{'euc-kr'}
457     = $IANACharset->{'euc-kr'}
458     = $IANACharset->{'cseuckr'}
459 wakaba 1.4 = __PACKAGE__->new ({
460 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
461 wakaba 1.2 iana_names => {
462 wakaba 1.4 'euc-kr' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
463 wakaba 1.2 'cseuckr' => REGISTERED_CHARSET_NAME,
464     },
465     is_html_ascii_superset => 1,
466 wakaba 1.4 });
467 wakaba 1.2
468 wakaba 1.1 $Charset->{'iso-2022-jp'}
469     = $IANACharset->{'iso-2022-jp'}
470     = $IANACharset->{'csiso2022jp'}
471 wakaba 1.3 = $IANACharset->{'iso2022jp'}
472     = $IANACharset->{'junet-code'}
473 wakaba 1.4 = __PACKAGE__->new ({
474 wakaba 1.5 category => 0,
475 wakaba 1.1 iana_names => {
476 wakaba 1.4 'iso-2022-jp' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
477 wakaba 1.1 'csiso2022jp' => REGISTERED_CHARSET_NAME,
478 wakaba 1.3 'iso2022jp' => UNREGISTERED_CHARSET_NAME,
479     'junet-code' => UNREGISTERED_CHARSET_NAME,
480 wakaba 1.1 },
481 wakaba 1.2 mime_text_suitable => 1,
482 wakaba 1.4 });
483 wakaba 1.2
484     $Charset->{'iso-2022-jp-2'}
485     = $IANACharset->{'iso-2022-jp-2'}
486     = $IANACharset->{'csiso2022jp2'}
487 wakaba 1.4 = __PACKAGE__->new ({
488 wakaba 1.5 category => 0,
489 wakaba 1.2 iana_names => {
490 wakaba 1.4 'iso-2022-jp-2' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
491 wakaba 1.2 'csiso2022jp2' => REGISTERED_CHARSET_NAME,
492     },
493     mime_text_suitable => 1,
494 wakaba 1.4 });
495 wakaba 1.1
496     ## TODO: ...
497    
498     $Charset->{'utf-8'}
499     = $IANACharset->{'utf-8'}
500 wakaba 1.3 = $IANACharset->{'x-utf-8'}
501 wakaba 1.4 = __PACKAGE__->new ({
502 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
503 wakaba 1.1 iana_names => {
504 wakaba 1.4 'utf-8' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
505 wakaba 1.6 ## NOTE: IANA name "utf-8" references RFC 3629. According to the RFC,
506     ## the definitive definition is one specified in the Unicode Standard.
507 wakaba 1.3 'x-utf-8' => UNREGISTERED_CHARSET_NAME,
508 wakaba 1.1 },
509 wakaba 1.6 perl_names => {
510     'utf-8-strict' => PRIMARY_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
511     ERROR_REPORTING_ENCODING_IMPL,
512     ## NOTE: It does not support non-Unicode UCS characters (conforming).
513     ## It does detect illegal sequences (conforming).
514     ## It does not support surrpgate pairs (conforming).
515     ## It does not support BOMs (non-conforming).
516     },
517     bom_pattern => qr/\xEF\xBB\xBF/,
518 wakaba 1.1 is_html_ascii_superset => 1,
519 wakaba 1.3 mime_text_suitable => 1,
520 wakaba 1.4 });
521 wakaba 1.3
522     $Charset->{'utf-8n'}
523     = $IANACharset->{'utf-8n'}
524 wakaba 1.4 = __PACKAGE__->new ({
525 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
526 wakaba 1.3 iana_names => {
527     'utf-8n' => UNREGISTERED_CHARSET_NAME,
528 wakaba 1.6 ## NOTE: Is there any normative definition for the charset?
529     ## What variant of UTF-8 should we use for the charset?
530     },
531     perl_names => {
532     'utf-8-strict' => PRIMARY_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
533 wakaba 1.3 },
534     is_html_ascii_superset => 1,
535     mime_text_suitable => 1,
536 wakaba 1.4 });
537 wakaba 1.1
538     ## TODO: ...
539    
540 wakaba 1.2 $Charset->{'gbk'}
541     = $IANACharset->{'gbk'}
542     = $IANACharset->{'cp936'}
543     = $IANACharset->{'ms936'}
544     = $IANACharset->{'windows-936'}
545 wakaba 1.4 = __PACKAGE__->new ({
546 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
547 wakaba 1.2 iana_names => {
548 wakaba 1.4 'gbk' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
549 wakaba 1.2 'cp936' => REGISTERED_CHARSET_NAME,
550     'ms936' => REGISTERED_CHARSET_NAME,
551     'windows-936' => REGISTERED_CHARSET_NAME,
552     },
553     iana_status => STATUS_COMMON | STATUS_OBSOLETE,
554     mime_text_suitable => 1,
555 wakaba 1.4 });
556 wakaba 1.2
557     $Charset->{'gb18030'}
558     = $IANACharset->{'gb18030'}
559 wakaba 1.4 = __PACKAGE__->new ({
560 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
561 wakaba 1.2 iana_names => {
562 wakaba 1.4 'gb18030' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
563 wakaba 1.2 },
564     iana_status => STATUS_COMMON,
565     mime_text_suitable => 1,
566 wakaba 1.4 });
567 wakaba 1.2
568     ## TODO: ...
569    
570 wakaba 1.1 $Charset->{'utf-16be'}
571     = $IANACharset->{'utf-16be'}
572 wakaba 1.4 = __PACKAGE__->new ({
573 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
574 wakaba 1.1 iana_names => {
575 wakaba 1.4 'utf-16be' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
576 wakaba 1.1 },
577 wakaba 1.4 });
578 wakaba 1.1
579     $Charset->{'utf-16le'}
580     = $IANACharset->{'utf-16le'}
581 wakaba 1.4 = __PACKAGE__->new ({
582 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
583 wakaba 1.1 iana_names => {
584 wakaba 1.4 'utf-16le' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
585 wakaba 1.1 },
586 wakaba 1.4 });
587 wakaba 1.1
588     $Charset->{'utf-16'}
589     = $IANACharset->{'utf-16'}
590 wakaba 1.4 = __PACKAGE__->new ({
591 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
592 wakaba 1.1 iana_names => {
593 wakaba 1.4 'utf-16' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
594 wakaba 1.1 },
595 wakaba 1.4 });
596 wakaba 1.1
597     ## TODO: ...
598    
599 wakaba 1.2 $Charset->{'windows-31j'}
600     = $IANACharset->{'windows-31j'}
601     = $IANACharset->{'cswindows31j'}
602 wakaba 1.4 = __PACKAGE__->new ({
603 wakaba 1.5 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,
604 wakaba 1.2 iana_names => {
605 wakaba 1.4 'windows-31j' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
606 wakaba 1.2 'cswindows31j' => REGISTERED_CHARSET_NAME,
607     },
608     iana_status => STATUS_LIMITED_USE, # maybe
609     mime_text_suitable => 1,
610 wakaba 1.4 });
611 wakaba 1.2
612     $Charset->{'gb2312'}
613     = $IANACharset->{'gb2312'}
614     = $IANACharset->{'csgb2312'}
615 wakaba 1.4 = __PACKAGE__->new ({
616 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
617 wakaba 1.2 iana_names => {
618 wakaba 1.4 'gb2312' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
619 wakaba 1.2 'csgb2312' => REGISTERED_CHARSET_NAME,
620     },
621     is_html_ascii_superset => 1,
622     mime_text_suitable => 1,
623 wakaba 1.4 });
624 wakaba 1.2
625     $Charset->{'big5'}
626     = $IANACharset->{'big5'}
627     = $IANACharset->{'csbig5'}
628 wakaba 1.4 = __PACKAGE__->new ({
629 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
630 wakaba 1.2 iana_names => {
631 wakaba 1.4 'big5' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
632 wakaba 1.2 'csbig5' => REGISTERED_CHARSET_NAME,
633     },
634     mime_text_suitable => 1,
635 wakaba 1.4 });
636 wakaba 1.2
637     ## TODO: ...
638    
639     $Charset->{'big5-hkscs'}
640     = $IANACharset->{'big5-hkscs'}
641 wakaba 1.4 = __PACKAGE__->new ({
642 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
643 wakaba 1.2 iana_names => {
644 wakaba 1.4 'big5-hkscs' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
645 wakaba 1.2 },
646     mime_text_suitable => 1,
647 wakaba 1.4 });
648 wakaba 1.2
649     ## TODO: ...
650    
651 wakaba 1.1 $Charset->{'windows-1252'}
652     = $IANACharset->{'windows-1252'}
653 wakaba 1.4 = __PACKAGE__->new ({
654 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
655 wakaba 1.1 iana_names => {
656 wakaba 1.4 'windows-1252' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
657 wakaba 1.1 },
658     is_html_ascii_superset => 1,
659 wakaba 1.4 });
660 wakaba 1.1
661     ## TODO: ...
662    
663 wakaba 1.4 sub new ($$) {
664     return bless $_[1], $_[0];
665     } # new
666    
667     ## NOTE: A class method
668     sub get_by_iana_name ($$) {
669     my $name = $_[1];
670     $name =~ tr/A-Z/a-z/; ## ASCII case-insensitive
671     unless ($IANACharset->{$name}) {
672     $IANACharset->{$name} = __PACKAGE__->new ({
673     iana_names => {
674     $name => UNREGISTERED_CHARSET_NAME,
675     },
676     });
677     }
678     return $IANACharset->{$name};
679     } # get_by_iana_name
680    
681 wakaba 1.5 sub get_decode_handle ($$;%) {
682     my $self = shift;
683     my $byte_stream = shift;
684     my %opt = @_;
685    
686     my $obj = {
687     character_queue => [],
688     filehandle => $byte_stream,
689     charset => '', ## TODO: We set a charset name for input_encoding (when we get identify-by-URI nonsense away)
690     byte_buffer => $opt{byte_buffer} ? ${$opt{byte_buffer}} : '', ## TODO: ref, instead of value, should be used
691     onerror => $opt{onerror} || sub {},
692     };
693    
694     require Whatpm::Charset::DecodeHandle;
695     if ($self->{iana_names}->{'iso-2022-jp'}) {
696     $obj->{state_2440} = 'gl-jis-1978';
697     $obj->{state_2442} = 'gl-jis-1983';
698     $obj->{state} = 'state_2842';
699     eval {
700     require Encode::GLJIS1978;
701     require Encode::GLJIS1983;
702     };
703     if (Encode::find_encoding ($obj->{state_2440}) and
704     Encode::find_encoding ($obj->{state_2442})) {
705     return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP'),
706     PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME);
707     }
708     } elsif ($self->{xml_names}->{'iso-2022-jp'}) {
709     $obj->{state_2440} = 'gl-jis-1997-swapped';
710     $obj->{state_2442} = 'gl-jis-1997';
711     $obj->{state} = 'state_2842';
712     eval {
713     require Encode::GLJIS1997Swapped;
714     require Encode::GLJIS1997;
715     };
716     if (Encode::find_encoding ($obj->{state_2440}) and
717     Encode::find_encoding ($obj->{state_2442})) {
718     return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP'),
719     PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME);
720     }
721     }
722    
723 wakaba 1.6 my ($e, $e_status) = $self->get_perl_encoding
724     (%opt, allow_semiconforming => 1);
725 wakaba 1.5 if ($e) {
726     $obj->{perl_encoding_name} = $e->name;
727     if ($self->{category} & CHARSET_CATEGORY_EUCJP) {
728     return ((bless $obj, 'Whatpm::Charset::DecodeHandle::EUCJP'),
729     $e_status);
730     } elsif ($self->{category} & CHARSET_CATEGORY_SJIS) {
731     return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ShiftJIS'),
732     $e_status);
733     } elsif ($self->{category} & CHARSET_CATEGORY_BLOCK_SAFE) {
734 wakaba 1.6 $obj->{bom_pattern} = $self->{bom_pattern};
735 wakaba 1.5 return ((bless $obj, 'Whatpm::Charset::DecodeHandle::Encode'),
736     $e_status);
737     } else {
738     ## TODO: no encoding error (?)
739     return (undef, 0);
740     }
741     } else {
742     ## TODO: no encoding error(?)
743     return (undef, 0);
744     }
745     } # get_decode_handle
746    
747 wakaba 1.4 sub get_perl_encoding ($;%) {
748     my ($self, %opt) = @_;
749    
750     require Encode;
751 wakaba 1.5 my $load_encode = sub {
752     my $name = shift;
753     if ($name eq 'euc-jp-1997') {
754     require Encode::EUCJP1997;
755     } elsif ($name eq 'shift-jis-1997') {
756     require Encode::ShiftJIS1997;
757     }
758     }; # $load_encode
759 wakaba 1.4
760     if ($opt{allow_error_reporting}) {
761     for my $perl_name (keys %{$self->{perl_names} or {}}) {
762     my $perl_status = $self->{perl_names}->{$perl_name};
763     next unless $perl_status & ERROR_REPORTING_ENCODING_IMPL;
764 wakaba 1.5 next if $perl_status & FALLBACK_ENCODING_IMPL;
765 wakaba 1.6 next if $perl_status & SEMICONFORMING_ENCODING_IMPL and
766     not $opt{allow_semiconforming};
767 wakaba 1.4
768 wakaba 1.5 $load_encode->($perl_name);
769 wakaba 1.4 my $e = Encode::find_encoding ($perl_name);
770     if ($e) {
771     return ($e, $perl_status);
772     }
773     }
774     }
775    
776     for my $perl_name (keys %{$self->{perl_names} or {}}) {
777     my $perl_status = $self->{perl_names}->{$perl_name};
778     next if $perl_status & ERROR_REPORTING_ENCODING_IMPL;
779     next if $perl_status & FALLBACK_ENCODING_IMPL;
780 wakaba 1.6 next if $perl_status & SEMICONFORMING_ENCODING_IMPL and
781     not $opt{allow_semiconforming};
782 wakaba 1.5
783     $load_encode->($perl_name);
784 wakaba 1.4 my $e = Encode::find_encoding ($perl_name);
785     if ($e) {
786     return ($e, $perl_status);
787     }
788     }
789    
790     if ($opt{allow_fallback}) {
791     for my $perl_name (keys %{$self->{perl_names} or {}}) {
792     my $perl_status = $self->{perl_names}->{$perl_name};
793 wakaba 1.6 next unless $perl_status & FALLBACK_ENCODING_IMPL or
794     $perl_status & SEMICONFORMING_ENCODING_IMPL;
795     ## NOTE: We don't prefer semi-conforming implementations to
796     ## non-conforming implementations, since semi-conforming implementations
797     ## will never be conforming without assist of the callee, and in such
798     ## cases the callee should set the |allow_semiconforming| option upon
799     ## the invocation of the method anyway.
800    
801 wakaba 1.5 $load_encode->($perl_name);
802 wakaba 1.4 my $e = Encode::find_encoding ($perl_name);
803     if ($e) {
804     return ($e, $perl_status);
805     }
806     }
807    
808     for my $iana_name (keys %{$self->{iana_names} or {}}) {
809 wakaba 1.5 $load_encode->($iana_name);
810 wakaba 1.4 my $e = Encode::find_encoding ($iana_name);
811     if ($e) {
812     return ($e, FALLBACK_ENCODING_IMPL);
813     }
814     }
815     }
816    
817     return (undef, 0);
818     } # get_perl_encoding
819    
820     sub get_iana_name ($) {
821     my $self = shift;
822    
823     my $primary;
824     my $other;
825     for my $iana_name (keys %{$self->{iana_names} or {}}) {
826     my $name_status = $self->{iana_names}->{$iana_name};
827     if ($name_status & PREFERRED_CHARSET_NAME) {
828     return $iana_name;
829     } elsif ($name_status & PRIMARY_CHARSET_NAME) {
830     $primary = $iana_name;
831     } elsif ($name_status & REGISTERED_CHARSET_NAME) {
832     $other = $iana_name;
833     } else {
834     $other ||= $iana_name;
835     }
836     }
837    
838     return $primary || $other;
839     } # get_iana_name
840    
841     ## NOTE: A non-method function
842 wakaba 1.3 sub is_syntactically_valid_iana_charset_name ($) {
843 wakaba 1.1 my $name = shift;
844     return $name =~ /\A[\x20-\x7E]{1,40}\z/;
845     } # is_suntactically_valid_iana_charset_name
846    
847     1;
848 wakaba 1.6 ## $Date: 2008/05/17 12:32:14 $
849 wakaba 1.1

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24