/[suikacvs]/messaging/manakai/lib/Message/Charset/Info.pm
Suika

Contents of /messaging/manakai/lib/Message/Charset/Info.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (hide annotations) (download)
Sat May 17 12:32:14 2008 UTC (16 years, 6 months ago) by wakaba
Branch: MAIN
Changes since 1.4: +148 -2 lines
++ manakai/lib/Message/Charset/ChangeLog	17 May 2008 12:32:10 -0000
2008-05-17  Wakaba  <wakaba@suika.fam.cx>

	* Info.pm: Charset definitions now have |category| properties.
	Property |perl_names| is added to |shift_jis| and |euc-jp|.
	Charset definition for |shift_jisx0213| is added.
	(get_decode_handle): New.
	(get_perl_encoding): Auto-load encode modules for internal use
	by DecodeHandle (part of Whatpm module).

1 wakaba 1.1 package Message::Charset::Info;
2     use strict;
3 wakaba 1.4 our $VERSION=do{my @r=(q$Revision: 1.3 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.1
5     sub UNREGISTERED_CHARSET_NAME () { 0b1 }
6 wakaba 1.4 ## Names for non-standard encodings/implementations for Perl encodings
7 wakaba 1.1 sub REGISTERED_CHARSET_NAME () { 0b10 }
8 wakaba 1.4 ## Names for standard encodings for Perl encodings
9     sub PRIMARY_CHARSET_NAME () { 0b100 }
10 wakaba 1.1 ## "Name:" field for IANA names
11 wakaba 1.4 ## Canonical name for Perl encodings
12     sub PREFERRED_CHARSET_NAME () { 0b1000 }
13 wakaba 1.1 ## "preferred MIME name" for IANA names
14    
15 wakaba 1.4 sub FALLBACK_ENCODING_IMPL () { 0b10000 }
16     ## For Perl encodings: Not a name of the encoding, the encoding
17     ## for the name might be useful as a fallback when the correct
18     ## encoding is not supported.
19     sub NONCONFORMING_ENCODING_IMPL () { FALLBACK_ENCODING_IMPL }
20     ## For Perl encodings: Not a conforming implementation of the encoding,
21     ## though it seems that the intention was to implement that encoding.
22     sub ERROR_REPORTING_ENCODING_IMPL () { 0b100000 }
23     ## For Perl encodings: Support error reporting via |manakai_onerror|
24     ## handler.
25    
26 wakaba 1.2 ## iana_status
27     sub STATUS_COMMON () { 0b1 }
28     sub STATUS_LIMITED_USE () { 0b10 }
29     sub STATUS_OBSOLETE () { 0b100 }
30    
31 wakaba 1.5 ## category
32     sub CHARSET_CATEGORY_BLOCK_SAFE () { 0b1 }
33     ## NOTE: Stateless
34     sub CHARSET_CATEGORY_EUCJP () { 0b10 }
35     sub CHARSET_CATEGORY_SJIS () { 0b100 }
36    
37 wakaba 1.1 ## iana_names
38 wakaba 1.5
39 wakaba 1.1 ## is_html_ascii_superset: "superset of US-ASCII (specifically, ANSI_X3.4-1968)
40     ## for bytes in the range 0x09 - 0x0D, 0x20, 0x21, 0x22, 0x26, 0x27,
41     ## 0x2C - 0x3F, 0x41 - 0x5A, and 0x61 - 0x7A" [HTML5]
42     ## is_ebcdic_based
43 wakaba 1.5 ## TODO: These flags are obsolete - should be replaced by category
44 wakaba 1.1
45     ## ISSUE: Shift_JIS is a superset of US-ASCII? ISO-2022-JP is?
46     ## ISSUE: 0x5F (_) should be added to the range?
47    
48     my $Charset;
49    
50     our $IANACharset;
51    
52     $Charset->{'us-ascii'}
53     = $IANACharset->{'ansi_x3.4-1968'}
54     = $IANACharset->{'iso-ir-6'}
55     = $IANACharset->{'ansi_x3.4-1986'}
56     = $IANACharset->{'iso_646.irv:1991'}
57     = $IANACharset->{'ascii'}
58     = $IANACharset->{'iso646-us'}
59     = $IANACharset->{'us-ascii'}
60     = $IANACharset->{'us'}
61     = $IANACharset->{'ibm367'}
62     = $IANACharset->{'cp367'}
63     = $IANACharset->{'csascii'}
64 wakaba 1.4 = __PACKAGE__->new ({
65 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
66 wakaba 1.1 iana_names => {
67 wakaba 1.4 'ansi_x3.4-1968' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
68 wakaba 1.1 'iso-ir-6' => REGISTERED_CHARSET_NAME,
69     'ansi_x3.4-1986' => REGISTERED_CHARSET_NAME,
70     'iso_646.irv:1991' => REGISTERED_CHARSET_NAME,
71     'ascii' => REGISTERED_CHARSET_NAME,
72     'iso646-us' => REGISTERED_CHARSET_NAME,
73 wakaba 1.4 'us-ascii' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
74 wakaba 1.1 'us' => REGISTERED_CHARSET_NAME,
75     'ibm367' => REGISTERED_CHARSET_NAME,
76     'cp367' => REGISTERED_CHARSET_NAME,
77     'csascii' => REGISTERED_CHARSET_NAME,
78     },
79     is_html_ascii_superset => 1,
80 wakaba 1.4 });
81 wakaba 1.1
82     $Charset->{'iso-8859-1'}
83     = $IANACharset->{'iso_8859-1:1987'}
84     = $IANACharset->{'iso-ir-100'}
85     = $IANACharset->{'iso_8859-1'}
86     = $IANACharset->{'iso-8859-1'}
87     = $IANACharset->{'latin1'}
88     = $IANACharset->{'l1'}
89     = $IANACharset->{'ibm819'}
90     = $IANACharset->{'cp819'}
91     = $IANACharset->{'csisolatin1'}
92 wakaba 1.4 = __PACKAGE__->new ({
93 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
94 wakaba 1.1 iana_names => {
95 wakaba 1.4 'iso_8859-1:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
96 wakaba 1.1 'iso-ir-100' => REGISTERED_CHARSET_NAME,
97     'iso_8859-1' => REGISTERED_CHARSET_NAME,
98 wakaba 1.4 'iso-8859-1' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
99 wakaba 1.1 'latin1' => REGISTERED_CHARSET_NAME,
100     'l1' => REGISTERED_CHARSET_NAME,
101     'ibm819' => REGISTERED_CHARSET_NAME,
102     'cp819' => REGISTERED_CHARSET_NAME,
103     'csisolatin1' => REGISTERED_CHARSET_NAME,
104     },
105     is_html_ascii_superset => 1,
106 wakaba 1.4 });
107 wakaba 1.1
108 wakaba 1.2 $Charset->{'iso-8859-2'}
109     = $IANACharset->{'iso_8859-2:1987'}
110     = $IANACharset->{'iso-ir-101'}
111     = $IANACharset->{'iso_8859-2'}
112     = $IANACharset->{'iso-8859-2'}
113     = $IANACharset->{'latin2'}
114     = $IANACharset->{'l2'}
115     = $IANACharset->{'csisolatin2'}
116 wakaba 1.4 = __PACKAGE__->new ({
117 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
118 wakaba 1.2 iana_names => {
119 wakaba 1.4 'iso_8859-2:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
120 wakaba 1.2 'iso-ir-101' => REGISTERED_CHARSET_NAME,
121     'iso_8859-2' => REGISTERED_CHARSET_NAME,
122 wakaba 1.4 'iso-8859-2' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
123 wakaba 1.2 'latin2' => REGISTERED_CHARSET_NAME,
124     'l2' => REGISTERED_CHARSET_NAME,
125     'csisolatin2' => REGISTERED_CHARSET_NAME,
126     },
127     is_html_ascii_superset => 1,
128 wakaba 1.4 });
129 wakaba 1.2
130     $Charset->{'iso-8859-3'}
131     = $IANACharset->{'iso_8859-3:1988'}
132     = $IANACharset->{'iso-ir-109'}
133     = $IANACharset->{'iso_8859-3'}
134     = $IANACharset->{'iso-8859-3'}
135     = $IANACharset->{'latin3'}
136     = $IANACharset->{'l3'}
137     = $IANACharset->{'csisolatin3'}
138 wakaba 1.4 = __PACKAGE__->new ({
139 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
140 wakaba 1.2 iana_names => {
141 wakaba 1.4 'iso_8859-3:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
142 wakaba 1.2 'iso-ir-109' => REGISTERED_CHARSET_NAME,
143     'iso_8859-3' => REGISTERED_CHARSET_NAME,
144 wakaba 1.4 'iso-8859-3' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
145 wakaba 1.2 'latin3' => REGISTERED_CHARSET_NAME,
146     'l3' => REGISTERED_CHARSET_NAME,
147     'csisolatin3' => REGISTERED_CHARSET_NAME,
148     },
149     is_html_ascii_superset => 1,
150 wakaba 1.4 });
151 wakaba 1.2
152     $Charset->{'iso-8859-4'}
153     = $IANACharset->{'iso_8859-4:1988'}
154     = $IANACharset->{'iso-ir-110'}
155     = $IANACharset->{'iso_8859-4'}
156     = $IANACharset->{'iso-8859-4'}
157     = $IANACharset->{'latin4'}
158     = $IANACharset->{'l4'}
159     = $IANACharset->{'csisolatin4'}
160 wakaba 1.4 = __PACKAGE__->new ({
161 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
162 wakaba 1.2 iana_names => {
163 wakaba 1.4 'iso_8859-4:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
164 wakaba 1.2 'iso-ir-110' => REGISTERED_CHARSET_NAME,
165     'iso_8859-4' => REGISTERED_CHARSET_NAME,
166 wakaba 1.4 'iso-8859-4' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
167 wakaba 1.2 'latin4' => REGISTERED_CHARSET_NAME,
168     'l4' => REGISTERED_CHARSET_NAME,
169     'csisolatin4' => REGISTERED_CHARSET_NAME,
170     },
171     is_html_ascii_superset => 1,
172 wakaba 1.4 });
173 wakaba 1.2
174     $Charset->{'iso-8859-5'}
175     = $IANACharset->{'iso_8859-5:1988'}
176     = $IANACharset->{'iso-ir-144'}
177     = $IANACharset->{'iso_8859-5'}
178     = $IANACharset->{'iso-8859-5'}
179     = $IANACharset->{'cyrillic'}
180     = $IANACharset->{'csisolatincyrillic'}
181 wakaba 1.4 = __PACKAGE__->new ({
182 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
183 wakaba 1.2 iana_names => {
184 wakaba 1.4 'iso_8859-5:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
185 wakaba 1.2 'iso-ir-144' => REGISTERED_CHARSET_NAME,
186     'iso_8859-5' => REGISTERED_CHARSET_NAME,
187 wakaba 1.4 'iso-8859-5' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
188 wakaba 1.2 'cyrillic' => REGISTERED_CHARSET_NAME,
189     'csisolatincyrillic' => REGISTERED_CHARSET_NAME,
190     },
191     is_html_ascii_superset => 1,
192 wakaba 1.4 });
193 wakaba 1.2
194     $Charset->{'iso-8859-6'}
195     = $IANACharset->{'iso_8859-6:1987'}
196     = $IANACharset->{'iso-ir-127'}
197     = $IANACharset->{'iso_8859-6'}
198     = $IANACharset->{'iso-8859-6'}
199     = $IANACharset->{'ecma-114'}
200     = $IANACharset->{'asmo-708'}
201     = $IANACharset->{'arabic'}
202     = $IANACharset->{'csisolatinarabic'}
203 wakaba 1.4 = __PACKAGE__->new ({
204 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
205 wakaba 1.2 iana_names => {
206 wakaba 1.4 'iso_8859-6:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
207 wakaba 1.2 'iso-ir-127' => REGISTERED_CHARSET_NAME,
208     'iso_8859-6' => REGISTERED_CHARSET_NAME,
209 wakaba 1.4 'iso-8859-6' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
210 wakaba 1.2 'ecma-114' => REGISTERED_CHARSET_NAME,
211     'asmo-708' => REGISTERED_CHARSET_NAME,
212     'arabic' => REGISTERED_CHARSET_NAME,
213     'csisolatinarabic' => REGISTERED_CHARSET_NAME,
214     },
215     is_html_ascii_superset => 1,
216     ## NOTE: 3/0..3/9 have different semantics from U+0030..0039,
217     ## but have same character names (maybe).
218 wakaba 1.3 ## NOTE: According to RFC 2046, charset left-hand half of "iso-8859-6"
219     ## is same as "us-ascii".
220 wakaba 1.4 });
221 wakaba 1.2
222     $Charset->{'iso-8859-7'}
223     = $IANACharset->{'iso_8859-7:1987'}
224     = $IANACharset->{'iso-ir-126'}
225     = $IANACharset->{'iso_8859-7'}
226     = $IANACharset->{'iso-8859-7'}
227     = $IANACharset->{'elot_928'}
228     = $IANACharset->{'ecma-118'}
229     = $IANACharset->{'greek'}
230     = $IANACharset->{'greek8'}
231     = $IANACharset->{'csisolatingreek'}
232 wakaba 1.4 = __PACKAGE__->new ({
233 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
234 wakaba 1.2 iana_names => {
235 wakaba 1.4 'iso_8859-7:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
236 wakaba 1.2 'iso-ir-126' => REGISTERED_CHARSET_NAME,
237     'iso_8859-7' => REGISTERED_CHARSET_NAME,
238 wakaba 1.4 'iso-8859-7' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
239 wakaba 1.2 'elot_928' => REGISTERED_CHARSET_NAME,
240     'ecma-118' => REGISTERED_CHARSET_NAME,
241     'greek' => REGISTERED_CHARSET_NAME,
242     'greek8' => REGISTERED_CHARSET_NAME,
243     'csisolatingreek' => REGISTERED_CHARSET_NAME,
244     },
245     is_html_ascii_superset => 1,
246 wakaba 1.4 });
247 wakaba 1.2
248     $Charset->{'iso-8859-8'}
249     = $IANACharset->{'iso_8859-8:1988'}
250     = $IANACharset->{'iso-ir-138'}
251     = $IANACharset->{'iso_8859-8'}
252     = $IANACharset->{'iso-8859-8'}
253     = $IANACharset->{'hebrew'}
254     = $IANACharset->{'csisolatinhebrew'}
255 wakaba 1.4 = __PACKAGE__->new ({
256 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
257 wakaba 1.2 iana_names => {
258 wakaba 1.4 'iso_8859-8:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
259 wakaba 1.2 'iso-ir-138' => REGISTERED_CHARSET_NAME,
260     'iso_8859-8' => REGISTERED_CHARSET_NAME,
261 wakaba 1.4 'iso-8859-8' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
262 wakaba 1.2 'hebrew' => REGISTERED_CHARSET_NAME,
263     'csisolatinhebrew' => REGISTERED_CHARSET_NAME,
264     },
265     is_html_ascii_superset => 1,
266 wakaba 1.4 });
267 wakaba 1.2
268     $Charset->{'iso-8859-9'}
269     = $IANACharset->{'iso_8859-9:1989'}
270     = $IANACharset->{'iso-ir-148'}
271     = $IANACharset->{'iso_8859-9'}
272     = $IANACharset->{'iso-8859-9'}
273     = $IANACharset->{'latin5'}
274     = $IANACharset->{'l5'}
275     = $IANACharset->{'csisolatin5'}
276 wakaba 1.4 = __PACKAGE__->new ({
277 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
278 wakaba 1.2 iana_names => {
279 wakaba 1.4 'iso_8859-9:1989' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
280 wakaba 1.2 'iso-ir-148' => REGISTERED_CHARSET_NAME,
281     'iso_8859-9' => REGISTERED_CHARSET_NAME,
282 wakaba 1.4 'iso-8859-9' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
283 wakaba 1.2 'latin5' => REGISTERED_CHARSET_NAME,
284     'l5' => REGISTERED_CHARSET_NAME,
285     'csisolatin5' => REGISTERED_CHARSET_NAME,
286     },
287     is_html_ascii_superset => 1,
288 wakaba 1.4 });
289 wakaba 1.2
290     $Charset->{'iso-8859-10'}
291     = $IANACharset->{'iso-8859-10'}
292     = $IANACharset->{'iso-ir-157'}
293     = $IANACharset->{'l6'}
294     = $IANACharset->{'iso_8859-10:1992'}
295     = $IANACharset->{'csisolatin6'}
296     = $IANACharset->{'latin6'}
297 wakaba 1.4 = __PACKAGE__->new ({
298 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
299 wakaba 1.2 iana_names => {
300 wakaba 1.4 'iso-8859-10' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
301 wakaba 1.2 'iso-ir-157' => REGISTERED_CHARSET_NAME,
302     'l6' => REGISTERED_CHARSET_NAME,
303     'iso_8859-10:1992' => REGISTERED_CHARSET_NAME,
304     'csisolatin6' => REGISTERED_CHARSET_NAME,
305     'latin6' => REGISTERED_CHARSET_NAME,
306     },
307     is_html_ascii_superset => 1,
308 wakaba 1.4 });
309 wakaba 1.2
310     $Charset->{'iso_6937-2-add'}
311     = $IANACharset->{'iso_6937-2-add'}
312     = $IANACharset->{'iso-ir-142'}
313     = $IANACharset->{'csisotextcomm'}
314 wakaba 1.4 = __PACKAGE__->new ({
315 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
316 wakaba 1.2 iana_names => {
317 wakaba 1.4 'iso_6937-2-add' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
318 wakaba 1.2 'iso-ir-142' => REGISTERED_CHARSET_NAME,
319     'csisotextcomm' => REGISTERED_CHARSET_NAME,
320     },
321     is_html_ascii_superset => 1,
322 wakaba 1.4 });
323 wakaba 1.2
324     $Charset->{'jis_x0201'}
325     = $IANACharset->{'jis_x0201'}
326     = $IANACharset->{'x0201'}
327     = $IANACharset->{'cshalfwidthkatakana'}
328 wakaba 1.4 = __PACKAGE__->new ({
329 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
330 wakaba 1.2 iana_names => {
331 wakaba 1.4 'jis_x0201' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
332 wakaba 1.2 'x0201' => REGISTERED_CHARSET_NAME,
333     'cshalfwidthkatakana' => REGISTERED_CHARSET_NAME,
334     },
335     is_html_ascii_superset => 1,
336 wakaba 1.4 });
337 wakaba 1.2
338     $Charset->{'jis_encoding'}
339     = $IANACharset->{'jis_encoding'}
340     = $IANACharset->{'csjisencoding'}
341 wakaba 1.4 = __PACKAGE__->new ({
342 wakaba 1.5 category => 0,
343 wakaba 1.2 iana_names => {
344 wakaba 1.4 'jis_encoding' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
345 wakaba 1.2 'csjisencoding' => REGISTERED_CHARSET_NAME,
346     },
347     ## NOTE: What is this?
348 wakaba 1.4 });
349 wakaba 1.1
350     $Charset->{'shift_jis'}
351     = $IANACharset->{'shift_jis'}
352     = $IANACharset->{'ms_kanji'}
353     = $IANACharset->{'csshiftjis'}
354 wakaba 1.4 = __PACKAGE__->new ({
355 wakaba 1.5 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,
356 wakaba 1.1 iana_names => {
357 wakaba 1.4 'shift_jis' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
358 wakaba 1.1 'ms_kanji' => REGISTERED_CHARSET_NAME,
359     'csshiftjis' => REGISTERED_CHARSET_NAME,
360     },
361 wakaba 1.5 perl_names => {
362     'shift-jis-1997' => UNREGISTERED_CHARSET_NAME,
363     shiftjis => PRIMARY_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL,
364     ## NOTE: Unicode mapping is wrong.
365     },
366 wakaba 1.2 mime_text_suitable => 1,
367 wakaba 1.4 });
368 wakaba 1.1
369 wakaba 1.3 $Charset->{'x-sjis'}
370     = $IANACharset->{'x-sjis'}
371 wakaba 1.4 = __PACKAGE__->new ({
372 wakaba 1.5 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,
373 wakaba 1.3 iana_names => {
374     'x-sjis' => UNREGISTERED_CHARSET_NAME,
375     },
376     mime_text_suitable => 1,
377 wakaba 1.4 });
378 wakaba 1.3
379 wakaba 1.5 $Charset->{shift_jisx0213}
380     = $IANACharset->{shift_jisx0213}
381     = __PACKAGE__->new ({
382     category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,
383     iana_names => {
384     shift_jisx0213 => UNREGISTERED_CHARSET_NAME,
385     },
386     perl_names => {
387     #shift_jisx0213 (non-standard - i don't know its conformance)
388     'shift-jis-1997' => FALLBACK_ENCODING_IMPL,
389     },
390     mime_text_suitable => 1,
391     });
392    
393 wakaba 1.1 $Charset->{'euc-jp'}
394     = $IANACharset->{'extended_unix_code_packed_format_for_japanese'}
395     = $IANACharset->{'cseucpkdfmtjapanese'}
396     = $IANACharset->{'euc-jp'}
397 wakaba 1.3 = $IANACharset->{'x-euc-jp'}
398 wakaba 1.4 = __PACKAGE__->new ({
399 wakaba 1.5 category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE,
400 wakaba 1.1 iana_names => {
401 wakaba 1.4 'extended_unix_code_packed_format_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
402 wakaba 1.1 'cseucpkdfmtjapanese' => REGISTERED_CHARSET_NAME,
403 wakaba 1.4 'euc-jp' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
404 wakaba 1.1 },
405 wakaba 1.5 perl_names => {
406     'euc-jp-1997' => UNREGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
407     ## NOTE: Though the IANA definition references the 1990 version
408     ## of EUC-JP, the 1997 version of JIS standard claims that the version
409     ## is same coded character set as the 1990 version, such that we
410     ## consider the EUC-JP 1990 version is same as the 1997 version.
411     'euc-jp' => PREFERRED_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL,
412     ## NOTE: Unicode mapping is wrong.
413     },
414 wakaba 1.1 is_html_ascii_superset => 1,
415 wakaba 1.3 mime_text_suitable => 1,
416 wakaba 1.4 });
417 wakaba 1.3
418     $Charset->{'x-euc-jp'}
419     = $IANACharset->{'x-euc-jp'}
420 wakaba 1.4 = __PACKAGE__->new ({
421 wakaba 1.5 category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE,
422 wakaba 1.3 iana_names => {
423     'x-euc-jp' => UNREGISTERED_CHARSET_NAME,
424     },
425     is_html_ascii_superset => 1,
426     mime_text_suitable => 1,
427 wakaba 1.4 });
428 wakaba 1.1
429 wakaba 1.2 $Charset->{'extended_unix_code_fixed_width_for_japanese'}
430     = $IANACharset->{'extended_unix_code_fixed_width_for_japanese'}
431     = $IANACharset->{'cseucfixwidjapanese'}
432 wakaba 1.4 = __PACKAGE__->new ({
433 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
434 wakaba 1.2 iana_names => {
435 wakaba 1.4 'extended_unix_code_fixed_width_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
436 wakaba 1.2 'cseucfixwidjapanese' => REGISTERED_CHARSET_NAME,
437     },
438 wakaba 1.4 });
439 wakaba 1.2
440 wakaba 1.1 ## TODO: ...
441    
442 wakaba 1.2 $Charset->{'euc-kr'}
443     = $IANACharset->{'euc-kr'}
444     = $IANACharset->{'cseuckr'}
445 wakaba 1.4 = __PACKAGE__->new ({
446 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
447 wakaba 1.2 iana_names => {
448 wakaba 1.4 'euc-kr' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
449 wakaba 1.2 'cseuckr' => REGISTERED_CHARSET_NAME,
450     },
451     is_html_ascii_superset => 1,
452 wakaba 1.4 });
453 wakaba 1.2
454 wakaba 1.1 $Charset->{'iso-2022-jp'}
455     = $IANACharset->{'iso-2022-jp'}
456     = $IANACharset->{'csiso2022jp'}
457 wakaba 1.3 = $IANACharset->{'iso2022jp'}
458     = $IANACharset->{'junet-code'}
459 wakaba 1.4 = __PACKAGE__->new ({
460 wakaba 1.5 category => 0,
461 wakaba 1.1 iana_names => {
462 wakaba 1.4 'iso-2022-jp' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
463 wakaba 1.1 'csiso2022jp' => REGISTERED_CHARSET_NAME,
464 wakaba 1.3 'iso2022jp' => UNREGISTERED_CHARSET_NAME,
465     'junet-code' => UNREGISTERED_CHARSET_NAME,
466 wakaba 1.1 },
467 wakaba 1.2 mime_text_suitable => 1,
468 wakaba 1.4 });
469 wakaba 1.2
470     $Charset->{'iso-2022-jp-2'}
471     = $IANACharset->{'iso-2022-jp-2'}
472     = $IANACharset->{'csiso2022jp2'}
473 wakaba 1.4 = __PACKAGE__->new ({
474 wakaba 1.5 category => 0,
475 wakaba 1.2 iana_names => {
476 wakaba 1.4 'iso-2022-jp-2' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
477 wakaba 1.2 'csiso2022jp2' => REGISTERED_CHARSET_NAME,
478     },
479     mime_text_suitable => 1,
480 wakaba 1.4 });
481 wakaba 1.1
482     ## TODO: ...
483    
484     $Charset->{'utf-8'}
485     = $IANACharset->{'utf-8'}
486 wakaba 1.3 = $IANACharset->{'x-utf-8'}
487 wakaba 1.4 = __PACKAGE__->new ({
488 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
489 wakaba 1.1 iana_names => {
490 wakaba 1.4 'utf-8' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
491 wakaba 1.3 'x-utf-8' => UNREGISTERED_CHARSET_NAME,
492 wakaba 1.1 },
493     is_html_ascii_superset => 1,
494 wakaba 1.3 mime_text_suitable => 1,
495 wakaba 1.4 });
496 wakaba 1.3
497     $Charset->{'utf-8n'}
498     = $IANACharset->{'utf-8n'}
499 wakaba 1.4 = __PACKAGE__->new ({
500 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
501 wakaba 1.3 iana_names => {
502     'utf-8n' => UNREGISTERED_CHARSET_NAME,
503     },
504     is_html_ascii_superset => 1,
505     mime_text_suitable => 1,
506 wakaba 1.4 });
507 wakaba 1.1
508     ## TODO: ...
509    
510 wakaba 1.2 $Charset->{'gbk'}
511     = $IANACharset->{'gbk'}
512     = $IANACharset->{'cp936'}
513     = $IANACharset->{'ms936'}
514     = $IANACharset->{'windows-936'}
515 wakaba 1.4 = __PACKAGE__->new ({
516 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
517 wakaba 1.2 iana_names => {
518 wakaba 1.4 'gbk' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
519 wakaba 1.2 'cp936' => REGISTERED_CHARSET_NAME,
520     'ms936' => REGISTERED_CHARSET_NAME,
521     'windows-936' => REGISTERED_CHARSET_NAME,
522     },
523     iana_status => STATUS_COMMON | STATUS_OBSOLETE,
524     mime_text_suitable => 1,
525 wakaba 1.4 });
526 wakaba 1.2
527     $Charset->{'gb18030'}
528     = $IANACharset->{'gb18030'}
529 wakaba 1.4 = __PACKAGE__->new ({
530 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
531 wakaba 1.2 iana_names => {
532 wakaba 1.4 'gb18030' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
533 wakaba 1.2 },
534     iana_status => STATUS_COMMON,
535     mime_text_suitable => 1,
536 wakaba 1.4 });
537 wakaba 1.2
538     ## TODO: ...
539    
540 wakaba 1.1 $Charset->{'utf-16be'}
541     = $IANACharset->{'utf-16be'}
542 wakaba 1.4 = __PACKAGE__->new ({
543 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
544 wakaba 1.1 iana_names => {
545 wakaba 1.4 'utf-16be' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
546 wakaba 1.1 },
547 wakaba 1.4 });
548 wakaba 1.1
549     $Charset->{'utf-16le'}
550     = $IANACharset->{'utf-16le'}
551 wakaba 1.4 = __PACKAGE__->new ({
552 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
553 wakaba 1.1 iana_names => {
554 wakaba 1.4 'utf-16le' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
555 wakaba 1.1 },
556 wakaba 1.4 });
557 wakaba 1.1
558     $Charset->{'utf-16'}
559     = $IANACharset->{'utf-16'}
560 wakaba 1.4 = __PACKAGE__->new ({
561 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
562 wakaba 1.1 iana_names => {
563 wakaba 1.4 'utf-16' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
564 wakaba 1.1 },
565 wakaba 1.4 });
566 wakaba 1.1
567     ## TODO: ...
568    
569 wakaba 1.2 $Charset->{'windows-31j'}
570     = $IANACharset->{'windows-31j'}
571     = $IANACharset->{'cswindows31j'}
572 wakaba 1.4 = __PACKAGE__->new ({
573 wakaba 1.5 category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE,
574 wakaba 1.2 iana_names => {
575 wakaba 1.4 'windows-31j' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
576 wakaba 1.2 'cswindows31j' => REGISTERED_CHARSET_NAME,
577     },
578     iana_status => STATUS_LIMITED_USE, # maybe
579     mime_text_suitable => 1,
580 wakaba 1.4 });
581 wakaba 1.2
582     $Charset->{'gb2312'}
583     = $IANACharset->{'gb2312'}
584     = $IANACharset->{'csgb2312'}
585 wakaba 1.4 = __PACKAGE__->new ({
586 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
587 wakaba 1.2 iana_names => {
588 wakaba 1.4 'gb2312' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
589 wakaba 1.2 'csgb2312' => REGISTERED_CHARSET_NAME,
590     },
591     is_html_ascii_superset => 1,
592     mime_text_suitable => 1,
593 wakaba 1.4 });
594 wakaba 1.2
595     $Charset->{'big5'}
596     = $IANACharset->{'big5'}
597     = $IANACharset->{'csbig5'}
598 wakaba 1.4 = __PACKAGE__->new ({
599 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
600 wakaba 1.2 iana_names => {
601 wakaba 1.4 'big5' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
602 wakaba 1.2 'csbig5' => REGISTERED_CHARSET_NAME,
603     },
604     mime_text_suitable => 1,
605 wakaba 1.4 });
606 wakaba 1.2
607     ## TODO: ...
608    
609     $Charset->{'big5-hkscs'}
610     = $IANACharset->{'big5-hkscs'}
611 wakaba 1.4 = __PACKAGE__->new ({
612 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
613 wakaba 1.2 iana_names => {
614 wakaba 1.4 'big5-hkscs' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
615 wakaba 1.2 },
616     mime_text_suitable => 1,
617 wakaba 1.4 });
618 wakaba 1.2
619     ## TODO: ...
620    
621 wakaba 1.1 $Charset->{'windows-1252'}
622     = $IANACharset->{'windows-1252'}
623 wakaba 1.4 = __PACKAGE__->new ({
624 wakaba 1.5 category => CHARSET_CATEGORY_BLOCK_SAFE,
625 wakaba 1.1 iana_names => {
626 wakaba 1.4 'windows-1252' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
627 wakaba 1.1 },
628     is_html_ascii_superset => 1,
629 wakaba 1.4 });
630 wakaba 1.1
631     ## TODO: ...
632    
633 wakaba 1.4 sub new ($$) {
634     return bless $_[1], $_[0];
635     } # new
636    
637     ## NOTE: A class method
638     sub get_by_iana_name ($$) {
639     my $name = $_[1];
640     $name =~ tr/A-Z/a-z/; ## ASCII case-insensitive
641     unless ($IANACharset->{$name}) {
642     $IANACharset->{$name} = __PACKAGE__->new ({
643     iana_names => {
644     $name => UNREGISTERED_CHARSET_NAME,
645     },
646     });
647     }
648     return $IANACharset->{$name};
649     } # get_by_iana_name
650    
651 wakaba 1.5 sub get_decode_handle ($$;%) {
652     my $self = shift;
653     my $byte_stream = shift;
654     my %opt = @_;
655    
656     my $obj = {
657     character_queue => [],
658     filehandle => $byte_stream,
659     charset => '', ## TODO: We set a charset name for input_encoding (when we get identify-by-URI nonsense away)
660     byte_buffer => $opt{byte_buffer} ? ${$opt{byte_buffer}} : '', ## TODO: ref, instead of value, should be used
661     onerror => $opt{onerror} || sub {},
662     };
663    
664     require Whatpm::Charset::DecodeHandle;
665     if ($self->{iana_names}->{'iso-2022-jp'}) {
666     $obj->{state_2440} = 'gl-jis-1978';
667     $obj->{state_2442} = 'gl-jis-1983';
668     $obj->{state} = 'state_2842';
669     eval {
670     require Encode::GLJIS1978;
671     require Encode::GLJIS1983;
672     };
673     if (Encode::find_encoding ($obj->{state_2440}) and
674     Encode::find_encoding ($obj->{state_2442})) {
675     return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP'),
676     PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME);
677     }
678     } elsif ($self->{xml_names}->{'iso-2022-jp'}) {
679     $obj->{state_2440} = 'gl-jis-1997-swapped';
680     $obj->{state_2442} = 'gl-jis-1997';
681     $obj->{state} = 'state_2842';
682     eval {
683     require Encode::GLJIS1997Swapped;
684     require Encode::GLJIS1997;
685     };
686     if (Encode::find_encoding ($obj->{state_2440}) and
687     Encode::find_encoding ($obj->{state_2442})) {
688     return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP'),
689     PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME);
690     }
691     }
692    
693     my ($e, $e_status) = $self->get_perl_encoding (%opt);
694     if ($e) {
695     $obj->{perl_encoding_name} = $e->name;
696     if ($self->{category} & CHARSET_CATEGORY_EUCJP) {
697     return ((bless $obj, 'Whatpm::Charset::DecodeHandle::EUCJP'),
698     $e_status);
699     } elsif ($self->{category} & CHARSET_CATEGORY_SJIS) {
700     return ((bless $obj, 'Whatpm::Charset::DecodeHandle::ShiftJIS'),
701     $e_status);
702     } elsif ($self->{category} & CHARSET_CATEGORY_BLOCK_SAFE) {
703     return ((bless $obj, 'Whatpm::Charset::DecodeHandle::Encode'),
704     $e_status);
705     } else {
706     ## TODO: no encoding error (?)
707     return (undef, 0);
708     }
709     } else {
710     ## TODO: no encoding error(?)
711     return (undef, 0);
712     }
713     } # get_decode_handle
714    
715 wakaba 1.4 sub get_perl_encoding ($;%) {
716     my ($self, %opt) = @_;
717    
718     require Encode;
719 wakaba 1.5 my $load_encode = sub {
720     my $name = shift;
721     if ($name eq 'euc-jp-1997') {
722     require Encode::EUCJP1997;
723     } elsif ($name eq 'shift-jis-1997') {
724     require Encode::ShiftJIS1997;
725     }
726     }; # $load_encode
727 wakaba 1.4
728     if ($opt{allow_error_reporting}) {
729     for my $perl_name (keys %{$self->{perl_names} or {}}) {
730     my $perl_status = $self->{perl_names}->{$perl_name};
731     next unless $perl_status & ERROR_REPORTING_ENCODING_IMPL;
732 wakaba 1.5 next if $perl_status & FALLBACK_ENCODING_IMPL;
733 wakaba 1.4
734 wakaba 1.5 $load_encode->($perl_name);
735 wakaba 1.4 my $e = Encode::find_encoding ($perl_name);
736     if ($e) {
737     return ($e, $perl_status);
738     }
739     }
740     }
741    
742     for my $perl_name (keys %{$self->{perl_names} or {}}) {
743     my $perl_status = $self->{perl_names}->{$perl_name};
744     next if $perl_status & ERROR_REPORTING_ENCODING_IMPL;
745     next if $perl_status & FALLBACK_ENCODING_IMPL;
746 wakaba 1.5
747     $load_encode->($perl_name);
748 wakaba 1.4 my $e = Encode::find_encoding ($perl_name);
749     if ($e) {
750     return ($e, $perl_status);
751     }
752     }
753    
754     if ($opt{allow_fallback}) {
755     for my $perl_name (keys %{$self->{perl_names} or {}}) {
756     my $perl_status = $self->{perl_names}->{$perl_name};
757     next unless $perl_status & FALLBACK_ENCODING_IMPL;
758    
759 wakaba 1.5 $load_encode->($perl_name);
760 wakaba 1.4 my $e = Encode::find_encoding ($perl_name);
761     if ($e) {
762     return ($e, $perl_status);
763     }
764     }
765    
766     for my $iana_name (keys %{$self->{iana_names} or {}}) {
767 wakaba 1.5 $load_encode->($iana_name);
768 wakaba 1.4 my $e = Encode::find_encoding ($iana_name);
769     if ($e) {
770     return ($e, FALLBACK_ENCODING_IMPL);
771     }
772     }
773     }
774    
775     return (undef, 0);
776     } # get_perl_encoding
777    
778     sub get_iana_name ($) {
779     my $self = shift;
780    
781     my $primary;
782     my $other;
783     for my $iana_name (keys %{$self->{iana_names} or {}}) {
784     my $name_status = $self->{iana_names}->{$iana_name};
785     if ($name_status & PREFERRED_CHARSET_NAME) {
786     return $iana_name;
787     } elsif ($name_status & PRIMARY_CHARSET_NAME) {
788     $primary = $iana_name;
789     } elsif ($name_status & REGISTERED_CHARSET_NAME) {
790     $other = $iana_name;
791     } else {
792     $other ||= $iana_name;
793     }
794     }
795    
796     return $primary || $other;
797     } # get_iana_name
798    
799     ## NOTE: A non-method function
800 wakaba 1.3 sub is_syntactically_valid_iana_charset_name ($) {
801 wakaba 1.1 my $name = shift;
802     return $name =~ /\A[\x20-\x7E]{1,40}\z/;
803     } # is_suntactically_valid_iana_charset_name
804    
805     1;
806 wakaba 1.4 ## $Date: 2007/12/22 06:29:32 $
807 wakaba 1.1

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24