/[suikacvs]/messaging/manakai/lib/Message/Charset/Info.pm
Suika

Contents of /messaging/manakai/lib/Message/Charset/Info.pm

Parent Directory | Revision Log


Revision 1.2 - (hide annotations) (download)
Wed Nov 21 12:47:22 2007 UTC (17 years ago) by wakaba
Branch: MAIN
Changes since 1.1: +344 -3 lines
++ manakai/lib/Message/Charset/ChangeLog	21 Nov 2007 12:46:59 -0000
2007-11-21  Wakaba  <wakaba@suika.fam.cx>

	* Info.pm: More charset definitions.

1 wakaba 1.1 package Message::Charset::Info;
2     use strict;
3 wakaba 1.2 our $VERSION=do{my @r=(q$Revision: 1.1 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.1
5     sub UNREGISTERED_CHARSET_NAME () { 0b1 } # bit 0; not referenced by any entry in this file as shown
6     sub REGISTERED_CHARSET_NAME () { 0b10 } # bit 1; also part of the two flags defined next
7     sub PRIMARY_CHARSET_NAME () { 0b100 | REGISTERED_CHARSET_NAME }
8     ## Bit 2 plus the registered bit: the "Name:" field for IANA names
9     sub PREFERRED_CHARSET_NAME () { 0b1000 | REGISTERED_CHARSET_NAME }
10     ## Bit 3 plus the registered bit: the "preferred MIME name" for IANA names
11    
12 wakaba 1.2 ## iana_status: bit flags; entries below may OR them together
13     sub STATUS_COMMON () { 0b1 } # bit 0
14     sub STATUS_LIMITED_USE () { 0b10 } # bit 1
15     sub STATUS_OBSOLETE () { 0b100 } # bit 2
16    
17 wakaba 1.1 ## iana_names: hash mapping each registered name to its *_CHARSET_NAME flag bits
18     ## is_html_ascii_superset: "superset of US-ASCII (specifically, ANSI_X3.4-1968)
19     ## for bytes in the range 0x09 - 0x0D, 0x20, 0x21, 0x22, 0x26, 0x27,
20     ## 0x2C - 0x3F, 0x41 - 0x5A, and 0x61 - 0x7A" [HTML5]
21     ## is_ebcdic_based
22    
23     ## ISSUE: Is Shift_JIS a superset of US-ASCII? Is ISO-2022-JP?
24     ## ISSUE: Should 0x5F (_) be added to the range?
25    
26     my $Charset; # charset name => definition hashref; lexical, so visible only inside this file
27    
28     our $IANACharset; # IANA name or alias (keys written in lowercase) => the same definition hashrefs; package variable
29    
30     $Charset->{'us-ascii'} # chained assignment: all eleven names below point at one shared hashref
31     = $IANACharset->{'ansi_x3.4-1968'}
32     = $IANACharset->{'iso-ir-6'}
33     = $IANACharset->{'ansi_x3.4-1986'}
34     = $IANACharset->{'iso_646.irv:1991'}
35     = $IANACharset->{'ascii'}
36     = $IANACharset->{'iso646-us'}
37     = $IANACharset->{'us-ascii'}
38     = $IANACharset->{'us'}
39     = $IANACharset->{'ibm367'}
40     = $IANACharset->{'cp367'}
41     = $IANACharset->{'csascii'}
42     = {
43     iana_names => {
44     'ansi_x3.4-1968' => PRIMARY_CHARSET_NAME, # the registry "Name:" entry
45     'iso-ir-6' => REGISTERED_CHARSET_NAME,
46     'ansi_x3.4-1986' => REGISTERED_CHARSET_NAME,
47     'iso_646.irv:1991' => REGISTERED_CHARSET_NAME,
48     'ascii' => REGISTERED_CHARSET_NAME,
49     'iso646-us' => REGISTERED_CHARSET_NAME,
50     'us-ascii' => PREFERRED_CHARSET_NAME, # the "preferred MIME name"; also used as the $Charset key
51     'us' => REGISTERED_CHARSET_NAME,
52     'ibm367' => REGISTERED_CHARSET_NAME,
53     'cp367' => REGISTERED_CHARSET_NAME,
54     'csascii' => REGISTERED_CHARSET_NAME,
55     },
56     is_html_ascii_superset => 1,
57     };
58    
59     $Charset->{'iso-8859-1'} # keyed by the preferred MIME name; nine registered names share this hashref
60     = $IANACharset->{'iso_8859-1:1987'}
61     = $IANACharset->{'iso-ir-100'}
62     = $IANACharset->{'iso_8859-1'}
63     = $IANACharset->{'iso-8859-1'}
64     = $IANACharset->{'latin1'}
65     = $IANACharset->{'l1'}
66     = $IANACharset->{'ibm819'}
67     = $IANACharset->{'cp819'}
68     = $IANACharset->{'csisolatin1'}
69     = {
70     iana_names => {
71     'iso_8859-1:1987' => PRIMARY_CHARSET_NAME, # primary and preferred names differ for this charset
72     'iso-ir-100' => REGISTERED_CHARSET_NAME,
73     'iso_8859-1' => REGISTERED_CHARSET_NAME,
74     'iso-8859-1' => PREFERRED_CHARSET_NAME,
75     'latin1' => REGISTERED_CHARSET_NAME,
76     'l1' => REGISTERED_CHARSET_NAME,
77     'ibm819' => REGISTERED_CHARSET_NAME,
78     'cp819' => REGISTERED_CHARSET_NAME,
79     'csisolatin1' => REGISTERED_CHARSET_NAME,
80     },
81     is_html_ascii_superset => 1,
82     };
83    
84 wakaba 1.2 $Charset->{'iso-8859-2'} # Latin-2: seven registered names share the hashref built below
85     = $IANACharset->{'iso_8859-2:1987'}
86     = $IANACharset->{'iso-ir-101'}
87     = $IANACharset->{'iso_8859-2'}
88     = $IANACharset->{'iso-8859-2'}
89     = $IANACharset->{'latin2'}
90     = $IANACharset->{'l2'}
91     = $IANACharset->{'csisolatin2'}
92     = {
93     iana_names => {
94     'iso_8859-2:1987' => PRIMARY_CHARSET_NAME,
95     'iso-ir-101' => REGISTERED_CHARSET_NAME,
96     'iso_8859-2' => REGISTERED_CHARSET_NAME,
97     'iso-8859-2' => PREFERRED_CHARSET_NAME,
98     'latin2' => REGISTERED_CHARSET_NAME,
99     'l2' => REGISTERED_CHARSET_NAME,
100     'csisolatin2' => REGISTERED_CHARSET_NAME,
101     },
102     is_html_ascii_superset => 1,
103     };
104    
105     $Charset->{'iso-8859-3'} # Latin-3: same layout as the iso-8859-2 entry, seven names
106     = $IANACharset->{'iso_8859-3:1988'}
107     = $IANACharset->{'iso-ir-109'}
108     = $IANACharset->{'iso_8859-3'}
109     = $IANACharset->{'iso-8859-3'}
110     = $IANACharset->{'latin3'}
111     = $IANACharset->{'l3'}
112     = $IANACharset->{'csisolatin3'}
113     = {
114     iana_names => {
115     'iso_8859-3:1988' => PRIMARY_CHARSET_NAME,
116     'iso-ir-109' => REGISTERED_CHARSET_NAME,
117     'iso_8859-3' => REGISTERED_CHARSET_NAME,
118     'iso-8859-3' => PREFERRED_CHARSET_NAME,
119     'latin3' => REGISTERED_CHARSET_NAME,
120     'l3' => REGISTERED_CHARSET_NAME,
121     'csisolatin3' => REGISTERED_CHARSET_NAME,
122     },
123     is_html_ascii_superset => 1,
124     };
125    
126     $Charset->{'iso-8859-4'} # Latin-4: same layout as the iso-8859-2 entry, seven names
127     = $IANACharset->{'iso_8859-4:1988'}
128     = $IANACharset->{'iso-ir-110'}
129     = $IANACharset->{'iso_8859-4'}
130     = $IANACharset->{'iso-8859-4'}
131     = $IANACharset->{'latin4'}
132     = $IANACharset->{'l4'}
133     = $IANACharset->{'csisolatin4'}
134     = {
135     iana_names => {
136     'iso_8859-4:1988' => PRIMARY_CHARSET_NAME,
137     'iso-ir-110' => REGISTERED_CHARSET_NAME,
138     'iso_8859-4' => REGISTERED_CHARSET_NAME,
139     'iso-8859-4' => PREFERRED_CHARSET_NAME,
140     'latin4' => REGISTERED_CHARSET_NAME,
141     'l4' => REGISTERED_CHARSET_NAME,
142     'csisolatin4' => REGISTERED_CHARSET_NAME,
143     },
144     is_html_ascii_superset => 1,
145     };
146    
147     $Charset->{'iso-8859-5'} # six registered names share this hashref
148     = $IANACharset->{'iso_8859-5:1988'}
149     = $IANACharset->{'iso-ir-144'}
150     = $IANACharset->{'iso_8859-5'}
151     = $IANACharset->{'iso-8859-5'}
152     = $IANACharset->{'cyrillic'}
153     = $IANACharset->{'csisolatincyrillic'}
154     = {
155     iana_names => {
156     'iso_8859-5:1988' => PRIMARY_CHARSET_NAME,
157     'iso-ir-144' => REGISTERED_CHARSET_NAME,
158     'iso_8859-5' => REGISTERED_CHARSET_NAME,
159     'iso-8859-5' => PREFERRED_CHARSET_NAME,
160     'cyrillic' => REGISTERED_CHARSET_NAME,
161     'csisolatincyrillic' => REGISTERED_CHARSET_NAME,
162     },
163     is_html_ascii_superset => 1,
164     };
165    
166     $Charset->{'iso-8859-6'} # eight registered names share this hashref
167     = $IANACharset->{'iso_8859-6:1987'}
168     = $IANACharset->{'iso-ir-127'}
169     = $IANACharset->{'iso_8859-6'}
170     = $IANACharset->{'iso-8859-6'}
171     = $IANACharset->{'ecma-114'}
172     = $IANACharset->{'asmo-708'}
173     = $IANACharset->{'arabic'}
174     = $IANACharset->{'csisolatinarabic'}
175     = {
176     iana_names => {
177     'iso_8859-6:1987' => PRIMARY_CHARSET_NAME,
178     'iso-ir-127' => REGISTERED_CHARSET_NAME,
179     'iso_8859-6' => REGISTERED_CHARSET_NAME,
180     'iso-8859-6' => PREFERRED_CHARSET_NAME,
181     'ecma-114' => REGISTERED_CHARSET_NAME,
182     'asmo-708' => REGISTERED_CHARSET_NAME,
183     'arabic' => REGISTERED_CHARSET_NAME,
184     'csisolatinarabic' => REGISTERED_CHARSET_NAME,
185     },
186     is_html_ascii_superset => 1,
187     ## NOTE: Code positions 3/0..3/9 (bytes 0x30..0x39) have different
188     ## semantics from U+0030..0039, but (maybe) the same character names.
189     };
190    
191     $Charset->{'iso-8859-7'} # nine registered names share this hashref
192     = $IANACharset->{'iso_8859-7:1987'}
193     = $IANACharset->{'iso-ir-126'}
194     = $IANACharset->{'iso_8859-7'}
195     = $IANACharset->{'iso-8859-7'}
196     = $IANACharset->{'elot_928'}
197     = $IANACharset->{'ecma-118'}
198     = $IANACharset->{'greek'}
199     = $IANACharset->{'greek8'}
200     = $IANACharset->{'csisolatingreek'}
201     = {
202     iana_names => {
203     'iso_8859-7:1987' => PRIMARY_CHARSET_NAME,
204     'iso-ir-126' => REGISTERED_CHARSET_NAME,
205     'iso_8859-7' => REGISTERED_CHARSET_NAME,
206     'iso-8859-7' => PREFERRED_CHARSET_NAME,
207     'elot_928' => REGISTERED_CHARSET_NAME,
208     'ecma-118' => REGISTERED_CHARSET_NAME,
209     'greek' => REGISTERED_CHARSET_NAME,
210     'greek8' => REGISTERED_CHARSET_NAME,
211     'csisolatingreek' => REGISTERED_CHARSET_NAME,
212     },
213     is_html_ascii_superset => 1,
214     };
215    
216     $Charset->{'iso-8859-8'} # six registered names share this hashref
217     = $IANACharset->{'iso_8859-8:1988'}
218     = $IANACharset->{'iso-ir-138'}
219     = $IANACharset->{'iso_8859-8'}
220     = $IANACharset->{'iso-8859-8'}
221     = $IANACharset->{'hebrew'}
222     = $IANACharset->{'csisolatinhebrew'}
223     = {
224     iana_names => {
225     'iso_8859-8:1988' => PRIMARY_CHARSET_NAME,
226     'iso-ir-138' => REGISTERED_CHARSET_NAME,
227     'iso_8859-8' => REGISTERED_CHARSET_NAME,
228     'iso-8859-8' => PREFERRED_CHARSET_NAME,
229     'hebrew' => REGISTERED_CHARSET_NAME,
230     'csisolatinhebrew' => REGISTERED_CHARSET_NAME,
231     },
232     is_html_ascii_superset => 1,
233     };
234    
235     $Charset->{'iso-8859-9'} # Latin-5: seven registered names share this hashref
236     = $IANACharset->{'iso_8859-9:1989'}
237     = $IANACharset->{'iso-ir-148'}
238     = $IANACharset->{'iso_8859-9'}
239     = $IANACharset->{'iso-8859-9'}
240     = $IANACharset->{'latin5'}
241     = $IANACharset->{'l5'}
242     = $IANACharset->{'csisolatin5'}
243     = {
244     iana_names => {
245     'iso_8859-9:1989' => PRIMARY_CHARSET_NAME,
246     'iso-ir-148' => REGISTERED_CHARSET_NAME,
247     'iso_8859-9' => REGISTERED_CHARSET_NAME,
248     'iso-8859-9' => PREFERRED_CHARSET_NAME,
249     'latin5' => REGISTERED_CHARSET_NAME,
250     'l5' => REGISTERED_CHARSET_NAME,
251     'csisolatin5' => REGISTERED_CHARSET_NAME,
252     },
253     is_html_ascii_superset => 1,
254     };
255    
256     $Charset->{'iso-8859-10'} # Latin-6: six registered names share this hashref
257     = $IANACharset->{'iso-8859-10'}
258     = $IANACharset->{'iso-ir-157'}
259     = $IANACharset->{'l6'}
260     = $IANACharset->{'iso_8859-10:1992'}
261     = $IANACharset->{'csisolatin6'}
262     = $IANACharset->{'latin6'}
263     = {
264     iana_names => {
265     'iso-8859-10' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME, # unlike iso-8859-1..9 above, one name carries both flags
266     'iso-ir-157' => REGISTERED_CHARSET_NAME,
267     'l6' => REGISTERED_CHARSET_NAME,
268     'iso_8859-10:1992' => REGISTERED_CHARSET_NAME,
269     'csisolatin6' => REGISTERED_CHARSET_NAME,
270     'latin6' => REGISTERED_CHARSET_NAME,
271     },
272     is_html_ascii_superset => 1,
273     };
274    
275     $Charset->{'iso_6937-2-add'} # three registered names; no name carries the PREFERRED flag
276     = $IANACharset->{'iso_6937-2-add'}
277     = $IANACharset->{'iso-ir-142'}
278     = $IANACharset->{'csisotextcomm'}
279     = {
280     iana_names => {
281     'iso_6937-2-add' => PRIMARY_CHARSET_NAME,
282     'iso-ir-142' => REGISTERED_CHARSET_NAME,
283     'csisotextcomm' => REGISTERED_CHARSET_NAME,
284     },
285     is_html_ascii_superset => 1,
286     };
287    
288     $Charset->{'jis_x0201'} # three registered names; no name carries the PREFERRED flag
289     = $IANACharset->{'jis_x0201'}
290     = $IANACharset->{'x0201'}
291     = $IANACharset->{'cshalfwidthkatakana'}
292     = {
293     iana_names => {
294     'jis_x0201' => PRIMARY_CHARSET_NAME,
295     'x0201' => REGISTERED_CHARSET_NAME,
296     'cshalfwidthkatakana' => REGISTERED_CHARSET_NAME,
297     },
298     is_html_ascii_superset => 1, # the byte list quoted in the is_html_ascii_superset comment near the top excludes 0x5C and 0x7E
299     };
300    
301     $Charset->{'jis_encoding'} # two registered names; names only
302     = $IANACharset->{'jis_encoding'}
303     = $IANACharset->{'csjisencoding'}
304     = {
305     iana_names => {
306     'jis_encoding' => PRIMARY_CHARSET_NAME,
307     'csjisencoding' => REGISTERED_CHARSET_NAME,
308     },
309     ## NOTE: What is this?  (No property flags are set for this entry.)
310     };
311 wakaba 1.1
312     $Charset->{'shift_jis'} # three registered names; is_html_ascii_superset is not set (see the Shift_JIS ISSUE comment near the top)
313     = $IANACharset->{'shift_jis'}
314     = $IANACharset->{'ms_kanji'}
315     = $IANACharset->{'csshiftjis'}
316     = {
317     iana_names => {
318     'shift_jis' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME,
319     'ms_kanji' => REGISTERED_CHARSET_NAME,
320     'csshiftjis' => REGISTERED_CHARSET_NAME,
321     },
322 wakaba 1.2 mime_text_suitable => 1, # NOTE(review): this key is not described in the key list near the top of the file - meaning to be confirmed
323 wakaba 1.1 };
324    
325     $Charset->{'euc-jp'} # keyed by the preferred MIME name; the primary name is the long form
326     = $IANACharset->{'extended_unix_code_packed_format_for_japanese'}
327     = $IANACharset->{'cseucpkdfmtjapanese'}
328     = $IANACharset->{'euc-jp'}
329     = {
330     iana_names => {
331     'extended_unix_code_packed_format_for_japanese' => PRIMARY_CHARSET_NAME,
332     'cseucpkdfmtjapanese' => REGISTERED_CHARSET_NAME,
333     'euc-jp' => PREFERRED_CHARSET_NAME,
334     },
335     is_html_ascii_superset => 1,
336     };
337    
338 wakaba 1.2 $Charset->{'extended_unix_code_fixed_width_for_japanese'} # two registered names; no property flags are set
339     = $IANACharset->{'extended_unix_code_fixed_width_for_japanese'}
340     = $IANACharset->{'cseucfixwidjapanese'}
341     = {
342     iana_names => {
343     'extended_unix_code_fixed_width_for_japanese' => PRIMARY_CHARSET_NAME,
344     'cseucfixwidjapanese' => REGISTERED_CHARSET_NAME,
345     },
346     };
347    
348 wakaba 1.1 ## TODO: ...
349    
350 wakaba 1.2 $Charset->{'euc-kr'} # two registered names; the first is both primary and preferred
351     = $IANACharset->{'euc-kr'}
352     = $IANACharset->{'cseuckr'}
353     = {
354     iana_names => {
355     'euc-kr' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME,
356     'cseuckr' => REGISTERED_CHARSET_NAME,
357     },
358     is_html_ascii_superset => 1,
359     };
360    
361 wakaba 1.1 $Charset->{'iso-2022-jp'} # is_html_ascii_superset is not set (see the ISO-2022-JP ISSUE comment near the top)
362     = $IANACharset->{'iso-2022-jp'}
363     = $IANACharset->{'csiso2022jp'}
364     = {
365     iana_names => {
366     'iso-2022-jp' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME,
367     'csiso2022jp' => REGISTERED_CHARSET_NAME,
368     },
369 wakaba 1.2 mime_text_suitable => 1,
370     };
371    
372     $Charset->{'iso-2022-jp-2'} # same flags as the iso-2022-jp entry
373     = $IANACharset->{'iso-2022-jp-2'}
374     = $IANACharset->{'csiso2022jp2'}
375     = {
376     iana_names => {
377     'iso-2022-jp-2' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME,
378     'csiso2022jp2' => REGISTERED_CHARSET_NAME,
379     },
380     mime_text_suitable => 1,
381 wakaba 1.1 };
382    
383     ## TODO: ...
384    
385     $Charset->{'utf-8'} # a single registered name, no aliases
386     = $IANACharset->{'utf-8'}
387     = {
388     iana_names => {
389     'utf-8' => PRIMARY_CHARSET_NAME, # PRIMARY only; the PREFERRED bit is not set here
390     },
391     is_html_ascii_superset => 1,
392     };
393    
394     ## TODO: ...
395    
396 wakaba 1.2 $Charset->{'gbk'} # four registered names share this hashref
397     = $IANACharset->{'gbk'}
398     = $IANACharset->{'cp936'}
399     = $IANACharset->{'ms936'}
400     = $IANACharset->{'windows-936'}
401     = {
402     iana_names => {
403     'gbk' => PRIMARY_CHARSET_NAME,
404     'cp936' => REGISTERED_CHARSET_NAME,
405     'ms936' => REGISTERED_CHARSET_NAME,
406     'windows-936' => REGISTERED_CHARSET_NAME,
407     },
408     iana_status => STATUS_COMMON | STATUS_OBSOLETE, # NOTE(review): both the COMMON and OBSOLETE bits are set - confirm this combination is intended
409     mime_text_suitable => 1,
410     };
411    
412     $Charset->{'gb18030'} # a single registered name, no aliases
413     = $IANACharset->{'gb18030'}
414     = {
415     iana_names => {
416     'gb18030' => PRIMARY_CHARSET_NAME,
417     },
418     iana_status => STATUS_COMMON,
419     mime_text_suitable => 1,
420     };
421    
422     ## TODO: ...
423    
424 wakaba 1.1 $Charset->{'utf-16be'} # the three UTF-16 entries record names only; no property flags are set
425     = $IANACharset->{'utf-16be'}
426     = {
427     iana_names => {
428     'utf-16be' => PRIMARY_CHARSET_NAME,
429     },
430     };
431    
432     $Charset->{'utf-16le'} # separate hashref, same shape as utf-16be
433     = $IANACharset->{'utf-16le'}
434     = {
435     iana_names => {
436     'utf-16le' => PRIMARY_CHARSET_NAME,
437     },
438     };
439    
440     $Charset->{'utf-16'} # separate hashref, same shape as utf-16be
441     = $IANACharset->{'utf-16'}
442     = {
443     iana_names => {
444     'utf-16' => PRIMARY_CHARSET_NAME,
445     },
446     };
447    
448     ## TODO: ...
449    
450 wakaba 1.2 $Charset->{'windows-31j'} # two registered names share this hashref
451     = $IANACharset->{'windows-31j'}
452     = $IANACharset->{'cswindows31j'}
453     = {
454     iana_names => {
455     'windows-31j' => PRIMARY_CHARSET_NAME,
456     'cswindows31j' => REGISTERED_CHARSET_NAME,
457     },
458     iana_status => STATUS_LIMITED_USE, # maybe (the status value is the author's guess)
459     mime_text_suitable => 1,
460     };
461    
462     $Charset->{'gb2312'} # the only entry in this file that sets both is_html_ascii_superset and mime_text_suitable
463     = $IANACharset->{'gb2312'}
464     = $IANACharset->{'csgb2312'}
465     = {
466     iana_names => {
467     'gb2312' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME,
468     'csgb2312' => REGISTERED_CHARSET_NAME,
469     },
470     is_html_ascii_superset => 1,
471     mime_text_suitable => 1,
472     };
473    
474     $Charset->{'big5'} # two registered names; is_html_ascii_superset is not set
475     = $IANACharset->{'big5'}
476     = $IANACharset->{'csbig5'}
477     = {
478     iana_names => {
479     'big5' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME,
480     'csbig5' => REGISTERED_CHARSET_NAME,
481     },
482     mime_text_suitable => 1,
483     };
484    
485     ## TODO: ...
486    
487     $Charset->{'big5-hkscs'} # a single registered name; separate hashref from the big5 entry
488     = $IANACharset->{'big5-hkscs'}
489     = {
490     iana_names => {
491     'big5-hkscs' => PRIMARY_CHARSET_NAME,
492     },
493     mime_text_suitable => 1,
494     };
495    
496     ## TODO: ...
497    
498 wakaba 1.1 $Charset->{'windows-1252'} # a single registered name, no aliases
499     = $IANACharset->{'windows-1252'}
500     = {
501     iana_names => {
502     'windows-1252' => PRIMARY_CHARSET_NAME,
503     },
504     is_html_ascii_superset => 1,
505     };
506    
507     ## TODO: ...
508    
## Test whether a string is syntactically acceptable as an IANA charset
## name: 1 to 40 characters, each in the range 0x20..0x7E.
##
## Argument: the candidate name (the |($)| prototype puts it in scalar
## context).
## Returns: 1 if the name matches, the empty string otherwise.  Exactly
## one value is returned in every calling context, so the result cannot
## silently disappear from an argument list or a hash constructor (a bare
## |return $name =~ /.../| yields an empty list on failure in list
## context).  An undefined argument is simply not a valid name; it is
## rejected before the match so that no "uninitialized" warning is
## raised under |-w|.
sub is_syntactically_iana_charset_name ($) {
  my $name = shift;
  ## |!!| normalizes the result to a single boolean scalar (1 or '').
  return !!(defined ($name) && $name =~ /\A[\x20-\x7E]{1,40}\z/);
} # is_syntactically_iana_charset_name
513    
514     1;
515 wakaba 1.2 ## $Date: 2007/11/18 11:08:40 $
516 wakaba 1.1

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24