/[suikacvs]/messaging/manakai/lib/Message/MIME/Charset.pm
Suika

Contents of /messaging/manakai/lib/Message/MIME/Charset.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12 - (hide annotations) (download)
Sun Jul 21 03:25:23 2002 UTC (22 years, 4 months ago) by wakaba
Branch: MAIN
Changes since 1.11: +13 -3 lines
2002-07-20  Wakaba <w@suika.fam.cx>

	* Charset.pm (msname2iananame): New function.

1 wakaba 1.1
2     =head1 NAME
3    
4     Message::MIME::Charset Perl module
5    
6     =head1 DESCRIPTION
7    
8     Perl module for MIME charset.
9    
10     =cut
11    
12 wakaba 1.10 ## NOTE: You should not require/use other module (even it
13     ## is part of Message::* Perl Modules) as far as possible,
14     ## to be able to use this module (M::M::Charset) from
15     ## other (non-Message::*) modules.
16    
17 wakaba 1.1 package Message::MIME::Charset;
18     use strict;
19 wakaba 1.12 use vars qw(%CHARSET %MSNAME2IANANAME %REG $VERSION);
20     $VERSION=do{my @r=(q$Revision: 1.11 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
21 wakaba 1.1
22 wakaba 1.9 &_builtin_charset;
23     sub _builtin_charset () {
24 wakaba 1.4
25     $CHARSET{'*DEFAULT'} = {
26     preferred_name => '',
27    
28     encoder => sub { $_[1] },
29     decoder => sub { $_[1] },
30    
31     mime_text => 1, ## Suitability in use as MIME text/* charset
32     #accept_cte => [qw/7bit .../],
33     cte_7bit_preferred => 'quoted-printable',
34     };
35     $CHARSET{'*default'} = $CHARSET{'*DEFAULT'};
36    
37     $CHARSET{'us-ascii'} = {
38     preferred_name => 'us-ascii',
39    
40     encoder => sub { $_[1] },
41     decoder => sub { $_[1] },
42 wakaba 1.5
43     mime_text => 1,
44 wakaba 1.4 };
45    
46     $CHARSET{'iso-2022-int-1'} = {
47     preferred_name => 'iso-2022-int-1',
48    
49     encoder => sub { $_[1] },
50     decoder => sub { $_[1] },
51 wakaba 1.5
52     mime_text => 1,
53 wakaba 1.4 };
54    
55     $CHARSET{'unknown-8bit'} = {
56     preferred_name => 'unknown-8bit',
57    
58     encoder => sub { $_[1] },
59     decoder => sub { $_[1] },
60    
61     mime_text => 0,
62 wakaba 1.6 cte_7bit_preferred => 'base64',
63 wakaba 1.4 };
64     $CHARSET{'x-unknown'} = $CHARSET{'unknown-8bit'};
65    
66 wakaba 1.9 } # /builtin_charset
67    
68     my %_MINIMUMIZER = (
69     'euc-jp' => \&_name_euc_japan,
70     'euc-jisx0213' => \&_name_euc_japan,
71     'euc-jisx0213-plane1' => \&_name_euc_japan,
72     'x-euc-jisx0213-packed' => \&_name_euc_japan,
73 wakaba 1.11 'x-iso-2022' => \&_name_8bit_iso2022,
74     'iso-2022-cn' => \&_name_8bit_iso2022,
75     'iso-2022-cn-ext' => \&_name_8bit_iso2022,
76 wakaba 1.9 'iso-2022-int-1' => \&_name_net_ascii_8bit,
77 wakaba 1.11 'iso-2022-jp' => \&_name_8bit_iso2022,
78     'iso-2022-jp-1' => \&_name_8bit_iso2022,
79     'iso-2022-jp-2' => \&_name_8bit_iso2022,
80     'iso-2022-jp-3' => \&_name_8bit_iso2022,
81     'iso-2022-jp-3-plane1' => \&_name_8bit_iso2022,
82     'iso-2022-kr' => \&_name_8bit_iso2022,
83     'iso-8859-1' => \&_name_8bit_iso2022,
84 wakaba 1.9 'iso-10646-j-1' => \&_name_utf16be,
85     'iso-10646-ucs-2' => \&_name_utf16be,
86     'iso-10646-ucs-4' => \&_name_utf32be,
87     'iso-10646-ucs-basic' => \&_name_utf16be,
88     'iso-10646-unicode-latin1' => \&_name_utf16be,
89     jis_x0201 => \&_name_shift_jis,
90 wakaba 1.11 junet => \&_name_8bit_iso2022,
91 wakaba 1.9 'x-junet8' => \&_name_net_ascii_8bit,
92     shift_jis => \&_name_shift_jis,
93     shift_jisx0213 => \&_name_shift_jis,
94     'shift_jisx0213-plane1' => \&_name_shift_jis,
95     'x-sjis' => \&_name_shift_jis,
96     'us-ascii' => \&_name_net_ascii_8bit,
97     'utf-8' => \&_name_net_ascii_8bit,
98     'utf-16be' => \&_name_utf16be,
99     'utf-32be' => \&_name_utf32be,
100     );
101    
102 wakaba 1.12 %MSNAME2IANANAME = (
103     'iso-2022-jp' => 'x-iso2022jp-cp932',
104     'ks_c_5601-1987' => 'windows-949',
105     );
106    
107 wakaba 1.4 sub make_charset ($%) {
108     my $name = shift;
109     return unless $name; ## Note: charset "0" is not supported.
110     my %definition = @_;
111 wakaba 1.6
112     $definition{preferred_name} ||= $name;
113 wakaba 1.4 if ($definition{preferred_name} ne $name
114     && ref $CHARSET{$definition{preferred_name}}) {
115     ## New charset is an alias of defined charset,
116     $CHARSET{$name} = $CHARSET{$definition{preferred_name}};
117     return;
118     } elsif ($definition{alias_of} && ref $CHARSET{$definition{alias_of}}) {
119     ## New charset is an alias of defined charset,
120     $CHARSET{$name} = $CHARSET{$definition{alias_of}};
121     return;
122     }
123     $CHARSET{$name} = \%definition;
124    
125     ## Set default values
126 wakaba 1.6 #$definition{encoder} ||= sub { $_[1] };
127     #$definition{decoder} ||= sub { $_[1] };
128 wakaba 1.4
129     $definition{mime_text} = 0 unless defined $definition{mime_text};
130     $definition{cte_7bit_preferred} = 'base64'
131     unless defined $definition{cte_7bit_preferred};
132     }
133 wakaba 1.1
134     sub encode ($$) {
135     my ($charset, $s) = (lc shift, shift);
136 wakaba 1.4 if (ref $CHARSET{$charset}->{encoder}) {
137 wakaba 1.5 return (&{$CHARSET{$charset}->{encoder}} ($charset, $s), success => 1);
138 wakaba 1.1 }
139 wakaba 1.5 ($s, success => 0);
140 wakaba 1.1 }
141    
142     sub decode ($$) {
143     my ($charset, $s) = (lc shift, shift);
144 wakaba 1.4 if (ref $CHARSET{$charset}->{decoder}) {
145     return (&{$CHARSET{$charset}->{decoder}} ($charset, $s), 1);
146 wakaba 1.1 }
147 wakaba 1.2 ($s, 0);
148 wakaba 1.1 }
149    
150     sub name_normalize ($) {
151     my $name = lc shift;
152 wakaba 1.4 $CHARSET{$name}->{preferred_name} || $name;
153 wakaba 1.1 }
154    
155 wakaba 1.6 sub name_minimumize ($$) {
156     my ($charset, $s) = (lc shift, shift);
157 wakaba 1.9 if (ref $CHARSET{$charset}->{name_minimumizer} eq 'CODE') {
158 wakaba 1.6 return &{$CHARSET{$charset}->{name_minimumizer}} ($charset, $s);
159 wakaba 1.9 } elsif (ref $_MINIMUMIZER{$charset}) {
160     return &{$_MINIMUMIZER{$charset}} ($charset, $s);
161     }
162     (charset => $charset);
163     }
164    
165 wakaba 1.12 sub msname2iananame ($) {
166     my $mscharset = shift;
167     $MSNAME2IANANAME{$mscharset} || $mscharset;
168     }
169    
170 wakaba 1.9 sub _name_7bit_iso2022 ($$) {shift;
171     my $s = shift;
172     if ($s =~ /[\x0E\x0F\x1B]/) {
173     return (charset => 'iso-2022-jp')
174     unless $s =~ /\x1B[^\x24\x28]
175     |\x1B\x24[^\x40B]
176     |\x1B\x28[^BJ]
177     |\x0E|\x0F/x;
178     return (charset => 'iso-2022-jp-1')
179     unless $s =~ /\x1B[^\x24\x28]
180     |\x1B\x24[^\x40B\x28]
181     |\x1B\x24\x28[^D]
182     |\x1B\x28[^BJ]
183     |\x0E|\x0F/x;
184     return (charset => 'iso-2022-jp-3-plane1')
185     unless $s =~ /\x1B[^\x24\x28]
186     |\x1B\x24[^\x28] #[^B\x28]
187     |\x1B\x24\x28[^O]
188     |\x1B\x28[^B]
189     |\x0E|\x0F/x;
190     return (charset => 'iso-2022-jp-3')
191     unless $s =~ /\x1B[^\x24\x28]
192     |\x1B\x24[^\x28] #[^B\x28]
193     |\x1B\x24\x28[^OP]
194     |\x1B\x28[^B]
195     |\x0E|\x0F/x;
196     return (charset => 'iso-2022-kr')
197     unless $s =~ /\x1B[^\x24]
198     |\x1B\x24[^\x29]
199     |\x1B\x24\x29[^C]/x;
200     return (charset => 'iso-2022-jp-2')
201     unless $s =~ /\x1B[^\x24\x28\x2E\x4E]
202     |\x1B\x24[^\x40AB\x28]
203     |\x1B\x24\x28[^CD]
204     |\x1B\x28[^BJ]
205     |\x1B\x2E[^AF]
206     |\x0E|\x0F/x;
207     return (charset => 'iso-2022-cn')
208     unless $s =~ /\x1B[^\x4E\x24]
209     |\x1B\x24[^\x29\x2A]
210     |\x1B\x24\x29[^AG]
211     |\x1B\x24\x2A[^H]/x;
212     return (charset => 'iso-2022-cn-ext')
213     unless $s =~ /\x1B[^\x4E\x4F\x24]
214     |\x1B\x24[^\x29\x2A]
215     |\x1B\x24\x29[^AEG]
216     |\x1B\x24\x2A[^HIJKLM]/x;
217     return (charset => 'iso-2022-int-1')
218     unless $s =~ /\x1B[^\x24\x28\x2D]
219     |\x1B\x24[^\x40AB\x28\x29]
220     |\x1B\x24\x28[^DGH]
221     |\x1B\x24\x29[^C]
222     |\x1B\x28[^BJ]
223     |\x1B\x2D[^AF]/x;
224     return (charset => 'junet')
225     unless $s =~ /\x1B[^\x24\x28\x2C]
226     |\x1B\x24[^\x28\x2C\x40-\x42]
227     |\x1B\x24[\x28\x2C][^\x20-\x7E]
228     |\x1B\x24[\x28\x2C][\x20-\x2F]+[^\x30-\x7E]
229     |\x1B[\x28\x2C][^\x20-\x7E]
230     |\x1B[\x28\x2C][\x20-\x2F]+[^\x30-\x7E]
231     |\x0E|\x0F/x;
232     return (charset => 'x-iso-2022');
233     } else {
234     return (charset => 'us-ascii');
235 wakaba 1.6 }
236     }
237    
238 wakaba 1.9 sub _name_net_ascii_8bit ($) {
239     my $name = shift; my $s = shift;
240 wakaba 1.8 return (charset => 'us-ascii') unless $s =~ /[\x1B\x0E\x0F\x80-\xFF]/;
241     if ($s =~ /[\x80-\xFF]/) {
242     if ($s =~ /[\xC0-\xFD][\x80-\xBF]*[\x80-\x8F]/) {
243     if ($s =~ /\x1B/) {
244 wakaba 1.9 return (charset => 'x-junet8'); ## junet + UTF-8
245 wakaba 1.8 } else {
246     return (charset => 'utf-8');
247     }
248     } elsif ($s =~ /\x1B/) {
249 wakaba 1.9 return (charset => 'x-iso-2022'); ## 8bit ISO 2022
250     } else {
251     return (charset => 'iso-8859-1');
252     }
253     } else { ## 7bit ISO 2022
254     return _name_7bit_iso2022 ($name, $s);
255     }
256     }
257    
258 wakaba 1.11 sub _name_8bit_iso2022 ($$) {
259 wakaba 1.9 my $name = shift; my $s = shift;
260     return (charset => 'us-ascii') unless $s =~ /[\x1B\x0E\x0F\x80-\xFF]/;
261     if ($s =~ /[\x80-\xFF]/) {
262     if ($s =~ /\x1B/) {
263     return (charset => 'x-iso-2022'); ## 8bit ISO 2022
264 wakaba 1.8 } else {
265     return (charset => 'iso-8859-1');
266     }
267 wakaba 1.9 } else { ## 7bit ISO 2022
268     return _name_7bit_iso2022 ($name, $s);
269     }
270     }
271    
272     ## Not completed.
273     ## TODO: gb18030, cn-gb-12345
274     ## TODO: _name_euc_gbf (cn-gb-12345, gb2312)
275     sub _name_euc_gb ($$) {
276     my $name = shift; my $s = shift;
277     if ($s =~ /[\x80-\xFF]/) {
278     if ($s =~ /
279     (?:\G|[\x00-\x3F\x7F\x80\xFF])
280     (?:[\xA1-\xA9\xB0-\xFE][\xA1-\xFE]
281     |[\x40-\x7E])*
282     (?:
283     [\x81-\xA0\xAA-\xAF][\x40-\xFE]
284     |[\xA1-\xFE][\x40-\xA0]
285     )
286     /x) {
287     (charset => 'gbk');
288     } elsif ($s =~ /
289     (?:\xA2[\xA1-\xAA]
290     |\xA6[\xE0-\xF5]
291     |\xA8[\xBB-\xC0]
292     )
293     (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z))
294     /x) {
295     (charset => 'gbk');
296     } elsif ($s =~ /
297     (?:\xA3\xE7|\xA7[\xDD-\xF2]
298     |\xA8[\xBB-\xC0]
299     |[\xAA-\xAF\xF8-\xFE][\xA1-\xFE]
300     )
301     (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z))
302     /x) {
303     (charset => 'cn-gb-isoir165', 'charset-edition' => 1992);
304     } elsif ($s =~ /\xEF\xF1 ## Typo bug of GB 2312
305     (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z))
306     /x) {
307     (charset => 'gb2312');
308     } else {
309     (charset => 'gb2312', 'charset-edition' => 1980);
310     }
311     } elsif ($s =~ /[\x0E\x0F]/) {
312     (charset => 'gb2312'); ## Actually, this is not "gb2312"
313     } else {
314 wakaba 1.11 _name_7bit_iso2022 ($name, $s);
315 wakaba 1.9 }
316     }
317    
318     sub _name_euc_japan ($$) {
319     my $name = shift; my $s = shift;
320     if ($s =~ /[\x80-\xFF]/) {
321     if ($s =~ /\x8F[\xA1\xA3-\xA5\xA8\xAC-\xAF\xEE-\xFE][\xA1-\xFE]/) {
322     if ($s =~ /\x8F[\xA2\xA6\xA7\xA9-\xAB\xB0-\xED][\xA1-\xFE]/) {
323     ## JIS X 0213 plane 2 + JIS X 0212
324     (charset => 'x-euc-jisx0213-packed');
325     } else {
326     (charset => 'euc-jisx0213');
327     }
328     } elsif ($s =~ m{(?<![\x8E\x8F]) ## Not G2/G3 character
329     (?: ## JIS X 0213:2000
330     [\xA9-\xAF\xF5-\xFE][\xA1-\xFE]
331     |\xA2[\xAF-\xB9\xC2-\xC9\xD1-\xDB\xE9-\xF1\xFA-\xFD]
332     |\xA3[\xA1-\xAF\xBA-\xC0\xDB-\xE0\xFB-\xFE]
333     |\xA4[\xF4-\xFE]|\xA5[\xF7-\xFE]
334     |\xA6[\xB9-\xC0\xD9-\xFE]|\xA7[\xC2-\xD0\xF2-\xFE]
335     |\xA8[\xC1-\xFE]|\xCF[\xD4-\xFE]|\xF4[\xA7-\xFE]
336     )
337     (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z))}x) {
338     if ($s =~ /\x8F/) { ## JIS X 0213 plane 1 + JIS X 0212
339     (charset => 'x-euc-jisx0213-packed');
340     } else {
341     (charset => 'euc-jisx0213-plane1');
342     }
343     } else {
344     (charset => 'euc-jp');
345     }
346     } elsif ($s =~ /\x0E|\x0F|\x1B[\x4E\x4F]/) {
347     (charset => 'euc-jisx0213'); ## Actually, this is not euc-japan
348     } else {
349 wakaba 1.11 _name_7bit_iso2022 ($name, $s);
350 wakaba 1.9 }
351     }
352    
353     sub _name_shift_jis ($$) {
354     my $name = shift; my $s = shift;
355     if ($s =~ /[\x80-\xFF]/) {
356     if ($s =~ /[\x0E\x0F\x1B]/) {
357     (charset => 'x-sjis');
358     } elsif ($s =~ /
359     (?:\G|[\x00-\x3F\x7F])
360     (?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]
361     |[\x40-\x7E\xA1-\xDF])*
362     [\xF0-\xFC][\x40-\x7E\x80-\xFC]
363     /x) {
364     (charset => 'shift_jisx0213');
365     } elsif ($s =~ /
366     (?:\G|[\x00-\x3F\x7F])
367     (?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]
368     |[\x40-\x7E\xA1-\xDF])*
369     (?:
370     [\x85-\x87\xEB-\xEF][\x40-\x7E\x80-\xFC]
371     |\x81[\xAD-\xB7\xC0-\xC7\xCF-\xD9\xE9-\xEF\xF8-\xFB]
372     |\x82[\x40-\x4E\x59-\x5F\x7A-\x80\x9B-\x9E\xF2-\xFC]
373     |\x83[\x97-\x9E\xB7-\xBE\xD7-\xFC]
374     |\x84[\x61-\x6F\x72-\x9E\xBF-\xFC]
375     |\x88[\x40-\x9E]|\x98[\x73-\x9E]|\xEA[\xA5-\xFC]
376     )
377     /x) {
378     (charset => 'shift_jisx0213-plane1');
379     } else {
380     (charset => 'shift_jis');
381     }
382     } elsif ($s =~ /[\x5C\x7E]/) {
383     if ($s =~ /\x1B\x0E\x0F/) {
384     (charset => 'x-sjis'); ## ISO 2022 with implied "ESC ( J"
385     ## BUG: "ESC ( B foobar\aaa ESC ( J aiueo" also matchs this
386     } else {
387     (charset => 'jis_x0201');
388     }
389     } else {
390 wakaba 1.11 _name_7bit_iso2022 ($name, $s);
391 wakaba 1.9 }
392     }
393    
394     sub _name_utf16be ($$) {
395     shift; my $s = shift;
396     if ($s =~ /[\xD8-\xDB][\x00-\xFF][\xDC-\xDF][\x00-\xFF]
397     (?=(?:[\x00-\xFF][\x00-\xFF])*\z)/sx) {
398     (charset => 'utf-16be');
399     } elsif ($s =~ /[\x01-\xFF][\x00-\xFF]
400     (?=(?:[\x00-\xFF][\x00-\xFF])*\z)/sx) {
401     if ($s =~ /([^\x00\x03\x04\x23\x25\x30\xFE\xFF]
402     [\x00-\xFF] # ^\x20\x22\x4E-\x9F\xF9\xFA
403     |\x03[^\x00-\x6F\xD0-\xFF]
404     #|\x20[^\x00-\x6F]
405     |\x25[^\x00-\x7F]
406     |\xFE[^\x30-\x4F]
407     |\xFF[^\x00-\xEF]
408     ## note 1 of RFC 1816 is ambitious, so block entire
409     ## is excepted
410     |\x30[\x00-\x3F]
411     )
412     (?=(?:[\x00-\xFF][\x00-\xFF])*\z)/sx) {
413     (charset => 'iso-10646-ucs-2');
414     } else {
415     (charset => 'iso-10646-j-1');
416     }
417     } elsif ($s =~ /\x00[\x80-\xFF]
418     (?=(?:[\x00-\xFF][\x00-\xFF])*\z)/sx) {
419     (charset => 'iso-10646-unicode-latin1');
420     } else {
421     (charset => 'iso-10646-ucs-basic');
422     }
423     }
424    
425     sub _name_utf32be ($$) {
426     shift; my $s = shift;
427     if ($s =~ /
428     ([\x01-\x7F][\x00-\xFF]{3}
429     |\x00[\x11-\xFF][\x00-\xFF][\x00-\xFF])
430     (?=(?:[\x00-\xFF]{4})*\z)/sx) {
431     (charset => 'iso-10646-ucs-4');
432     } else {
433     (charset => 'utf-32be');
434 wakaba 1.8 }
435     }
436    
437 wakaba 1.1 =head1 LICENSE
438    
439     Copyright 2002 wakaba E<lt>w@suika.fam.cxE<gt>.
440    
441     This program is free software; you can redistribute it and/or modify
442     it under the terms of the GNU General Public License as published by
443     the Free Software Foundation; either version 2 of the License, or
444     (at your option) any later version.
445    
446     This program is distributed in the hope that it will be useful,
447     but WITHOUT ANY WARRANTY; without even the implied warranty of
448     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
449     GNU General Public License for more details.
450    
451     You should have received a copy of the GNU General Public License
452     along with this program; see the file COPYING. If not, write to
453     the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
454     Boston, MA 02111-1307, USA.
455    
456     =head1 CHANGE
457    
458     See F<ChangeLog>.
459 wakaba 1.12 $Date: 2002/07/19 11:49:46 $
460 wakaba 1.1
461     =cut
462    
463     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24