/[suikacvs]/messaging/manakai/lib/Message/MIME/Charset.pm
Suika

Contents of /messaging/manakai/lib/Message/MIME/Charset.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (hide annotations) (download)
Sun Jun 23 12:16:10 2002 UTC (22 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.8: +298 -22 lines
2002-06-23  Wakaba <w@suika.fam.cx>

	* Charset.pm
	- New name_minimumizers. (Some are moved from 
	Message::MIME::Charset::Jcode.)
	- (%_MINIMUMIZER): New internal hash.
	- Use internal name-minimumizer (with %_MINIMUMIZER)
	if minimumizer is not defined in charset's definition.
	* EncodedWord.pm (decode_ccontent): Order of arguments
	is changed.
	* MediaType.pm: Typo fix.

1 wakaba 1.1
2     =head1 NAME
3    
4     Message::MIME::Charset Perl module
5    
6     =head1 DESCRIPTION
7    
8     Perl module for MIME charset.
9    
10     =cut
11    
12     package Message::MIME::Charset;
13     use strict;
14 wakaba 1.9 use vars qw(%CHARSET %REG $VERSION);
15     $VERSION=do{my @r=(q$Revision: 1.8 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
16 wakaba 1.1
17 wakaba 1.9 &_builtin_charset;
18     sub _builtin_charset () {
19 wakaba 1.4
20     $CHARSET{'*DEFAULT'} = {
21     preferred_name => '',
22    
23     encoder => sub { $_[1] },
24     decoder => sub { $_[1] },
25    
26     mime_text => 1, ## Suitability in use as MIME text/* charset
27     #accept_cte => [qw/7bit .../],
28     cte_7bit_preferred => 'quoted-printable',
29     };
30     $CHARSET{'*default'} = $CHARSET{'*DEFAULT'};
31    
32     $CHARSET{'us-ascii'} = {
33     preferred_name => 'us-ascii',
34    
35     encoder => sub { $_[1] },
36     decoder => sub { $_[1] },
37 wakaba 1.5
38     mime_text => 1,
39 wakaba 1.4 };
40    
41     $CHARSET{'iso-2022-int-1'} = {
42     preferred_name => 'iso-2022-int-1',
43    
44     encoder => sub { $_[1] },
45     decoder => sub { $_[1] },
46 wakaba 1.5
47     mime_text => 1,
48 wakaba 1.4 };
49    
50     $CHARSET{'unknown-8bit'} = {
51     preferred_name => 'unknown-8bit',
52    
53     encoder => sub { $_[1] },
54     decoder => sub { $_[1] },
55    
56     mime_text => 0,
57 wakaba 1.6 cte_7bit_preferred => 'base64',
58 wakaba 1.4 };
59     $CHARSET{'x-unknown'} = $CHARSET{'unknown-8bit'};
60    
61 wakaba 1.9 } # /builtin_charset
62    
63     my %_MINIMUMIZER = (
64     'euc-jp' => \&_name_euc_japan,
65     'euc-jisx0213' => \&_name_euc_japan,
66     'euc-jisx0213-plane1' => \&_name_euc_japan,
67     'x-euc-jisx0213-packed' => \&_name_euc_japan,
68     'x-iso-2022' => \&_name_8bit_iso_2022,
69     'iso-2022-cn' => \&_name_8bit_iso_2022,
70     'iso-2022-cn-ext' => \&_name_8bit_iso_2022,
71     'iso-2022-int-1' => \&_name_net_ascii_8bit,
72     'iso-2022-jp' => \&_name_8bit_iso_2022,
73     'iso-2022-jp-1' => \&_name_8bit_iso_2022,
74     'iso-2022-jp-2' => \&_name_8bit_iso_2022,
75     'iso-2022-jp-3' => \&_name_8bit_iso_2022,
76     'iso-2022-jp-3-plane1' => \&_name_8bit_iso_2022,
77     'iso-2022-kr' => \&_name_8bit_iso_2022,
78     'iso-8859-1' => \&_name_8bit_iso_2022,
79     'iso-10646-j-1' => \&_name_utf16be,
80     'iso-10646-ucs-2' => \&_name_utf16be,
81     'iso-10646-ucs-4' => \&_name_utf32be,
82     'iso-10646-ucs-basic' => \&_name_utf16be,
83     'iso-10646-unicode-latin1' => \&_name_utf16be,
84     jis_x0201 => \&_name_shift_jis,
85     junet => \&_name_8bit_iso_2022,
86     'x-junet8' => \&_name_net_ascii_8bit,
87     shift_jis => \&_name_shift_jis,
88     shift_jisx0213 => \&_name_shift_jis,
89     'shift_jisx0213-plane1' => \&_name_shift_jis,
90     'x-sjis' => \&_name_shift_jis,
91     'us-ascii' => \&_name_net_ascii_8bit,
92     'utf-8' => \&_name_net_ascii_8bit,
93     'utf-16be' => \&_name_utf16be,
94     'utf-32be' => \&_name_utf32be,
95     );
96    
97 wakaba 1.4 sub make_charset ($%) {
98     my $name = shift;
99     return unless $name; ## Note: charset "0" is not supported.
100     my %definition = @_;
101 wakaba 1.6
102     $definition{preferred_name} ||= $name;
103 wakaba 1.4 if ($definition{preferred_name} ne $name
104     && ref $CHARSET{$definition{preferred_name}}) {
105     ## New charset is an alias of defined charset,
106     $CHARSET{$name} = $CHARSET{$definition{preferred_name}};
107     return;
108     } elsif ($definition{alias_of} && ref $CHARSET{$definition{alias_of}}) {
109     ## New charset is an alias of defined charset,
110     $CHARSET{$name} = $CHARSET{$definition{alias_of}};
111     return;
112     }
113     $CHARSET{$name} = \%definition;
114    
115     ## Set default values
116 wakaba 1.6 #$definition{encoder} ||= sub { $_[1] };
117     #$definition{decoder} ||= sub { $_[1] };
118 wakaba 1.4
119     $definition{mime_text} = 0 unless defined $definition{mime_text};
120     $definition{cte_7bit_preferred} = 'base64'
121     unless defined $definition{cte_7bit_preferred};
122     }
123 wakaba 1.1
124     sub encode ($$) {
125     my ($charset, $s) = (lc shift, shift);
126 wakaba 1.4 if (ref $CHARSET{$charset}->{encoder}) {
127 wakaba 1.5 return (&{$CHARSET{$charset}->{encoder}} ($charset, $s), success => 1);
128 wakaba 1.1 }
129 wakaba 1.5 ($s, success => 0);
130 wakaba 1.1 }
131    
132     sub decode ($$) {
133     my ($charset, $s) = (lc shift, shift);
134 wakaba 1.4 if (ref $CHARSET{$charset}->{decoder}) {
135     return (&{$CHARSET{$charset}->{decoder}} ($charset, $s), 1);
136 wakaba 1.1 }
137 wakaba 1.2 ($s, 0);
138 wakaba 1.1 }
139    
140     sub name_normalize ($) {
141     my $name = lc shift;
142 wakaba 1.4 $CHARSET{$name}->{preferred_name} || $name;
143 wakaba 1.1 }
144    
145 wakaba 1.6 sub name_minimumize ($$) {
146     my ($charset, $s) = (lc shift, shift);
147 wakaba 1.9 if (ref $CHARSET{$charset}->{name_minimumizer} eq 'CODE') {
148 wakaba 1.6 return &{$CHARSET{$charset}->{name_minimumizer}} ($charset, $s);
149 wakaba 1.9 } elsif (ref $_MINIMUMIZER{$charset}) {
150     return &{$_MINIMUMIZER{$charset}} ($charset, $s);
151     }
152     (charset => $charset);
153     }
154    
155     sub _name_7bit_iso2022 ($$) {shift;
156     my $s = shift;
157     if ($s =~ /[\x0E\x0F\x1B]/) {
158     return (charset => 'iso-2022-jp')
159     unless $s =~ /\x1B[^\x24\x28]
160     |\x1B\x24[^\x40B]
161     |\x1B\x28[^BJ]
162     |\x0E|\x0F/x;
163     return (charset => 'iso-2022-jp-1')
164     unless $s =~ /\x1B[^\x24\x28]
165     |\x1B\x24[^\x40B\x28]
166     |\x1B\x24\x28[^D]
167     |\x1B\x28[^BJ]
168     |\x0E|\x0F/x;
169     return (charset => 'iso-2022-jp-3-plane1')
170     unless $s =~ /\x1B[^\x24\x28]
171     |\x1B\x24[^\x28] #[^B\x28]
172     |\x1B\x24\x28[^O]
173     |\x1B\x28[^B]
174     |\x0E|\x0F/x;
175     return (charset => 'iso-2022-jp-3')
176     unless $s =~ /\x1B[^\x24\x28]
177     |\x1B\x24[^\x28] #[^B\x28]
178     |\x1B\x24\x28[^OP]
179     |\x1B\x28[^B]
180     |\x0E|\x0F/x;
181     return (charset => 'iso-2022-kr')
182     unless $s =~ /\x1B[^\x24]
183     |\x1B\x24[^\x29]
184     |\x1B\x24\x29[^C]/x;
185     return (charset => 'iso-2022-jp-2')
186     unless $s =~ /\x1B[^\x24\x28\x2E\x4E]
187     |\x1B\x24[^\x40AB\x28]
188     |\x1B\x24\x28[^CD]
189     |\x1B\x28[^BJ]
190     |\x1B\x2E[^AF]
191     |\x0E|\x0F/x;
192     return (charset => 'iso-2022-cn')
193     unless $s =~ /\x1B[^\x4E\x24]
194     |\x1B\x24[^\x29\x2A]
195     |\x1B\x24\x29[^AG]
196     |\x1B\x24\x2A[^H]/x;
197     return (charset => 'iso-2022-cn-ext')
198     unless $s =~ /\x1B[^\x4E\x4F\x24]
199     |\x1B\x24[^\x29\x2A]
200     |\x1B\x24\x29[^AEG]
201     |\x1B\x24\x2A[^HIJKLM]/x;
202     return (charset => 'iso-2022-int-1')
203     unless $s =~ /\x1B[^\x24\x28\x2D]
204     |\x1B\x24[^\x40AB\x28\x29]
205     |\x1B\x24\x28[^DGH]
206     |\x1B\x24\x29[^C]
207     |\x1B\x28[^BJ]
208     |\x1B\x2D[^AF]/x;
209     return (charset => 'junet')
210     unless $s =~ /\x1B[^\x24\x28\x2C]
211     |\x1B\x24[^\x28\x2C\x40-\x42]
212     |\x1B\x24[\x28\x2C][^\x20-\x7E]
213     |\x1B\x24[\x28\x2C][\x20-\x2F]+[^\x30-\x7E]
214     |\x1B[\x28\x2C][^\x20-\x7E]
215     |\x1B[\x28\x2C][\x20-\x2F]+[^\x30-\x7E]
216     |\x0E|\x0F/x;
217     return (charset => 'x-iso-2022');
218     } else {
219     return (charset => 'us-ascii');
220 wakaba 1.6 }
221     }
222    
223 wakaba 1.9 sub _name_net_ascii_8bit ($) {
224     my $name = shift; my $s = shift;
225 wakaba 1.8 return (charset => 'us-ascii') unless $s =~ /[\x1B\x0E\x0F\x80-\xFF]/;
226     if ($s =~ /[\x80-\xFF]/) {
227     if ($s =~ /[\xC0-\xFD][\x80-\xBF]*[\x80-\x8F]/) {
228     if ($s =~ /\x1B/) {
229 wakaba 1.9 return (charset => 'x-junet8'); ## junet + UTF-8
230 wakaba 1.8 } else {
231     return (charset => 'utf-8');
232     }
233     } elsif ($s =~ /\x1B/) {
234 wakaba 1.9 return (charset => 'x-iso-2022'); ## 8bit ISO 2022
235     } else {
236     return (charset => 'iso-8859-1');
237     }
238     } else { ## 7bit ISO 2022
239     return _name_7bit_iso2022 ($name, $s);
240     }
241     }
242    
243     sub _name_8bit_iso_2022 ($$) {
244     my $name = shift; my $s = shift;
245     return (charset => 'us-ascii') unless $s =~ /[\x1B\x0E\x0F\x80-\xFF]/;
246     if ($s =~ /[\x80-\xFF]/) {
247     if ($s =~ /\x1B/) {
248     return (charset => 'x-iso-2022'); ## 8bit ISO 2022
249 wakaba 1.8 } else {
250     return (charset => 'iso-8859-1');
251     }
252 wakaba 1.9 } else { ## 7bit ISO 2022
253     return _name_7bit_iso2022 ($name, $s);
254     }
255     }
256    
257     ## Not completed.
258     ## TODO: gb18030, cn-gb-12345
259     ## TODO: _name_euc_gbf (cn-gb-12345, gb2312)
260     sub _name_euc_gb ($$) {
261     my $name = shift; my $s = shift;
262     if ($s =~ /[\x80-\xFF]/) {
263     if ($s =~ /
264     (?:\G|[\x00-\x3F\x7F\x80\xFF])
265     (?:[\xA1-\xA9\xB0-\xFE][\xA1-\xFE]
266     |[\x40-\x7E])*
267     (?:
268     [\x81-\xA0\xAA-\xAF][\x40-\xFE]
269     |[\xA1-\xFE][\x40-\xA0]
270     )
271     /x) {
272     (charset => 'gbk');
273     } elsif ($s =~ /
274     (?:\xA2[\xA1-\xAA]
275     |\xA6[\xE0-\xF5]
276     |\xA8[\xBB-\xC0]
277     )
278     (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z))
279     /x) {
280     (charset => 'gbk');
281     } elsif ($s =~ /
282     (?:\xA3\xE7|\xA7[\xDD-\xF2]
283     |\xA8[\xBB-\xC0]
284     |[\xAA-\xAF\xF8-\xFE][\xA1-\xFE]
285     )
286     (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z))
287     /x) {
288     (charset => 'cn-gb-isoir165', 'charset-edition' => 1992);
289     } elsif ($s =~ /\xEF\xF1 ## Typo bug of GB 2312
290     (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z))
291     /x) {
292     (charset => 'gb2312');
293     } else {
294     (charset => 'gb2312', 'charset-edition' => 1980);
295     }
296     } elsif ($s =~ /[\x0E\x0F]/) {
297     (charset => 'gb2312'); ## Actually, this is not "gb2312"
298     } else {
299     _name_7bit_iso_2022 ($name, $s);
300     }
301     }
302    
303     sub _name_euc_japan ($$) {
304     my $name = shift; my $s = shift;
305     if ($s =~ /[\x80-\xFF]/) {
306     if ($s =~ /\x8F[\xA1\xA3-\xA5\xA8\xAC-\xAF\xEE-\xFE][\xA1-\xFE]/) {
307     if ($s =~ /\x8F[\xA2\xA6\xA7\xA9-\xAB\xB0-\xED][\xA1-\xFE]/) {
308     ## JIS X 0213 plane 2 + JIS X 0212
309     (charset => 'x-euc-jisx0213-packed');
310     } else {
311     (charset => 'euc-jisx0213');
312     }
313     } elsif ($s =~ m{(?<![\x8E\x8F]) ## Not G2/G3 character
314     (?: ## JIS X 0213:2000
315     [\xA9-\xAF\xF5-\xFE][\xA1-\xFE]
316     |\xA2[\xAF-\xB9\xC2-\xC9\xD1-\xDB\xE9-\xF1\xFA-\xFD]
317     |\xA3[\xA1-\xAF\xBA-\xC0\xDB-\xE0\xFB-\xFE]
318     |\xA4[\xF4-\xFE]|\xA5[\xF7-\xFE]
319     |\xA6[\xB9-\xC0\xD9-\xFE]|\xA7[\xC2-\xD0\xF2-\xFE]
320     |\xA8[\xC1-\xFE]|\xCF[\xD4-\xFE]|\xF4[\xA7-\xFE]
321     )
322     (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z))}x) {
323     if ($s =~ /\x8F/) { ## JIS X 0213 plane 1 + JIS X 0212
324     (charset => 'x-euc-jisx0213-packed');
325     } else {
326     (charset => 'euc-jisx0213-plane1');
327     }
328     } else {
329     (charset => 'euc-jp');
330     }
331     } elsif ($s =~ /\x0E|\x0F|\x1B[\x4E\x4F]/) {
332     (charset => 'euc-jisx0213'); ## Actually, this is not euc-japan
333     } else {
334     _name_7bit_iso_2022 ($name, $s);
335     }
336     }
337    
338     sub _name_shift_jis ($$) {
339     my $name = shift; my $s = shift;
340     if ($s =~ /[\x80-\xFF]/) {
341     if ($s =~ /[\x0E\x0F\x1B]/) {
342     (charset => 'x-sjis');
343     } elsif ($s =~ /
344     (?:\G|[\x00-\x3F\x7F])
345     (?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]
346     |[\x40-\x7E\xA1-\xDF])*
347     [\xF0-\xFC][\x40-\x7E\x80-\xFC]
348     /x) {
349     (charset => 'shift_jisx0213');
350     } elsif ($s =~ /
351     (?:\G|[\x00-\x3F\x7F])
352     (?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]
353     |[\x40-\x7E\xA1-\xDF])*
354     (?:
355     [\x85-\x87\xEB-\xEF][\x40-\x7E\x80-\xFC]
356     |\x81[\xAD-\xB7\xC0-\xC7\xCF-\xD9\xE9-\xEF\xF8-\xFB]
357     |\x82[\x40-\x4E\x59-\x5F\x7A-\x80\x9B-\x9E\xF2-\xFC]
358     |\x83[\x97-\x9E\xB7-\xBE\xD7-\xFC]
359     |\x84[\x61-\x6F\x72-\x9E\xBF-\xFC]
360     |\x88[\x40-\x9E]|\x98[\x73-\x9E]|\xEA[\xA5-\xFC]
361     )
362     /x) {
363     (charset => 'shift_jisx0213-plane1');
364     } else {
365     (charset => 'shift_jis');
366     }
367     } elsif ($s =~ /[\x5C\x7E]/) {
368     if ($s =~ /\x1B\x0E\x0F/) {
369     (charset => 'x-sjis'); ## ISO 2022 with implied "ESC ( J"
370     ## BUG: "ESC ( B foobar\aaa ESC ( J aiueo" also matchs this
371     } else {
372     (charset => 'jis_x0201');
373     }
374     } else {
375     _name_7bit_iso_2022 ($name, $s);
376     }
377     }
378    
379     sub _name_utf16be ($$) {
380     shift; my $s = shift;
381     if ($s =~ /[\xD8-\xDB][\x00-\xFF][\xDC-\xDF][\x00-\xFF]
382     (?=(?:[\x00-\xFF][\x00-\xFF])*\z)/sx) {
383     (charset => 'utf-16be');
384     } elsif ($s =~ /[\x01-\xFF][\x00-\xFF]
385     (?=(?:[\x00-\xFF][\x00-\xFF])*\z)/sx) {
386     if ($s =~ /([^\x00\x03\x04\x23\x25\x30\xFE\xFF]
387     [\x00-\xFF] # ^\x20\x22\x4E-\x9F\xF9\xFA
388     |\x03[^\x00-\x6F\xD0-\xFF]
389     #|\x20[^\x00-\x6F]
390     |\x25[^\x00-\x7F]
391     |\xFE[^\x30-\x4F]
392     |\xFF[^\x00-\xEF]
393     ## note 1 of RFC 1816 is ambitious, so block entire
394     ## is excepted
395     |\x30[\x00-\x3F]
396     )
397     (?=(?:[\x00-\xFF][\x00-\xFF])*\z)/sx) {
398     (charset => 'iso-10646-ucs-2');
399     } else {
400     (charset => 'iso-10646-j-1');
401     }
402     } elsif ($s =~ /\x00[\x80-\xFF]
403     (?=(?:[\x00-\xFF][\x00-\xFF])*\z)/sx) {
404     (charset => 'iso-10646-unicode-latin1');
405     } else {
406     (charset => 'iso-10646-ucs-basic');
407     }
408     }
409    
410     sub _name_utf32be ($$) {
411     shift; my $s = shift;
412     if ($s =~ /
413     ([\x01-\x7F][\x00-\xFF]{3}
414     |\x00[\x11-\xFF][\x00-\xFF][\x00-\xFF])
415     (?=(?:[\x00-\xFF]{4})*\z)/sx) {
416     (charset => 'iso-10646-ucs-4');
417     } else {
418     (charset => 'utf-32be');
419 wakaba 1.8 }
420     }
421    
422 wakaba 1.1 =head1 LICENSE
423    
424     Copyright 2002 wakaba E<lt>w@suika.fam.cxE<gt>.
425    
426     This program is free software; you can redistribute it and/or modify
427     it under the terms of the GNU General Public License as published by
428     the Free Software Foundation; either version 2 of the License, or
429     (at your option) any later version.
430    
431     This program is distributed in the hope that it will be useful,
432     but WITHOUT ANY WARRANTY; without even the implied warranty of
433     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
434     GNU General Public License for more details.
435    
436     You should have received a copy of the GNU General Public License
437     along with this program; see the file COPYING. If not, write to
438     the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
439     Boston, MA 02111-1307, USA.
440    
441     =head1 CHANGE
442    
443     See F<ChangeLog>.
444 wakaba 1.9 $Date: 2002/06/16 10:45:54 $
445 wakaba 1.1
446     =cut
447    
448     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24