/[suikacvs]/messaging/manakai/lib/Message/MIME/Charset.pm
Suika

Contents of /messaging/manakai/lib/Message/MIME/Charset.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.17 - (show annotations) (download)
Sun Aug 18 06:22:36 2002 UTC (22 years, 3 months ago) by wakaba
Branch: MAIN
CVS Tags: msg-0-1
Branch point for: stable
Changes since 1.16: +5 -52 lines
2002-08-18  Wakaba <w@suika.fam.cx>

	* Charset.pm (_name_utf16be, _name_utf32be): Removed.
	(Moved to Message::MIME::Charset::MinName)

1
2 =head1 NAME
3
4 Message::MIME::Charset Perl module
5
6 =head1 DESCRIPTION
7
8 Perl module for MIME charset.
9
10 =cut
11
12 ## NOTE: You should not require/use other module (even it
13 ## is part of Message::* Perl Modules) as far as possible,
14 ## to be able to use this module (M::M::Charset) from
15 ## other (non-Message::*) modules.
16
17 package Message::MIME::Charset;
18 use strict;
19 use vars qw(%CHARSET %MSNAME2IANANAME %REG $VERSION);
20 $VERSION=do{my @r=(q$Revision: 1.17 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
21
22 &_builtin_charset;
23 sub _builtin_charset () {
24
25 $CHARSET{'*DEFAULT'} = {
26 preferred_name => '',
27
28 encoder => sub { $_[1] },
29 decoder => sub { $_[1] },
30
31 mime_text => 1, ## Suitability in use as MIME text/* charset
32 #accept_cte => [qw/7bit .../],
33 cte_7bit_preferred => 'quoted-printable',
34 };
35 $CHARSET{'*default'} = $CHARSET{'*DEFAULT'};
36
37 $CHARSET{'us-ascii'} = {
38 preferred_name => 'us-ascii',
39
40 encoder => sub { $_[1] },
41 decoder => sub { $_[1] },
42
43 mime_text => 1,
44 cte_7bit_preferred => 'quoted-printable',
45 };
46
47 $CHARSET{'iso-2022-int-1'} = {
48 preferred_name => 'iso-2022-int-1',
49
50 encoder => sub { $_[1] },
51 decoder => sub { $_[1] },
52
53 mime_text => 1,
54 };
55
56 $CHARSET{'unknown-8bit'} = {
57 preferred_name => 'unknown-8bit',
58
59 encoder => sub { $_[1] },
60 decoder => sub { $_[1] },
61
62 mime_text => 1,
63 cte_7bit_preferred => 'base64',
64 };
65
66 $CHARSET{'x-unknown'} = {
67 preferred_name => 'x-unknown',
68
69 encoder => sub { $_[1] },
70 decoder => sub { $_[1] },
71
72 mime_text => 0,
73 cte_7bit_preferred => 'base64',
74 };
75
76 $CHARSET{'*undef'} = {
77 preferred_name => '',
78
79 #encoder => sub { $_[1] },
80 #decoder => sub { $_[1] },
81
82 mime_text => 0,
83 cte_7bit_preferred => 'base64',
84 };
85
86 } # /builtin_charset
87
88 my %_MINIMUMIZER = (
89 'euc-jp' => \&_name_euc_japan,
90 'euc-jisx0213' => \&_name_euc_japan,
91 'euc-jisx0213-plane1' => \&_name_euc_japan,
92 'x-euc-jisx0213-packed' => \&_name_euc_japan,
93 'x-iso-2022' => \&_name_8bit_iso2022,
94 'iso-2022-cn' => \&_name_8bit_iso2022,
95 'iso-2022-cn-ext' => \&_name_8bit_iso2022,
96 'iso-2022-int-1' => \&_name_net_ascii_8bit,
97 'iso-2022-jp' => \&_name_8bit_iso2022,
98 'iso-2022-jp-1' => \&_name_8bit_iso2022,
99 'iso-2022-jp-2' => \&_name_8bit_iso2022,
100 'iso-2022-jp-3' => \&_name_8bit_iso2022,
101 'iso-2022-jp-3-plane1' => \&_name_8bit_iso2022,
102 'iso-2022-kr' => \&_name_8bit_iso2022,
103 'iso-8859-1' => \&_name_8bit_iso2022,
104 jis_x0201 => \&_name_shift_jis,
105 junet => \&_name_8bit_iso2022,
106 'x-junet8' => \&_name_net_ascii_8bit,
107 shift_jis => \&_name_shift_jis,
108 shift_jisx0213 => \&_name_shift_jis,
109 'shift_jisx0213-plane1' => \&_name_shift_jis,
110 'x-sjis' => \&_name_shift_jis,
111 'us-ascii' => \&_name_net_ascii_8bit,
112 'utf-8' => \&_name_net_ascii_8bit,
113 );
114
115 my %_IsMimeText;
116 for (qw(
117 adobe-standard-encoding adobe-symbol-encoding
118 big5 big5-eten big5-hkscs
119 cp950
120 gbk gb18030
121 euc-jp euc-jisx0213 euc-kr euc-tw
122 hp-roman8
123 hz-gb-2312
124 ibm437
125 junet x-junet8 x-iso-2022
126 iso-2022-cn iso-2022-cn-ext
127 iso-2022-int-1
128 iso-2022-jp iso-2022-jp-1 iso-2022-jp-2 iso-2022-jp-3
129 x-iso2022jp-cp932
130 iso-2022-kr
131 iso-8859-1 iso-8859-2 iso-8859-3
132 iso-8859-4 iso-8859-5 iso-8859-6
133 iso-8859-7 iso-8859-8 iso-8859-9
134 iso-8859-10 iso-8859-12 iso-8859-13
135 iso-8859-14 iso-8859-15 iso-8859-16
136 jis_encoding
137 koi8-r koi8-u
138 x-mac-arabic x-mac-centralroman x-mac-cyrillic x-mac-greek
139 x-mac-hebrew x-mac-icelandic macintosh x-mac-turkish
140 x-mac-ukrainian x-mac-chinesesimp x-mac-japanese x-mac-korean
141 shift_jis shift_jisx0213 x-sjis
142 tis-620
143 unicode-1-1-utf-7 unicode-1-1-utf-8
144 unicode-2-0-utf-7 unicode-2-0-utf-8
145 utf-7 utf-8 utf-9
146 viscii
147 windows-1250 windows-1251 windows-1252 windows-1253
148 windows-1254 windows-1255 windows-1256 windows-1257
149 windows-1258 windows-31j windows-949
150 )) { $_IsMimeText{$_} = 1 }
151
152 %MSNAME2IANANAME = (
153 'iso-2022-jp' => 'x-iso2022jp-cp932',
154 'ks_c_5601-1987' => 'windows-949',
155 );
156
157 sub make_charset ($%) {
158 my $name = shift;
159 return unless $name; ## Note: charset "0" is not supported.
160 my %definition = @_;
161
162 $definition{preferred_name} ||= $name;
163 if ($definition{preferred_name} ne $name
164 && ref $CHARSET{$definition{preferred_name}}) {
165 ## New charset is an alias of defined charset,
166 $CHARSET{$name} = $CHARSET{$definition{preferred_name}};
167 return;
168 } elsif ($definition{alias_of} && ref $CHARSET{$definition{alias_of}}) {
169 ## New charset is an alias of defined charset,
170 $CHARSET{$name} = $CHARSET{$definition{alias_of}};
171 return;
172 }
173 $CHARSET{$name} = \%definition;
174
175 ## Set default values
176 #$definition{encoder} ||= sub { $_[1] };
177 #$definition{decoder} ||= sub { $_[1] };
178
179 $definition{mime_text} = 0 unless defined $definition{mime_text};
180 $definition{cte_7bit_preferred} = 'base64'
181 unless defined $definition{cte_7bit_preferred};
182 }
183
184 sub encode ($$) {
185 my ($charset, $s) = (lc shift, shift);
186 my $c = ref $CHARSET{$charset}->{encoder}? $charset: '*undef';
187 if (ref $CHARSET{$c}->{encoder}) {
188 my ($t, %r) = &{$CHARSET{$c}->{encoder}} ($charset, $s);
189 unless (defined $r{success}) {
190 $r{success} = 1;
191 }
192 return ($t, %r);
193 }
194 ($s, success => 0);
195 }
196
197 sub decode ($$) {
198 my ($charset, $s) = (lc shift, shift);
199 my $c = ref $CHARSET{$charset}->{decoder}? $charset: '*undef';
200 if (ref $CHARSET{$c}->{decoder}) {
201 my ($t, %r) = &{$CHARSET{$c}->{decoder}} ($charset, $s);
202 unless (defined $r{success}) {
203 $r{success} = 1;
204 }
205 return ($t, %r);
206 }
207 ($s, success => 0);
208 }
209
210 sub name_normalize ($) {
211 my $name = lc shift;
212 if (ref $CHARSET{$name}->{preferred_name} eq 'CODE') {
213 return &{ $CHARSET{$name}->{preferred_name} } ($name);
214 } elsif ($CHARSET{$name}->{preferred_name}) {
215 return $CHARSET{$name}->{preferred_name};
216 } elsif (ref $CHARSET{'*undef'}->{preferred_name} eq 'CODE') {
217 return &{ $CHARSET{'*undef'}->{preferred_name} } ($name);
218 }
219 $name;
220 }
221
222 sub name_minimumize ($$) {
223 require Message::MIME::Charset::MinName;
224 my ($charset, $s) = (lc shift, shift);
225 if (ref $CHARSET{$charset}->{name_minimumizer} eq 'CODE') {
226 return &{$CHARSET{$charset}->{name_minimumizer}} ($charset, $s);
227 } elsif (ref $Message::MIME::Charset::MinName::MIN{$charset}) {
228 return &{$Message::MIME::Charset::MinName::MIN{$charset}} ($charset, $s);
229 } elsif (ref $_MINIMUMIZER{$charset}) {
230 return &{$_MINIMUMIZER{$charset}} ($charset, $s);
231 } elsif (ref $CHARSET{'*undef'}->{name_minimumizer} eq 'CODE') {
232 return &{$CHARSET{'*undef'}->{name_minimumizer}} ($charset, $s);
233 }
234 (charset => $charset);
235 }
236
237 sub msname2iananame ($) {
238 my $mscharset = shift;
239 $MSNAME2IANANAME{$mscharset} || $mscharset;
240 }
241
242 sub _name_7bit_iso2022 ($$) {shift;
243 my $s = shift;
244 if ($s =~ /[\x0E\x0F\x1B]/) {
245 return (charset => 'iso-2022-jp')
246 unless $s =~ /\x1B[^\x24\x28]
247 |\x1B\x24[^\x40B]
248 |\x1B\x28[^BJ]
249 |\x0E|\x0F/x;
250 return (charset => 'iso-2022-jp-1')
251 unless $s =~ /\x1B[^\x24\x28]
252 |\x1B\x24[^\x40B\x28]
253 |\x1B\x24\x28[^D]
254 |\x1B\x28[^BJ]
255 |\x0E|\x0F/x;
256 return (charset => 'iso-2022-jp-3-plane1')
257 unless $s =~ /\x1B[^\x24\x28]
258 |\x1B\x24[^\x28] #[^B\x28]
259 |\x1B\x24\x28[^O]
260 |\x1B\x28[^B]
261 |\x0E|\x0F/x;
262 return (charset => 'iso-2022-jp-3')
263 unless $s =~ /\x1B[^\x24\x28]
264 |\x1B\x24[^\x28] #[^B\x28]
265 |\x1B\x24\x28[^OP]
266 |\x1B\x28[^B]
267 |\x0E|\x0F/x;
268 return (charset => 'iso-2022-kr')
269 unless $s =~ /\x1B[^\x24]
270 |\x1B\x24[^\x29]
271 |\x1B\x24\x29[^C]/x;
272 return (charset => 'iso-2022-jp-2')
273 unless $s =~ /\x1B[^\x24\x28\x2E\x4E]
274 |\x1B\x24[^\x40AB\x28]
275 |\x1B\x24\x28[^CD]
276 |\x1B\x28[^BJ]
277 |\x1B\x2E[^AF]
278 |\x0E|\x0F/x;
279 return (charset => 'iso-2022-cn')
280 unless $s =~ /\x1B[^\x4E\x24]
281 |\x1B\x24[^\x29\x2A]
282 |\x1B\x24\x29[^AG]
283 |\x1B\x24\x2A[^H]/x;
284 return (charset => 'iso-2022-cn-ext')
285 unless $s =~ /\x1B[^\x4E\x4F\x24]
286 |\x1B\x24[^\x29\x2A]
287 |\x1B\x24\x29[^AEG]
288 |\x1B\x24\x2A[^HIJKLM]/x;
289 return (charset => 'iso-2022-int-1')
290 unless $s =~ /\x1B[^\x24\x28\x2D]
291 |\x1B\x24[^\x40AB\x28\x29]
292 |\x1B\x24\x28[^DGH]
293 |\x1B\x24\x29[^C]
294 |\x1B\x28[^BJ]
295 |\x1B\x2D[^AF]/x;
296 return (charset => 'junet')
297 unless $s =~ /\x1B[^\x24\x28\x2C]
298 |\x1B\x24[^\x28\x2C\x40-\x42]
299 |\x1B\x24[\x28\x2C][^\x20-\x7E]
300 |\x1B\x24[\x28\x2C][\x20-\x2F]+[^\x30-\x7E]
301 |\x1B[\x28\x2C][^\x20-\x7E]
302 |\x1B[\x28\x2C][\x20-\x2F]+[^\x30-\x7E]
303 |\x0E|\x0F/x;
304 return (charset => 'x-iso-2022');
305 } else {
306 return (charset => 'us-ascii');
307 }
308 }
309
310 sub _name_net_ascii_8bit ($) {
311 my $name = shift; my $s = shift;
312 return (charset => 'us-ascii') unless $s =~ /[\x1B\x0E\x0F\x80-\xFF]/;
313 if ($s =~ /[\x80-\xFF]/) {
314 if ($s =~ /[\xC0-\xFD][\x80-\xBF]*[\x80-\x8F]/) {
315 if ($s =~ /\x1B/) {
316 return (charset => 'x-junet8'); ## junet + UTF-8
317 } else {
318 return (charset => 'utf-8');
319 }
320 } elsif ($s =~ /\x1B/) {
321 return (charset => 'x-iso-2022'); ## 8bit ISO 2022
322 } else {
323 return (charset => 'iso-8859-1');
324 }
325 } else { ## 7bit ISO 2022
326 return _name_7bit_iso2022 ($name, $s);
327 }
328 }
329
330 sub _name_8bit_iso2022 ($$) {
331 my $name = shift; my $s = shift;
332 return (charset => 'us-ascii') unless $s =~ /[\x1B\x0E\x0F\x80-\xFF]/;
333 if ($s =~ /[\x80-\xFF]/) {
334 if ($s =~ /\x1B/) {
335 return (charset => 'x-iso-2022'); ## 8bit ISO 2022
336 } else {
337 return (charset => 'iso-8859-1');
338 }
339 } else { ## 7bit ISO 2022
340 return _name_7bit_iso2022 ($name, $s);
341 }
342 }
343
344 ## Not completed.
345 ## TODO: gb18030, cn-gb-12345
346 ## TODO: _name_euc_gbf (cn-gb-12345, gb2312)
347 sub _name_euc_gb ($$) {
348 my $name = shift; my $s = shift;
349 if ($s =~ /[\x80-\xFF]/) {
350 if ($s =~ /
351 (?:\G|[\x00-\x3F\x7F\x80\xFF])
352 (?:[\xA1-\xA9\xB0-\xFE][\xA1-\xFE]
353 |[\x40-\x7E])*
354 (?:
355 [\x81-\xA0\xAA-\xAF][\x40-\xFE]
356 |[\xA1-\xFE][\x40-\xA0]
357 )
358 /x) {
359 (charset => 'gbk');
360 } elsif ($s =~ /
361 (?:\xA2[\xA1-\xAA]
362 |\xA6[\xE0-\xF5]
363 |\xA8[\xBB-\xC0]
364 )
365 (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z))
366 /x) {
367 (charset => 'gbk');
368 } elsif ($s =~ /
369 (?:\xA3\xE7|\xA7[\xDD-\xF2]
370 |\xA8[\xBB-\xC0]
371 |[\xAA-\xAF\xF8-\xFE][\xA1-\xFE]
372 )
373 (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z))
374 /x) {
375 (charset => 'cn-gb-isoir165', 'charset-edition' => 1992);
376 } elsif ($s =~ /\xEF\xF1 ## Typo bug of GB 2312
377 (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z))
378 /x) {
379 (charset => 'gb2312');
380 } else {
381 (charset => 'gb2312', 'charset-edition' => 1980);
382 }
383 } elsif ($s =~ /[\x0E\x0F]/) {
384 (charset => 'gb2312'); ## Actually, this is not "gb2312"
385 } else {
386 _name_7bit_iso2022 ($name, $s);
387 }
388 }
389
390 sub _name_euc_japan ($$) {
391 my $name = shift; my $s = shift;
392 if ($s =~ /[\x80-\xFF]/) {
393 if ($s =~ /\x8F[\xA1\xA3-\xA5\xA8\xAC-\xAF\xEE-\xFE][\xA1-\xFE]/) {
394 if ($s =~ /\x8F[\xA2\xA6\xA7\xA9-\xAB\xB0-\xED][\xA1-\xFE]/) {
395 ## JIS X 0213 plane 2 + JIS X 0212
396 (charset => 'x-euc-jisx0213-packed');
397 } else {
398 (charset => 'euc-jisx0213');
399 }
400 } elsif ($s =~ m{(?<![\x8E\x8F]) ## Not G2/G3 character
401 (?: ## JIS X 0213:2000
402 [\xA9-\xAF\xF5-\xFE][\xA1-\xFE]
403 |\xA2[\xAF-\xB9\xC2-\xC9\xD1-\xDB\xE9-\xF1\xFA-\xFD]
404 |\xA3[\xA1-\xAF\xBA-\xC0\xDB-\xE0\xFB-\xFE]
405 |\xA4[\xF4-\xFE]|\xA5[\xF7-\xFE]
406 |\xA6[\xB9-\xC0\xD9-\xFE]|\xA7[\xC2-\xD0\xF2-\xFE]
407 |\xA8[\xC1-\xFE]|\xCF[\xD4-\xFE]|\xF4[\xA7-\xFE]
408 )
409 (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z))}x) {
410 if ($s =~ /\x8F/) { ## JIS X 0213 plane 1 + JIS X 0212
411 (charset => 'x-euc-jisx0213-packed');
412 } else {
413 (charset => 'euc-jisx0213-plane1');
414 }
415 } else {
416 (charset => 'euc-jp');
417 }
418 } elsif ($s =~ /\x0E|\x0F|\x1B[\x4E\x4F]/) {
419 (charset => 'euc-jisx0213'); ## Actually, this is not euc-japan
420 } else {
421 _name_7bit_iso2022 ($name, $s);
422 }
423 }
424
425 sub _name_shift_jis ($$) {
426 my $name = shift; my $s = shift;
427 if ($s =~ /[\x80-\xFF]/) {
428 if ($s =~ /[\x0E\x0F\x1B]/) {
429 (charset => 'x-sjis');
430 } elsif ($s =~ /
431 (?:\G|[\x00-\x3F\x7F])
432 (?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]
433 |[\x40-\x7E\xA1-\xDF])*
434 [\xF0-\xFC][\x40-\x7E\x80-\xFC]
435 /x) {
436 (charset => 'shift_jisx0213');
437 } elsif ($s =~ /
438 (?:\G|[\x00-\x3F\x7F])
439 (?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]
440 |[\x40-\x7E\xA1-\xDF])*
441 (?:
442 [\x85-\x87\xEB-\xEF][\x40-\x7E\x80-\xFC]
443 |\x81[\xAD-\xB7\xC0-\xC7\xCF-\xD9\xE9-\xEF\xF8-\xFB]
444 |\x82[\x40-\x4E\x59-\x5F\x7A-\x80\x9B-\x9E\xF2-\xFC]
445 |\x83[\x97-\x9E\xB7-\xBE\xD7-\xFC]
446 |\x84[\x61-\x6F\x72-\x9E\xBF-\xFC]
447 |\x88[\x40-\x9E]|\x98[\x73-\x9E]|\xEA[\xA5-\xFC]
448 )
449 /x) {
450 (charset => 'shift_jisx0213-plane1');
451 } else {
452 (charset => 'shift_jis');
453 }
454 } elsif ($s =~ /[\x5C\x7E]/) {
455 if ($s =~ /\x1B\x0E\x0F/) {
456 (charset => 'x-sjis'); ## ISO 2022 with implied "ESC ( J"
457 ## BUG: "ESC ( B foobar\aaa ESC ( J aiueo" also matchs this
458 } else {
459 (charset => 'jis_x0201');
460 }
461 } else {
462 _name_7bit_iso2022 ($name, $s);
463 }
464 }
465
466 sub _utf8_on ($) {
467 Encode::_utf8_on ($_[0]) if $Encode::VERSION;
468 }
469 sub _utf8_off ($) {
470 Encode::_utf8_off ($_[0]) if $Encode::VERSION;
471 }
472
473 sub is_mime_text ($) {
474 my $name = lc shift;
475 if (ref $CHARSET{$name}->{mime_text} eq 'CODE') {
476 return &{ $CHARSET{$name}->{mime_text} } ($name);
477 } elsif (defined $CHARSET{$name}->{mime_text}) {
478 return $CHARSET{$name}->{mime_text};
479 } elsif (defined $_IsMimeText{$name}) {
480 return $_IsMimeText{$name};
481 } elsif (ref $CHARSET{'*undef'}->{mime_text} eq 'CODE') {
482 return &{ $CHARSET{'*undef'}->{mime_text} } ($name);
483 }
484 0;
485 }
486
487 =head1 LICENSE
488
489 Copyright 2002 wakaba E<lt>w@suika.fam.cxE<gt>.
490
491 This program is free software; you can redistribute it and/or modify
492 it under the terms of the GNU General Public License as published by
493 the Free Software Foundation; either version 2 of the License, or
494 (at your option) any later version.
495
496 This program is distributed in the hope that it will be useful,
497 but WITHOUT ANY WARRANTY; without even the implied warranty of
498 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
499 GNU General Public License for more details.
500
501 You should have received a copy of the GNU General Public License
502 along with this program; see the file COPYING. If not, write to
503 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
504 Boston, MA 02111-1307, USA.
505
506 =head1 CHANGE
507
508 See F<ChangeLog>.
509 $Date: 2002/08/18 06:22:36 $
510
511 =cut
512
513 1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24