1 |
|
|
2 |
=head1 NAME |
=head1 NAME |
3 |
|
|
4 |
Message::MIME::Charset Perl module |
Message::MIME::Charset --- Message-pm: Coded character sets support |
5 |
|
|
6 |
=head1 DESCRIPTION |
=head1 DESCRIPTION |
7 |
|
|
8 |
Perl module for MIME charset. |
This module provides some abstracted functions to handle string |
9 |
|
in various character codes with (as-far-as-possiblly-) coding system |
10 |
|
independent implemention. |
11 |
|
|
12 |
|
Note that this module is not only used to implement MIME charset mechanism |
13 |
|
but also used to support non-MIME schemes of character encoding. |
14 |
|
|
15 |
|
This module is part of Message::* Perl Modules. |
16 |
|
|
17 |
=cut |
=cut |
18 |
|
|
49 |
|
|
50 |
mime_text => 1, |
mime_text => 1, |
51 |
cte_7bit_preferred => 'quoted-printable', |
cte_7bit_preferred => 'quoted-printable', |
52 |
|
|
53 |
|
divide_string => \&_divide_string_1, |
54 |
|
cte_header_preferred => 'q', |
55 |
|
|
56 |
|
is_representable_in => sub { $_[1] =~ /[^\x00-\x7F]/ ? 0 : 1 }, |
57 |
}; |
}; |
58 |
|
|
59 |
$CHARSET{'iso-2022-int-1'} = { |
$CHARSET{'iso-2022-int-1'} = { |
63 |
decoder => sub { $_[1] }, |
decoder => sub { $_[1] }, |
64 |
|
|
65 |
mime_text => 1, |
mime_text => 1, |
66 |
|
cte_7bit_preferred => 'quoted-printable', |
67 |
|
cte_header_preferred => 'q', |
68 |
}; |
}; |
69 |
|
|
70 |
$CHARSET{'unknown-8bit'} = { |
$CHARSET{'unknown-8bit'} = { |
75 |
|
|
76 |
mime_text => 1, |
mime_text => 1, |
77 |
cte_7bit_preferred => 'base64', |
cte_7bit_preferred => 'base64', |
78 |
|
|
79 |
|
divide_string => \&_divide_string_1, |
80 |
}; |
}; |
81 |
|
|
82 |
$CHARSET{'x-unknown'} = { |
$CHARSET{'x-unknown'} = { |
87 |
|
|
88 |
mime_text => 0, |
mime_text => 0, |
89 |
cte_7bit_preferred => 'base64', |
cte_7bit_preferred => 'base64', |
90 |
|
|
91 |
|
divide_string => \&_divide_string_1, |
92 |
}; |
}; |
93 |
|
|
94 |
$CHARSET{'*undef'} = { |
$CHARSET{'*undef'} = { |
101 |
cte_7bit_preferred => 'base64', |
cte_7bit_preferred => 'base64', |
102 |
}; |
}; |
103 |
|
|
104 |
|
$CHARSET{'*internal'} = { |
105 |
|
preferred_name => '', |
106 |
|
|
107 |
|
#encoder => sub { $_[1] }, |
108 |
|
#decoder => sub { $_[1] }, |
109 |
|
|
110 |
|
mime_text => 0, |
111 |
|
cte_7bit_preferred => 'base64', |
112 |
|
}; |
113 |
|
|
114 |
|
$CHARSET{'*default_value'} = { ## Dummy charset for default property value |
115 |
|
perl_name => undef, |
116 |
|
preferred_name => undef, |
117 |
|
mime_text => 0, |
118 |
|
cte_7bit_preferred => 'base64', |
119 |
|
cte_header_preferred => '*auto', |
120 |
|
}; |
121 |
|
|
122 |
} # /builtin_charset |
} # /builtin_charset |
123 |
|
|
124 |
my %_MINIMUMIZER = ( |
my %_MINIMUMIZER = ( |
138 |
'iso-2022-kr' => \&_name_8bit_iso2022, |
'iso-2022-kr' => \&_name_8bit_iso2022, |
139 |
'iso-8859-1' => \&_name_8bit_iso2022, |
'iso-8859-1' => \&_name_8bit_iso2022, |
140 |
jis_x0201 => \&_name_shift_jis, |
jis_x0201 => \&_name_shift_jis, |
141 |
junet => \&_name_8bit_iso2022, |
'x-iso-2022-7bit' => \&_name_8bit_iso2022, |
142 |
'x-junet8' => \&_name_net_ascii_8bit, |
'x-iso-2022-7bit-utf-8' => \&_name_net_ascii_8bit, |
143 |
shift_jis => \&_name_shift_jis, |
shift_jis => \&_name_shift_jis, |
144 |
shift_jisx0213 => \&_name_shift_jis, |
shift_jisx0213 => \&_name_shift_jis, |
145 |
'shift_jisx0213-plane1' => \&_name_shift_jis, |
'shift_jisx0213-plane1' => \&_name_shift_jis, |
158 |
hp-roman8 |
hp-roman8 |
159 |
hz-gb-2312 |
hz-gb-2312 |
160 |
ibm437 |
ibm437 |
161 |
junet x-junet8 x-iso-2022 |
x-iso-2022-7bit x-iso-2022-7bit-utf-8 x-iso-2022 |
162 |
iso-2022-cn iso-2022-cn-ext |
iso-2022-cn iso-2022-cn-ext |
163 |
iso-2022-int-1 |
iso-2022-int-1 |
164 |
iso-2022-jp iso-2022-jp-1 iso-2022-jp-2 iso-2022-jp-3 |
iso-2022-jp iso-2022-jp-1 iso-2022-jp-2 iso-2022-jp-3 |
255 |
$name; |
$name; |
256 |
} |
} |
257 |
|
|
258 |
sub name_minimumize ($$) { |
sub name_minimumize ($$;$) { |
259 |
require Message::MIME::Charset::MinName; |
require Message::MIME::Charset::MinName; |
260 |
my ($charset, $s) = (lc shift, shift); |
my ($charset, $s, $option) = (lc shift, @_); |
261 |
if (ref $CHARSET{$charset}->{name_minimumizer} eq 'CODE') { |
if (ref $CHARSET{$charset}->{name_minimumizer} eq 'CODE') { |
262 |
return &{$CHARSET{$charset}->{name_minimumizer}} ($charset, $s); |
return &{$CHARSET{$charset}->{name_minimumizer}} ($charset, $s); |
263 |
} elsif (ref $Message::MIME::Charset::MinName::MIN{$charset}) { |
} elsif (ref $Message::MIME::Charset::MinName::MIN{$charset}) { |
264 |
return &{$Message::MIME::Charset::MinName::MIN{$charset}} ($charset, $s); |
return &{$Message::MIME::Charset::MinName::MIN{$charset}} ($charset, $s, $option); |
265 |
} elsif (ref $_MINIMUMIZER{$charset}) { |
} elsif (ref $_MINIMUMIZER{$charset}) { |
266 |
return &{$_MINIMUMIZER{$charset}} ($charset, $s); |
return &{$_MINIMUMIZER{$charset}} ($charset, $s, $option); |
267 |
} elsif (ref $CHARSET{'*undef'}->{name_minimumizer} eq 'CODE') { |
} elsif (ref $CHARSET{'*undef'}->{name_minimumizer} eq 'CODE') { |
268 |
return &{$CHARSET{'*undef'}->{name_minimumizer}} ($charset, $s); |
return &{$CHARSET{'*undef'}->{name_minimumizer}} ($charset, $s, $option); |
269 |
} |
} |
270 |
(charset => $charset); |
(charset => $charset); |
271 |
} |
} |
329 |
|\x1B\x24\x29[^C] |
|\x1B\x24\x29[^C] |
330 |
|\x1B\x28[^BJ] |
|\x1B\x28[^BJ] |
331 |
|\x1B\x2D[^AF]/x; |
|\x1B\x2D[^AF]/x; |
332 |
return (charset => 'junet') |
return (charset => 'x-iso-2022-7bit') |
333 |
unless $s =~ /\x1B[^\x24\x28\x2C] |
unless $s =~ /\x1B[^\x24\x28\x2C] |
334 |
|\x1B\x24[^\x28\x2C\x40-\x42] |
|\x1B\x24[^\x28\x2C\x40-\x42] |
335 |
|\x1B\x24[\x28\x2C][^\x20-\x7E] |
|\x1B\x24[\x28\x2C][^\x20-\x7E] |
347 |
my $name = shift; my $s = shift; |
my $name = shift; my $s = shift; |
348 |
return (charset => 'us-ascii') unless $s =~ /[\x1B\x0E\x0F\x80-\xFF]/; |
return (charset => 'us-ascii') unless $s =~ /[\x1B\x0E\x0F\x80-\xFF]/; |
349 |
if ($s =~ /[\x80-\xFF]/) { |
if ($s =~ /[\x80-\xFF]/) { |
350 |
if ($s =~ /[\xC0-\xFD][\x80-\xBF]*[\x80-\x8F]/) { |
if ($s =~ /[\xC0-\xFD][\x80-\xBF]*[\x80-\xBF]/) { |
351 |
if ($s =~ /\x1B/) { |
if ($s =~ /\x1B/) { |
352 |
return (charset => 'x-junet8'); ## junet + UTF-8 |
return (charset => 'x-iso-2022-7bit'); ## iso-2022-7bit + UTF-8 |
353 |
} else { |
} else { |
354 |
return (charset => 'utf-8'); |
return (charset => 'utf-8'); |
355 |
} |
} |
499 |
} |
} |
500 |
} |
} |
501 |
|
|
502 |
|
eval q{require Encode}; |
503 |
sub _utf8_on ($) { |
sub _utf8_on ($) { |
504 |
Encode::_utf8_on ($_[0]) if $Encode::VERSION; |
Encode::_utf8_on ($_[0]) if $Encode::VERSION; |
505 |
} |
} |
521 |
0; |
0; |
522 |
} |
} |
523 |
|
|
524 |
|
sub divide_string ($$;%) { |
525 |
|
my ($charset, $string, %option) = @_; |
526 |
|
$option{-max} ||= 70; |
527 |
|
if (ref $CHARSET{$charset}->{divide_string}) { |
528 |
|
return &{$CHARSET{$charset}->{divide_string}} ($charset, $string, \%option); |
529 |
|
} else { |
530 |
|
my @r; ## 12 = 3*4. Most of stateless codes are 1-4 octets per char. |
531 |
|
my $l = int ($option{-max} / 12) * 12; |
532 |
|
for my $i (0..int (length ($string) / $l)) { |
533 |
|
push @r, substr ($string, $l*$i, $l); |
534 |
|
} |
535 |
|
return \@r; |
536 |
|
} |
537 |
|
} |
538 |
|
sub _divide_string_1 ($%) { |
539 |
|
my (undef, $string, $option) = @_; |
540 |
|
my @r; |
541 |
|
for my $i (0..int (length ($string) / $option->{-max})) { |
542 |
|
push @r, substr ($string, $option->{-max}*$i, $option->{-max}); |
543 |
|
} |
544 |
|
return \@r; |
545 |
|
} |
546 |
|
|
547 |
|
sub get_property ($$) { |
548 |
|
my ($property, $charset) = @_; |
549 |
|
if (defined $CHARSET{$charset}->{$property}) { |
550 |
|
return $CHARSET{$charset}->{$property}; |
551 |
|
} else { |
552 |
|
return $CHARSET{'*default_value'}->{$property}; |
553 |
|
} |
554 |
|
} |
555 |
|
|
556 |
|
=head1 {charset => $charset,...} = Message::MIME::Charset::get_interchange_charset ($charset, $string, {%option}) |
557 |
|
|
558 |
|
Get charset name (for IANA name context) for information interchange. |
559 |
|
|
560 |
|
=cut |
561 |
|
|
562 |
|
sub get_interchange_charset ($$;$) { |
563 |
|
my ($charset, $string, $option) = @_; |
564 |
|
|
565 |
|
{charset => $charset}; |
566 |
|
} |
567 |
|
|
568 |
|
=head1 1/0 = is_representable_in ($charset, $string, {%option}) |
569 |
|
|
570 |
|
Return whether $string (encoded in *internal charset) is able to be |
571 |
|
represented in the $charset. |
572 |
|
|
573 |
|
Options: Currently no option argument is available. |
574 |
|
|
575 |
|
=cut |
576 |
|
|
577 |
|
sub is_representable_in ($$;$) { |
578 |
|
my ($charset, $string, $option) = @_; |
579 |
|
if (ref $CHARSET{$charset}->{is_representable_in}) { |
580 |
|
return &{$CHARSET{$charset}->{is_representable_in}} (@_); |
581 |
|
} else { |
582 |
|
return 0; |
583 |
|
} |
584 |
|
} |
585 |
|
|
586 |
=head1 LICENSE |
=head1 LICENSE |
587 |
|
|
588 |
Copyright 2002 wakaba E<lt>w@suika.fam.cxE<gt>. |
Copyright 2002 Wakaba <w@suika.fam.cx> |
589 |
|
|
590 |
This program is free software; you can redistribute it and/or modify |
This program is free software; you can redistribute it and/or modify |
591 |
it under the terms of the GNU General Public License as published by |
it under the terms of the GNU General Public License as published by |
602 |
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
603 |
Boston, MA 02111-1307, USA. |
Boston, MA 02111-1307, USA. |
604 |
|
|
|
=head1 CHANGE |
|
|
|
|
|
See F<ChangeLog>. |
|
|
$Date$ |
|
|
|
|
605 |
=cut |
=cut |
606 |
|
|
607 |
1; |
1; # $Date$ |