=head1 NAME Message::MIME::Charset Perl module =head1 DESCRIPTION Perl module for MIME charset. =cut ## NOTE: You should not require/use other module (even it ## is part of Message::* Perl Modules) as far as possible, ## to be able to use this module (M::M::Charset) from ## other (non-Message::*) modules. package Message::MIME::Charset; use strict; use vars qw(%CHARSET %REG $VERSION); $VERSION=do{my @r=(q$Revision: 1.10 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; &_builtin_charset; sub _builtin_charset () { $CHARSET{'*DEFAULT'} = { preferred_name => '', encoder => sub { $_[1] }, decoder => sub { $_[1] }, mime_text => 1, ## Suitability in use as MIME text/* charset #accept_cte => [qw/7bit .../], cte_7bit_preferred => 'quoted-printable', }; $CHARSET{'*default'} = $CHARSET{'*DEFAULT'}; $CHARSET{'us-ascii'} = { preferred_name => 'us-ascii', encoder => sub { $_[1] }, decoder => sub { $_[1] }, mime_text => 1, }; $CHARSET{'iso-2022-int-1'} = { preferred_name => 'iso-2022-int-1', encoder => sub { $_[1] }, decoder => sub { $_[1] }, mime_text => 1, }; $CHARSET{'unknown-8bit'} = { preferred_name => 'unknown-8bit', encoder => sub { $_[1] }, decoder => sub { $_[1] }, mime_text => 0, cte_7bit_preferred => 'base64', }; $CHARSET{'x-unknown'} = $CHARSET{'unknown-8bit'}; } # /builtin_charset my %_MINIMUMIZER = ( 'euc-jp' => \&_name_euc_japan, 'euc-jisx0213' => \&_name_euc_japan, 'euc-jisx0213-plane1' => \&_name_euc_japan, 'x-euc-jisx0213-packed' => \&_name_euc_japan, 'x-iso-2022' => \&_name_8bit_iso_2022, 'iso-2022-cn' => \&_name_8bit_iso_2022, 'iso-2022-cn-ext' => \&_name_8bit_iso_2022, 'iso-2022-int-1' => \&_name_net_ascii_8bit, 'iso-2022-jp' => \&_name_8bit_iso_2022, 'iso-2022-jp-1' => \&_name_8bit_iso_2022, 'iso-2022-jp-2' => \&_name_8bit_iso_2022, 'iso-2022-jp-3' => \&_name_8bit_iso_2022, 'iso-2022-jp-3-plane1' => \&_name_8bit_iso_2022, 'iso-2022-kr' => \&_name_8bit_iso_2022, 'iso-8859-1' => \&_name_8bit_iso_2022, 'iso-10646-j-1' => \&_name_utf16be, 'iso-10646-ucs-2' => \&_name_utf16be, 'iso-10646-ucs-4' => \&_name_utf32be, 'iso-10646-ucs-basic' => \&_name_utf16be, 'iso-10646-unicode-latin1' => \&_name_utf16be, jis_x0201 => \&_name_shift_jis, junet => \&_name_8bit_iso_2022, 'x-junet8' => \&_name_net_ascii_8bit, shift_jis => \&_name_shift_jis, shift_jisx0213 => \&_name_shift_jis, 'shift_jisx0213-plane1' => \&_name_shift_jis, 'x-sjis' => \&_name_shift_jis, 'us-ascii' => \&_name_net_ascii_8bit, 'utf-8' => \&_name_net_ascii_8bit, 'utf-16be' => \&_name_utf16be, 'utf-32be' => \&_name_utf32be, ); sub make_charset ($%) { my $name = shift; return unless $name; ## Note: charset "0" is not supported. my %definition = @_; $definition{preferred_name} ||= $name; if ($definition{preferred_name} ne $name && ref $CHARSET{$definition{preferred_name}}) { ## New charset is an alias of defined charset, $CHARSET{$name} = $CHARSET{$definition{preferred_name}}; return; } elsif ($definition{alias_of} && ref $CHARSET{$definition{alias_of}}) { ## New charset is an alias of defined charset, $CHARSET{$name} = $CHARSET{$definition{alias_of}}; return; } $CHARSET{$name} = \%definition; ## Set default values #$definition{encoder} ||= sub { $_[1] }; #$definition{decoder} ||= sub { $_[1] }; $definition{mime_text} = 0 unless defined $definition{mime_text}; $definition{cte_7bit_preferred} = 'base64' unless defined $definition{cte_7bit_preferred}; } sub encode ($$) { my ($charset, $s) = (lc shift, shift); if (ref $CHARSET{$charset}->{encoder}) { return (&{$CHARSET{$charset}->{encoder}} ($charset, $s), success => 1); } ($s, success => 0); } sub decode ($$) { my ($charset, $s) = (lc shift, shift); if (ref $CHARSET{$charset}->{decoder}) { return (&{$CHARSET{$charset}->{decoder}} ($charset, $s), 1); } ($s, 0); } sub name_normalize ($) { my $name = lc shift; $CHARSET{$name}->{preferred_name} || $name; } sub name_minimumize ($$) { my ($charset, $s) = (lc shift, shift); if (ref $CHARSET{$charset}->{name_minimumizer} eq 'CODE') { return &{$CHARSET{$charset}->{name_minimumizer}} ($charset, $s); } elsif (ref $_MINIMUMIZER{$charset}) { return &{$_MINIMUMIZER{$charset}} ($charset, $s); } (charset => $charset); } sub _name_7bit_iso2022 ($$) {shift; my $s = shift; if ($s =~ /[\x0E\x0F\x1B]/) { return (charset => 'iso-2022-jp') unless $s =~ /\x1B[^\x24\x28] |\x1B\x24[^\x40B] |\x1B\x28[^BJ] |\x0E|\x0F/x; return (charset => 'iso-2022-jp-1') unless $s =~ /\x1B[^\x24\x28] |\x1B\x24[^\x40B\x28] |\x1B\x24\x28[^D] |\x1B\x28[^BJ] |\x0E|\x0F/x; return (charset => 'iso-2022-jp-3-plane1') unless $s =~ /\x1B[^\x24\x28] |\x1B\x24[^\x28] #[^B\x28] |\x1B\x24\x28[^O] |\x1B\x28[^B] |\x0E|\x0F/x; return (charset => 'iso-2022-jp-3') unless $s =~ /\x1B[^\x24\x28] |\x1B\x24[^\x28] #[^B\x28] |\x1B\x24\x28[^OP] |\x1B\x28[^B] |\x0E|\x0F/x; return (charset => 'iso-2022-kr') unless $s =~ /\x1B[^\x24] |\x1B\x24[^\x29] |\x1B\x24\x29[^C]/x; return (charset => 'iso-2022-jp-2') unless $s =~ /\x1B[^\x24\x28\x2E\x4E] |\x1B\x24[^\x40AB\x28] |\x1B\x24\x28[^CD] |\x1B\x28[^BJ] |\x1B\x2E[^AF] |\x0E|\x0F/x; return (charset => 'iso-2022-cn') unless $s =~ /\x1B[^\x4E\x24] |\x1B\x24[^\x29\x2A] |\x1B\x24\x29[^AG] |\x1B\x24\x2A[^H]/x; return (charset => 'iso-2022-cn-ext') unless $s =~ /\x1B[^\x4E\x4F\x24] |\x1B\x24[^\x29\x2A] |\x1B\x24\x29[^AEG] |\x1B\x24\x2A[^HIJKLM]/x; return (charset => 'iso-2022-int-1') unless $s =~ /\x1B[^\x24\x28\x2D] |\x1B\x24[^\x40AB\x28\x29] |\x1B\x24\x28[^DGH] |\x1B\x24\x29[^C] |\x1B\x28[^BJ] |\x1B\x2D[^AF]/x; return (charset => 'junet') unless $s =~ /\x1B[^\x24\x28\x2C] |\x1B\x24[^\x28\x2C\x40-\x42] |\x1B\x24[\x28\x2C][^\x20-\x7E] |\x1B\x24[\x28\x2C][\x20-\x2F]+[^\x30-\x7E] |\x1B[\x28\x2C][^\x20-\x7E] |\x1B[\x28\x2C][\x20-\x2F]+[^\x30-\x7E] |\x0E|\x0F/x; return (charset => 'x-iso-2022'); } else { return (charset => 'us-ascii'); } } sub _name_net_ascii_8bit ($) { my $name = shift; my $s = shift; return (charset => 'us-ascii') unless $s =~ /[\x1B\x0E\x0F\x80-\xFF]/; if ($s =~ /[\x80-\xFF]/) { if ($s =~ /[\xC0-\xFD][\x80-\xBF]*[\x80-\x8F]/) { if ($s =~ /\x1B/) { return (charset => 'x-junet8'); ## junet + UTF-8 } else { return (charset => 'utf-8'); } } elsif ($s =~ /\x1B/) { return (charset => 'x-iso-2022'); ## 8bit ISO 2022 } else { return (charset => 'iso-8859-1'); } } else { ## 7bit ISO 2022 return _name_7bit_iso2022 ($name, $s); } } sub _name_8bit_iso_2022 ($$) { my $name = shift; my $s = shift; return (charset => 'us-ascii') unless $s =~ /[\x1B\x0E\x0F\x80-\xFF]/; if ($s =~ /[\x80-\xFF]/) { if ($s =~ /\x1B/) { return (charset => 'x-iso-2022'); ## 8bit ISO 2022 } else { return (charset => 'iso-8859-1'); } } else { ## 7bit ISO 2022 return _name_7bit_iso2022 ($name, $s); } } ## Not completed. ## TODO: gb18030, cn-gb-12345 ## TODO: _name_euc_gbf (cn-gb-12345, gb2312) sub _name_euc_gb ($$) { my $name = shift; my $s = shift; if ($s =~ /[\x80-\xFF]/) { if ($s =~ / (?:\G|[\x00-\x3F\x7F\x80\xFF]) (?:[\xA1-\xA9\xB0-\xFE][\xA1-\xFE] |[\x40-\x7E])* (?: [\x81-\xA0\xAA-\xAF][\x40-\xFE] |[\xA1-\xFE][\x40-\xA0] ) /x) { (charset => 'gbk'); } elsif ($s =~ / (?:\xA2[\xA1-\xAA] |\xA6[\xE0-\xF5] |\xA8[\xBB-\xC0] ) (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z)) /x) { (charset => 'gbk'); } elsif ($s =~ / (?:\xA3\xE7|\xA7[\xDD-\xF2] |\xA8[\xBB-\xC0] |[\xAA-\xAF\xF8-\xFE][\xA1-\xFE] ) (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z)) /x) { (charset => 'cn-gb-isoir165', 'charset-edition' => 1992); } elsif ($s =~ /\xEF\xF1 ## Typo bug of GB 2312 (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z)) /x) { (charset => 'gb2312'); } else { (charset => 'gb2312', 'charset-edition' => 1980); } } elsif ($s =~ /[\x0E\x0F]/) { (charset => 'gb2312'); ## Actually, this is not "gb2312" } else { _name_7bit_iso_2022 ($name, $s); } } sub _name_euc_japan ($$) { my $name = shift; my $s = shift; if ($s =~ /[\x80-\xFF]/) { if ($s =~ /\x8F[\xA1\xA3-\xA5\xA8\xAC-\xAF\xEE-\xFE][\xA1-\xFE]/) { if ($s =~ /\x8F[\xA2\xA6\xA7\xA9-\xAB\xB0-\xED][\xA1-\xFE]/) { ## JIS X 0213 plane 2 + JIS X 0212 (charset => 'x-euc-jisx0213-packed'); } else { (charset => 'euc-jisx0213'); } } elsif ($s =~ m{(? 'x-euc-jisx0213-packed'); } else { (charset => 'euc-jisx0213-plane1'); } } else { (charset => 'euc-jp'); } } elsif ($s =~ /\x0E|\x0F|\x1B[\x4E\x4F]/) { (charset => 'euc-jisx0213'); ## Actually, this is not euc-japan } else { _name_7bit_iso_2022 ($name, $s); } } sub _name_shift_jis ($$) { my $name = shift; my $s = shift; if ($s =~ /[\x80-\xFF]/) { if ($s =~ /[\x0E\x0F\x1B]/) { (charset => 'x-sjis'); } elsif ($s =~ / (?:\G|[\x00-\x3F\x7F]) (?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC] |[\x40-\x7E\xA1-\xDF])* [\xF0-\xFC][\x40-\x7E\x80-\xFC] /x) { (charset => 'shift_jisx0213'); } elsif ($s =~ / (?:\G|[\x00-\x3F\x7F]) (?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC] |[\x40-\x7E\xA1-\xDF])* (?: [\x85-\x87\xEB-\xEF][\x40-\x7E\x80-\xFC] |\x81[\xAD-\xB7\xC0-\xC7\xCF-\xD9\xE9-\xEF\xF8-\xFB] |\x82[\x40-\x4E\x59-\x5F\x7A-\x80\x9B-\x9E\xF2-\xFC] |\x83[\x97-\x9E\xB7-\xBE\xD7-\xFC] |\x84[\x61-\x6F\x72-\x9E\xBF-\xFC] |\x88[\x40-\x9E]|\x98[\x73-\x9E]|\xEA[\xA5-\xFC] ) /x) { (charset => 'shift_jisx0213-plane1'); } else { (charset => 'shift_jis'); } } elsif ($s =~ /[\x5C\x7E]/) { if ($s =~ /\x1B\x0E\x0F/) { (charset => 'x-sjis'); ## ISO 2022 with implied "ESC ( J" ## BUG: "ESC ( B foobar\aaa ESC ( J aiueo" also matchs this } else { (charset => 'jis_x0201'); } } else { _name_7bit_iso_2022 ($name, $s); } } sub _name_utf16be ($$) { shift; my $s = shift; if ($s =~ /[\xD8-\xDB][\x00-\xFF][\xDC-\xDF][\x00-\xFF] (?=(?:[\x00-\xFF][\x00-\xFF])*\z)/sx) { (charset => 'utf-16be'); } elsif ($s =~ /[\x01-\xFF][\x00-\xFF] (?=(?:[\x00-\xFF][\x00-\xFF])*\z)/sx) { if ($s =~ /([^\x00\x03\x04\x23\x25\x30\xFE\xFF] [\x00-\xFF] # ^\x20\x22\x4E-\x9F\xF9\xFA |\x03[^\x00-\x6F\xD0-\xFF] #|\x20[^\x00-\x6F] |\x25[^\x00-\x7F] |\xFE[^\x30-\x4F] |\xFF[^\x00-\xEF] ## note 1 of RFC 1816 is ambitious, so block entire ## is excepted |\x30[\x00-\x3F] ) (?=(?:[\x00-\xFF][\x00-\xFF])*\z)/sx) { (charset => 'iso-10646-ucs-2'); } else { (charset => 'iso-10646-j-1'); } } elsif ($s =~ /\x00[\x80-\xFF] (?=(?:[\x00-\xFF][\x00-\xFF])*\z)/sx) { (charset => 'iso-10646-unicode-latin1'); } else { (charset => 'iso-10646-ucs-basic'); } } sub _name_utf32be ($$) { shift; my $s = shift; if ($s =~ / ([\x01-\x7F][\x00-\xFF]{3} |\x00[\x11-\xFF][\x00-\xFF][\x00-\xFF]) (?=(?:[\x00-\xFF]{4})*\z)/sx) { (charset => 'iso-10646-ucs-4'); } else { (charset => 'utf-32be'); } } =head1 LICENSE Copyright 2002 wakaba Ew@suika.fam.cxE. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; see the file COPYING. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. =head1 CHANGE See F. $Date: 2002/07/17 00:33:29 $ =cut 1;