Message/MIME/Charset.pm


=head1 NAME

Message::MIME::Charset Perl module

=head1 DESCRIPTION

Perl module for MIME charset.

=cut

## NOTE: You should not require/use other module (even it
##       is part of Message::* Perl Modules) as far as possible,
##       to be able to use this module (M::M::Charset) from
##       other (non-Message::*) modules.

package Message::MIME::Charset;
use strict;
use vars qw(%CHARSET %MSNAME2IANANAME %REG $VERSION);
$VERSION=do{my @r=(q$Revision: 1.17 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};

&_builtin_charset;
sub _builtin_charset () {

$CHARSET{'*DEFAULT'} = {
        preferred_name  => '',
        
        encoder => sub { $_[1] },
        decoder => sub { $_[1] },
        
        mime_text       => 1,   ## Suitability in use as MIME text/* charset
        #accept_cte     => [qw/7bit .../],
        cte_7bit_preferred      => 'quoted-printable',
};
$CHARSET{'*default'} = $CHARSET{'*DEFAULT'};

$CHARSET{'us-ascii'} = {
        preferred_name  => 'us-ascii',
        
        encoder => sub { $_[1] },
        decoder => sub { $_[1] },
        
        mime_text       => 1,
        cte_7bit_preferred      => 'quoted-printable',
};

$CHARSET{'iso-2022-int-1'} = {
        preferred_name  => 'iso-2022-int-1',
        
        encoder => sub { $_[1] },
        decoder => sub { $_[1] },
        
        mime_text       => 1,
};

$CHARSET{'unknown-8bit'} = {
        preferred_name  => 'unknown-8bit',
        
        encoder => sub { $_[1] },
        decoder => sub { $_[1] },
        
        mime_text       => 1,
        cte_7bit_preferred      => 'base64',
};

$CHARSET{'x-unknown'} = {
        preferred_name  => 'x-unknown',
        
        encoder => sub { $_[1] },
        decoder => sub { $_[1] },
        
        mime_text       => 0,
        cte_7bit_preferred      => 'base64',
};

$CHARSET{'*undef'} = {
        preferred_name  => '',
        
        #encoder        => sub { $_[1] },
        #decoder        => sub { $_[1] },
        
        mime_text       => 0,
        cte_7bit_preferred      => 'base64',
};

}       # /builtin_charset

my %_MINIMUMIZER = (
        'euc-jp'        => \&_name_euc_japan,
        'euc-jisx0213'  => \&_name_euc_japan,
        'euc-jisx0213-plane1'   => \&_name_euc_japan,
        'x-euc-jisx0213-packed' => \&_name_euc_japan,
        'x-iso-2022'    => \&_name_8bit_iso2022,
        'iso-2022-cn'   => \&_name_8bit_iso2022,
        'iso-2022-cn-ext'       => \&_name_8bit_iso2022,
        'iso-2022-int-1'        => \&_name_net_ascii_8bit,
        'iso-2022-jp'   => \&_name_8bit_iso2022,
        'iso-2022-jp-1' => \&_name_8bit_iso2022,
        'iso-2022-jp-2' => \&_name_8bit_iso2022,
        'iso-2022-jp-3' => \&_name_8bit_iso2022,
        'iso-2022-jp-3-plane1'  => \&_name_8bit_iso2022,
        'iso-2022-kr'   => \&_name_8bit_iso2022,
        'iso-8859-1'    => \&_name_8bit_iso2022,
        jis_x0201       => \&_name_shift_jis,
        junet   => \&_name_8bit_iso2022,
        'x-junet8'      => \&_name_net_ascii_8bit,
        shift_jis       => \&_name_shift_jis,
        shift_jisx0213  => \&_name_shift_jis,
        'shift_jisx0213-plane1' => \&_name_shift_jis,
        'x-sjis'        => \&_name_shift_jis,
        'us-ascii'      => \&_name_net_ascii_8bit,
        'utf-8' => \&_name_net_ascii_8bit,
);

my %_IsMimeText;
for (qw(
        adobe-standard-encoding adobe-symbol-encoding
        big5    big5-eten       big5-hkscs
        cp950
        gbk     gb18030
        euc-jp  euc-jisx0213    euc-kr  euc-tw
        hp-roman8
        hz-gb-2312
        ibm437
        junet   x-junet8        x-iso-2022
        iso-2022-cn     iso-2022-cn-ext
        iso-2022-int-1
        iso-2022-jp     iso-2022-jp-1   iso-2022-jp-2   iso-2022-jp-3
        x-iso2022jp-cp932
        iso-2022-kr
        iso-8859-1      iso-8859-2      iso-8859-3
        iso-8859-4      iso-8859-5      iso-8859-6
        iso-8859-7      iso-8859-8      iso-8859-9
        iso-8859-10     iso-8859-12     iso-8859-13
        iso-8859-14     iso-8859-15     iso-8859-16
        jis_encoding
        koi8-r  koi8-u
        x-mac-arabic    x-mac-centralroman      x-mac-cyrillic  x-mac-greek
        x-mac-hebrew    x-mac-icelandic macintosh       x-mac-turkish
        x-mac-ukrainian x-mac-chinesesimp       x-mac-japanese  x-mac-korean
        shift_jis       shift_jisx0213  x-sjis
        tis-620
        unicode-1-1-utf-7       unicode-1-1-utf-8
        unicode-2-0-utf-7       unicode-2-0-utf-8
        utf-7   utf-8   utf-9
        viscii
        windows-1250    windows-1251    windows-1252    windows-1253
        windows-1254    windows-1255    windows-1256    windows-1257
        windows-1258    windows-31j     windows-949
)) { $_IsMimeText{$_} = 1 }

%MSNAME2IANANAME = (
        'iso-2022-jp'   => 'x-iso2022jp-cp932',
        'ks_c_5601-1987'        => 'windows-949',
);

sub make_charset ($%) {
  my $name = shift;
  return unless $name;  ## Note: charset "0" is not supported.
  my %definition = @_;
  
  $definition{preferred_name} ||= $name;
  if ($definition{preferred_name} ne $name
      && ref $CHARSET{$definition{preferred_name}}) {
  ## New charset is an alias of defined charset,
    $CHARSET{$name} = $CHARSET{$definition{preferred_name}};
    return;
  } elsif ($definition{alias_of} && ref $CHARSET{$definition{alias_of}}) {
  ## New charset is an alias of defined charset,
    $CHARSET{$name} = $CHARSET{$definition{alias_of}};
    return;
  }
  $CHARSET{$name} = \%definition;
  
  ## Set default values
  #$definition{encoder} ||= sub { $_[1] };
  #$definition{decoder} ||= sub { $_[1] };

  $definition{mime_text} = 0 unless defined $definition{mime_text};
  $definition{cte_7bit_preferred} = 'base64'
    unless defined $definition{cte_7bit_preferred};
}

sub encode ($$) {
  my ($charset, $s) = (lc shift, shift);
  my $c = ref $CHARSET{$charset}->{encoder}? $charset: '*undef';
  if (ref $CHARSET{$c}->{encoder}) {
    my ($t, %r) = &{$CHARSET{$c}->{encoder}} ($charset, $s);
    unless (defined $r{success}) {
      $r{success} = 1;
    }
    return ($t, %r);
  }
  ($s, success => 0);
}

sub decode ($$) {
  my ($charset, $s) = (lc shift, shift);
  my $c = ref $CHARSET{$charset}->{decoder}? $charset: '*undef';
  if (ref $CHARSET{$c}->{decoder}) {
    my ($t, %r) = &{$CHARSET{$c}->{decoder}} ($charset, $s);
    unless (defined $r{success}) {
      $r{success} = 1;
    }
    return ($t, %r);
  }
  ($s, success => 0);
}

sub name_normalize ($) {
  my $name = lc shift;
  if (ref $CHARSET{$name}->{preferred_name} eq 'CODE') {
    return &{ $CHARSET{$name}->{preferred_name} } ($name);
  } elsif ($CHARSET{$name}->{preferred_name}) {
    return $CHARSET{$name}->{preferred_name};
  } elsif (ref $CHARSET{'*undef'}->{preferred_name} eq 'CODE') {
    return &{ $CHARSET{'*undef'}->{preferred_name} } ($name);
  }
  $name;
}

sub name_minimumize ($$) {
  require Message::MIME::Charset::MinName;
  my ($charset, $s) = (lc shift, shift);
  if (ref $CHARSET{$charset}->{name_minimumizer} eq 'CODE') {
    return &{$CHARSET{$charset}->{name_minimumizer}} ($charset, $s);
  } elsif (ref $Message::MIME::Charset::MinName::MIN{$charset}) {
    return &{$Message::MIME::Charset::MinName::MIN{$charset}} ($charset, $s);
  } elsif (ref $_MINIMUMIZER{$charset}) {
    return &{$_MINIMUMIZER{$charset}} ($charset, $s);
  } elsif (ref $CHARSET{'*undef'}->{name_minimumizer} eq 'CODE') {
    return &{$CHARSET{'*undef'}->{name_minimumizer}} ($charset, $s);
  }
  (charset => $charset);
}

sub msname2iananame ($) {
  my $mscharset = shift;
  $MSNAME2IANANAME{$mscharset} || $mscharset;
}

sub _name_7bit_iso2022 ($$) {shift;
  my $s = shift;
  if ($s =~ /[\x0E\x0F\x1B]/) {
    return (charset => 'iso-2022-jp')
      unless $s =~ /\x1B[^\x24\x28]
                   |\x1B\x24[^\x40B]
                   |\x1B\x28[^BJ]
                   |\x0E|\x0F/x;
    return (charset => 'iso-2022-jp-1')
      unless $s =~ /\x1B[^\x24\x28]
                   |\x1B\x24[^\x40B\x28]
                   |\x1B\x24\x28[^D]
                   |\x1B\x28[^BJ]
                   |\x0E|\x0F/x;
    return (charset => 'iso-2022-jp-3-plane1')
      unless $s =~ /\x1B[^\x24\x28]
                   |\x1B\x24[^\x28]     #[^B\x28]
                   |\x1B\x24\x28[^O]
                   |\x1B\x28[^B]
                   |\x0E|\x0F/x;
    return (charset => 'iso-2022-jp-3')
      unless $s =~ /\x1B[^\x24\x28]
                   |\x1B\x24[^\x28]     #[^B\x28]
                   |\x1B\x24\x28[^OP]
                   |\x1B\x28[^B]
                   |\x0E|\x0F/x;
    return (charset => 'iso-2022-kr')
      unless $s =~ /\x1B[^\x24]
                   |\x1B\x24[^\x29]
                   |\x1B\x24\x29[^C]/x;
    return (charset => 'iso-2022-jp-2')
      unless $s =~ /\x1B[^\x24\x28\x2E\x4E]
                   |\x1B\x24[^\x40AB\x28]
                   |\x1B\x24\x28[^CD]
                   |\x1B\x28[^BJ]
                   |\x1B\x2E[^AF]
                   |\x0E|\x0F/x;
    return (charset => 'iso-2022-cn')
      unless $s =~ /\x1B[^\x4E\x24]
                   |\x1B\x24[^\x29\x2A]
                   |\x1B\x24\x29[^AG]
                   |\x1B\x24\x2A[^H]/x;
    return (charset => 'iso-2022-cn-ext')
      unless $s =~ /\x1B[^\x4E\x4F\x24]
                   |\x1B\x24[^\x29\x2A]
                   |\x1B\x24\x29[^AEG]
                   |\x1B\x24\x2A[^HIJKLM]/x;
    return (charset => 'iso-2022-int-1')
      unless $s =~ /\x1B[^\x24\x28\x2D]
                   |\x1B\x24[^\x40AB\x28\x29]
                   |\x1B\x24\x28[^DGH]
                   |\x1B\x24\x29[^C]
                   |\x1B\x28[^BJ]
                   |\x1B\x2D[^AF]/x;
    return (charset => 'junet')
      unless $s =~ /\x1B[^\x24\x28\x2C]
                   |\x1B\x24[^\x28\x2C\x40-\x42]
                   |\x1B\x24[\x28\x2C][^\x20-\x7E]
                   |\x1B\x24[\x28\x2C][\x20-\x2F]+[^\x30-\x7E]
                   |\x1B[\x28\x2C][^\x20-\x7E]
                   |\x1B[\x28\x2C][\x20-\x2F]+[^\x30-\x7E]
                   |\x0E|\x0F/x;
    return (charset => 'x-iso-2022');
  } else {
    return (charset => 'us-ascii');
  }
}

sub _name_net_ascii_8bit ($) {
  my $name = shift; my $s = shift;
  return (charset => 'us-ascii') unless $s =~ /[\x1B\x0E\x0F\x80-\xFF]/;
  if ($s =~ /[\x80-\xFF]/) {
    if ($s =~ /[\xC0-\xFD][\x80-\xBF]*[\x80-\x8F]/) {
      if ($s =~ /\x1B/) {
        return (charset => 'x-junet8'); ## junet + UTF-8
      } else {
        return (charset => 'utf-8');
      }
    } elsif ($s =~ /\x1B/) {
      return (charset => 'x-iso-2022'); ## 8bit ISO 2022
    } else {
      return (charset => 'iso-8859-1');
    }
  } else {      ## 7bit ISO 2022
    return _name_7bit_iso2022 ($name, $s);
  }
}

sub _name_8bit_iso2022 ($$) {
  my $name = shift; my $s = shift;
  return (charset => 'us-ascii') unless $s =~ /[\x1B\x0E\x0F\x80-\xFF]/;
  if ($s =~ /[\x80-\xFF]/) {
    if ($s =~ /\x1B/) {
      return (charset => 'x-iso-2022'); ## 8bit ISO 2022
    } else {
      return (charset => 'iso-8859-1');
    }
  } else {      ## 7bit ISO 2022
    return _name_7bit_iso2022 ($name, $s);
  }
}

## Not completed.
## TODO: gb18030, cn-gb-12345
## TODO: _name_euc_gbf (cn-gb-12345, gb2312)
sub _name_euc_gb ($$) {
  my $name = shift; my $s = shift;
  if ($s =~ /[\x80-\xFF]/) {
    if ($s =~ /
                  (?:\G|[\x00-\x3F\x7F\x80\xFF])
                  (?:[\xA1-\xA9\xB0-\xFE][\xA1-\xFE]
                    |[\x40-\x7E])*
        (?:
          [\x81-\xA0\xAA-\xAF][\x40-\xFE]
         |[\xA1-\xFE][\x40-\xA0]
        )
      /x) {
      (charset => 'gbk');
    } elsif ($s =~ /
        (?:\xA2[\xA1-\xAA]
          |\xA6[\xE0-\xF5]
          |\xA8[\xBB-\xC0]
        )
          (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z))
      /x) {
      (charset => 'gbk');
    } elsif ($s =~ /
        (?:\xA3\xE7|\xA7[\xDD-\xF2]
          |\xA8[\xBB-\xC0]
          |[\xAA-\xAF\xF8-\xFE][\xA1-\xFE]
        )
          (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z))
      /x) {
      (charset => 'cn-gb-isoir165', 'charset-edition' => 1992);
    } elsif ($s =~ /\xEF\xF1    ## Typo bug of GB 2312
          (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z))
      /x) {
      (charset => 'gb2312');
    } else {
      (charset => 'gb2312', 'charset-edition' => 1980);
    }
  } elsif ($s =~ /[\x0E\x0F]/) {
    (charset => 'gb2312');      ## Actually, this is not "gb2312"
  } else {
    _name_7bit_iso2022 ($name, $s);
  }
}

sub _name_euc_japan ($$) {
  my $name = shift; my $s = shift;
  if ($s =~ /[\x80-\xFF]/) {
    if ($s =~ /\x8F[\xA1\xA3-\xA5\xA8\xAC-\xAF\xEE-\xFE][\xA1-\xFE]/) {
      if ($s =~ /\x8F[\xA2\xA6\xA7\xA9-\xAB\xB0-\xED][\xA1-\xFE]/) {
      ## JIS X 0213 plane 2 + JIS X 0212
        (charset => 'x-euc-jisx0213-packed');
      } else {
        (charset => 'euc-jisx0213');
      }
    } elsif ($s =~ m{(?<![\x8E\x8F])    ## Not G2/G3 character
                    (?: ## JIS X 0213:2000
                       [\xA9-\xAF\xF5-\xFE][\xA1-\xFE]
                      |\xA2[\xAF-\xB9\xC2-\xC9\xD1-\xDB\xE9-\xF1\xFA-\xFD]
                      |\xA3[\xA1-\xAF\xBA-\xC0\xDB-\xE0\xFB-\xFE]
                      |\xA4[\xF4-\xFE]|\xA5[\xF7-\xFE]
                      |\xA6[\xB9-\xC0\xD9-\xFE]|\xA7[\xC2-\xD0\xF2-\xFE]
                      |\xA8[\xC1-\xFE]|\xCF[\xD4-\xFE]|\xF4[\xA7-\xFE]
                    )
                    (?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]|\z))}x) {
      if ($s =~ /\x8F/) {       ## JIS X 0213 plane 1 + JIS X 0212
        (charset => 'x-euc-jisx0213-packed');
      } else {
        (charset => 'euc-jisx0213-plane1');
      }
    } else {
      (charset => 'euc-jp');
    }
  } elsif ($s =~ /\x0E|\x0F|\x1B[\x4E\x4F]/) {
    (charset => 'euc-jisx0213');        ## Actually, this is not euc-japan
  } else {
    _name_7bit_iso2022 ($name, $s);
  }
}

sub _name_shift_jis ($$) {
  my $name = shift; my $s = shift;
  if ($s =~ /[\x80-\xFF]/) {
    if ($s =~ /[\x0E\x0F\x1B]/) {
      (charset => 'x-sjis');
    } elsif ($s =~ /
                  (?:\G|[\x00-\x3F\x7F])
                  (?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]
                    |[\x40-\x7E\xA1-\xDF])*
               [\xF0-\xFC][\x40-\x7E\x80-\xFC]
      /x) {
      (charset => 'shift_jisx0213');
    } elsif ($s =~ /
                  (?:\G|[\x00-\x3F\x7F])
                  (?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]
                    |[\x40-\x7E\xA1-\xDF])*
              (?:
               [\x85-\x87\xEB-\xEF][\x40-\x7E\x80-\xFC]
              |\x81[\xAD-\xB7\xC0-\xC7\xCF-\xD9\xE9-\xEF\xF8-\xFB]
              |\x82[\x40-\x4E\x59-\x5F\x7A-\x80\x9B-\x9E\xF2-\xFC]
              |\x83[\x97-\x9E\xB7-\xBE\xD7-\xFC]
              |\x84[\x61-\x6F\x72-\x9E\xBF-\xFC]
              |\x88[\x40-\x9E]|\x98[\x73-\x9E]|\xEA[\xA5-\xFC]
              )
    /x) {
      (charset => 'shift_jisx0213-plane1');
    } else {
      (charset => 'shift_jis');
    }
  } elsif ($s =~ /[\x5C\x7E]/) {
    if ($s =~ /\x1B\x0E\x0F/) {
      (charset => 'x-sjis');    ## ISO 2022 with implied "ESC ( J"
        ## BUG: "ESC ( B foobar\aaa ESC ( J aiueo" also matchs this
    } else {
      (charset => 'jis_x0201');
    }
  } else {
    _name_7bit_iso2022 ($name, $s);
  }
}

sub _utf8_on ($) {
  Encode::_utf8_on ($_[0]) if $Encode::VERSION;
}
sub _utf8_off ($) {
  Encode::_utf8_off ($_[0]) if $Encode::VERSION;
}

sub is_mime_text ($) {
  my $name = lc shift;
  if (ref $CHARSET{$name}->{mime_text} eq 'CODE') {
    return &{ $CHARSET{$name}->{mime_text} } ($name);
  } elsif (defined $CHARSET{$name}->{mime_text}) {
    return $CHARSET{$name}->{mime_text};
  } elsif (defined $_IsMimeText{$name}) {
    return $_IsMimeText{$name};
  } elsif (ref $CHARSET{'*undef'}->{mime_text} eq 'CODE') {
    return &{ $CHARSET{'*undef'}->{mime_text} } ($name);
  }
  0;
}

=head1 LICENSE

Copyright 2002 wakaba E<lt>w@suika.fam.cxE<gt>.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.

=head1 CHANGE

See F<ChangeLog>.
$Date: 2002/08/18 06:22:36 $

=cut

1;
1
2	=head1 NAME
3
4	Message::MIME::Charset Perl module
5
6	=head1 DESCRIPTION
7
8	Perl module for MIME charset.
9
10	=cut
11
12	## NOTE: You should not require/use other module (even it
13	## is part of Message::* Perl Modules) as far as possible,
14	## to be able to use this module (M::M::Charset) from
15	## other (non-Message::*) modules.
16
17	package Message::MIME::Charset;
18	use strict;
19	use vars qw(%CHARSET %MSNAME2IANANAME %REG $VERSION);
20	$VERSION=do{my @r=(q$Revision: 1.17 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
21
22	&_builtin_charset;
23	sub _builtin_charset () {
24
25	$CHARSET{'*DEFAULT'} = {
26	preferred_name => '',
27
28	encoder => sub { $_[1] },
29	decoder => sub { $_[1] },
30
31	mime_text => 1, ## Suitability in use as MIME text/* charset
32	#accept_cte => [qw/7bit .../],
33	cte_7bit_preferred => 'quoted-printable',
34	};
35	$CHARSET{'default'} = $CHARSET{'DEFAULT'};
36
37	$CHARSET{'us-ascii'} = {
38	preferred_name => 'us-ascii',
39
40	encoder => sub { $_[1] },
41	decoder => sub { $_[1] },
42
43	mime_text => 1,
44	cte_7bit_preferred => 'quoted-printable',
45	};
46
47	$CHARSET{'iso-2022-int-1'} = {
48	preferred_name => 'iso-2022-int-1',
49
50	encoder => sub { $_[1] },
51	decoder => sub { $_[1] },
52
53	mime_text => 1,
54	};
55
56	$CHARSET{'unknown-8bit'} = {
57	preferred_name => 'unknown-8bit',
58
59	encoder => sub { $_[1] },
60	decoder => sub { $_[1] },
61
62	mime_text => 1,
63	cte_7bit_preferred => 'base64',
64	};
65
66	$CHARSET{'x-unknown'} = {
67	preferred_name => 'x-unknown',
68
69	encoder => sub { $_[1] },
70	decoder => sub { $_[1] },
71
72	mime_text => 0,
73	cte_7bit_preferred => 'base64',
74	};
75
76	$CHARSET{'*undef'} = {
77	preferred_name => '',
78
79	#encoder => sub { $_[1] },
80	#decoder => sub { $_[1] },
81
82	mime_text => 0,
83	cte_7bit_preferred => 'base64',
84	};
85
86	} # /builtin_charset
87
88	my %_MINIMUMIZER = (
89	'euc-jp' => \&_name_euc_japan,
90	'euc-jisx0213' => \&_name_euc_japan,
91	'euc-jisx0213-plane1' => \&_name_euc_japan,
92	'x-euc-jisx0213-packed' => \&_name_euc_japan,
93	'x-iso-2022' => \&_name_8bit_iso2022,
94	'iso-2022-cn' => \&_name_8bit_iso2022,
95	'iso-2022-cn-ext' => \&_name_8bit_iso2022,
96	'iso-2022-int-1' => \&_name_net_ascii_8bit,
97	'iso-2022-jp' => \&_name_8bit_iso2022,
98	'iso-2022-jp-1' => \&_name_8bit_iso2022,
99	'iso-2022-jp-2' => \&_name_8bit_iso2022,
100	'iso-2022-jp-3' => \&_name_8bit_iso2022,
101	'iso-2022-jp-3-plane1' => \&_name_8bit_iso2022,
102	'iso-2022-kr' => \&_name_8bit_iso2022,
103	'iso-8859-1' => \&_name_8bit_iso2022,
104	jis_x0201 => \&_name_shift_jis,
105	junet => \&_name_8bit_iso2022,
106	'x-junet8' => \&_name_net_ascii_8bit,
107	shift_jis => \&_name_shift_jis,
108	shift_jisx0213 => \&_name_shift_jis,
109	'shift_jisx0213-plane1' => \&_name_shift_jis,
110	'x-sjis' => \&_name_shift_jis,
111	'us-ascii' => \&_name_net_ascii_8bit,
112	'utf-8' => \&_name_net_ascii_8bit,
113	);
114
115	my %_IsMimeText;
116	for (qw(
117	adobe-standard-encoding adobe-symbol-encoding
118	big5 big5-eten big5-hkscs
119	cp950
120	gbk gb18030
121	euc-jp euc-jisx0213 euc-kr euc-tw
122	hp-roman8
123	hz-gb-2312
124	ibm437
125	junet x-junet8 x-iso-2022
126	iso-2022-cn iso-2022-cn-ext
127	iso-2022-int-1
128	iso-2022-jp iso-2022-jp-1 iso-2022-jp-2 iso-2022-jp-3
129	x-iso2022jp-cp932
130	iso-2022-kr
131	iso-8859-1 iso-8859-2 iso-8859-3
132	iso-8859-4 iso-8859-5 iso-8859-6
133	iso-8859-7 iso-8859-8 iso-8859-9
134	iso-8859-10 iso-8859-12 iso-8859-13
135	iso-8859-14 iso-8859-15 iso-8859-16
136	jis_encoding
137	koi8-r koi8-u
138	x-mac-arabic x-mac-centralroman x-mac-cyrillic x-mac-greek
139	x-mac-hebrew x-mac-icelandic macintosh x-mac-turkish
140	x-mac-ukrainian x-mac-chinesesimp x-mac-japanese x-mac-korean
141	shift_jis shift_jisx0213 x-sjis
142	tis-620
143	unicode-1-1-utf-7 unicode-1-1-utf-8
144	unicode-2-0-utf-7 unicode-2-0-utf-8
145	utf-7 utf-8 utf-9
146	viscii
147	windows-1250 windows-1251 windows-1252 windows-1253
148	windows-1254 windows-1255 windows-1256 windows-1257
149	windows-1258 windows-31j windows-949
150	)) { $_IsMimeText{$_} = 1 }
151
152	%MSNAME2IANANAME = (
153	'iso-2022-jp' => 'x-iso2022jp-cp932',
154	'ks_c_5601-1987' => 'windows-949',
155	);
156
157	sub make_charset ($%) {
158	my $name = shift;
159	return unless $name; ## Note: charset "0" is not supported.
160	my %definition = @_;
161
162	$definition{preferred_name} \|\|= $name;
163	if ($definition{preferred_name} ne $name
164	&& ref $CHARSET{$definition{preferred_name}}) {
165	## New charset is an alias of defined charset,
166	$CHARSET{$name} = $CHARSET{$definition{preferred_name}};
167	return;
168	} elsif ($definition{alias_of} && ref $CHARSET{$definition{alias_of}}) {
169	## New charset is an alias of defined charset,
170	$CHARSET{$name} = $CHARSET{$definition{alias_of}};
171	return;
172	}
173	$CHARSET{$name} = \%definition;
174
175	## Set default values
176	#$definition{encoder} \|\|= sub { $_[1] };
177	#$definition{decoder} \|\|= sub { $_[1] };
178
179	$definition{mime_text} = 0 unless defined $definition{mime_text};
180	$definition{cte_7bit_preferred} = 'base64'
181	unless defined $definition{cte_7bit_preferred};
182	}
183
184	sub encode ($$) {
185	my ($charset, $s) = (lc shift, shift);
186	my $c = ref $CHARSET{$charset}->{encoder}? $charset: '*undef';
187	if (ref $CHARSET{$c}->{encoder}) {
188	my ($t, %r) = &{$CHARSET{$c}->{encoder}} ($charset, $s);
189	unless (defined $r{success}) {
190	$r{success} = 1;
191	}
192	return ($t, %r);
193	}
194	($s, success => 0);
195	}
196
197	sub decode ($$) {
198	my ($charset, $s) = (lc shift, shift);
199	my $c = ref $CHARSET{$charset}->{decoder}? $charset: '*undef';
200	if (ref $CHARSET{$c}->{decoder}) {
201	my ($t, %r) = &{$CHARSET{$c}->{decoder}} ($charset, $s);
202	unless (defined $r{success}) {
203	$r{success} = 1;
204	}
205	return ($t, %r);
206	}
207	($s, success => 0);
208	}
209
210	sub name_normalize ($) {
211	my $name = lc shift;
212	if (ref $CHARSET{$name}->{preferred_name} eq 'CODE') {
213	return &{ $CHARSET{$name}->{preferred_name} } ($name);
214	} elsif ($CHARSET{$name}->{preferred_name}) {
215	return $CHARSET{$name}->{preferred_name};
216	} elsif (ref $CHARSET{'*undef'}->{preferred_name} eq 'CODE') {
217	return &{ $CHARSET{'*undef'}->{preferred_name} } ($name);
218	}
219	$name;
220	}
221
222	sub name_minimumize ($$) {
223	require Message::MIME::Charset::MinName;
224	my ($charset, $s) = (lc shift, shift);
225	if (ref $CHARSET{$charset}->{name_minimumizer} eq 'CODE') {
226	return &{$CHARSET{$charset}->{name_minimumizer}} ($charset, $s);
227	} elsif (ref $Message::MIME::Charset::MinName::MIN{$charset}) {
228	return &{$Message::MIME::Charset::MinName::MIN{$charset}} ($charset, $s);
229	} elsif (ref $_MINIMUMIZER{$charset}) {
230	return &{$_MINIMUMIZER{$charset}} ($charset, $s);
231	} elsif (ref $CHARSET{'*undef'}->{name_minimumizer} eq 'CODE') {
232	return &{$CHARSET{'*undef'}->{name_minimumizer}} ($charset, $s);
233	}
234	(charset => $charset);
235	}
236
237	sub msname2iananame ($) {
238	my $mscharset = shift;
239	$MSNAME2IANANAME{$mscharset} \|\| $mscharset;
240	}
241
242	sub _name_7bit_iso2022 ($$) {shift;
243	my $s = shift;
244	if ($s =~ /[\x0E\x0F\x1B]/) {
245	return (charset => 'iso-2022-jp')
246	unless $s =~ /\x1B[^\x24\x28]
247	\|\x1B\x24[^\x40B]
248	\|\x1B\x28[^BJ]
249	\|\x0E\|\x0F/x;
250	return (charset => 'iso-2022-jp-1')
251	unless $s =~ /\x1B[^\x24\x28]
252	\|\x1B\x24[^\x40B\x28]
253	\|\x1B\x24\x28[^D]
254	\|\x1B\x28[^BJ]
255	\|\x0E\|\x0F/x;
256	return (charset => 'iso-2022-jp-3-plane1')
257	unless $s =~ /\x1B[^\x24\x28]
258	\|\x1B\x24[^\x28] #[^B\x28]
259	\|\x1B\x24\x28[^O]
260	\|\x1B\x28[^B]
261	\|\x0E\|\x0F/x;
262	return (charset => 'iso-2022-jp-3')
263	unless $s =~ /\x1B[^\x24\x28]
264	\|\x1B\x24[^\x28] #[^B\x28]
265	\|\x1B\x24\x28[^OP]
266	\|\x1B\x28[^B]
267	\|\x0E\|\x0F/x;
268	return (charset => 'iso-2022-kr')
269	unless $s =~ /\x1B[^\x24]
270	\|\x1B\x24[^\x29]
271	\|\x1B\x24\x29[^C]/x;
272	return (charset => 'iso-2022-jp-2')
273	unless $s =~ /\x1B[^\x24\x28\x2E\x4E]
274	\|\x1B\x24[^\x40AB\x28]
275	\|\x1B\x24\x28[^CD]
276	\|\x1B\x28[^BJ]
277	\|\x1B\x2E[^AF]
278	\|\x0E\|\x0F/x;
279	return (charset => 'iso-2022-cn')
280	unless $s =~ /\x1B[^\x4E\x24]
281	\|\x1B\x24[^\x29\x2A]
282	\|\x1B\x24\x29[^AG]
283	\|\x1B\x24\x2A[^H]/x;
284	return (charset => 'iso-2022-cn-ext')
285	unless $s =~ /\x1B[^\x4E\x4F\x24]
286	\|\x1B\x24[^\x29\x2A]
287	\|\x1B\x24\x29[^AEG]
288	\|\x1B\x24\x2A[^HIJKLM]/x;
289	return (charset => 'iso-2022-int-1')
290	unless $s =~ /\x1B[^\x24\x28\x2D]
291	\|\x1B\x24[^\x40AB\x28\x29]
292	\|\x1B\x24\x28[^DGH]
293	\|\x1B\x24\x29[^C]
294	\|\x1B\x28[^BJ]
295	\|\x1B\x2D[^AF]/x;
296	return (charset => 'junet')
297	unless $s =~ /\x1B[^\x24\x28\x2C]
298	\|\x1B\x24[^\x28\x2C\x40-\x42]
299	\|\x1B\x24[\x28\x2C][^\x20-\x7E]
300	\|\x1B\x24[\x28\x2C][\x20-\x2F]+[^\x30-\x7E]
301	\|\x1B[\x28\x2C][^\x20-\x7E]
302	\|\x1B[\x28\x2C][\x20-\x2F]+[^\x30-\x7E]
303	\|\x0E\|\x0F/x;
304	return (charset => 'x-iso-2022');
305	} else {
306	return (charset => 'us-ascii');
307	}
308	}
309
310	sub _name_net_ascii_8bit ($) {
311	my $name = shift; my $s = shift;
312	return (charset => 'us-ascii') unless $s =~ /[\x1B\x0E\x0F\x80-\xFF]/;
313	if ($s =~ /[\x80-\xFF]/) {
314	if ($s =~ /[\xC0-\xFD][\x80-\xBF]*[\x80-\x8F]/) {
315	if ($s =~ /\x1B/) {
316	return (charset => 'x-junet8'); ## junet + UTF-8
317	} else {
318	return (charset => 'utf-8');
319	}
320	} elsif ($s =~ /\x1B/) {
321	return (charset => 'x-iso-2022'); ## 8bit ISO 2022
322	} else {
323	return (charset => 'iso-8859-1');
324	}
325	} else { ## 7bit ISO 2022
326	return _name_7bit_iso2022 ($name, $s);
327	}
328	}
329
330	sub _name_8bit_iso2022 ($$) {
331	my $name = shift; my $s = shift;
332	return (charset => 'us-ascii') unless $s =~ /[\x1B\x0E\x0F\x80-\xFF]/;
333	if ($s =~ /[\x80-\xFF]/) {
334	if ($s =~ /\x1B/) {
335	return (charset => 'x-iso-2022'); ## 8bit ISO 2022
336	} else {
337	return (charset => 'iso-8859-1');
338	}
339	} else { ## 7bit ISO 2022
340	return _name_7bit_iso2022 ($name, $s);
341	}
342	}
343
344	## Not completed.
345	## TODO: gb18030, cn-gb-12345
346	## TODO: _name_euc_gbf (cn-gb-12345, gb2312)
347	sub _name_euc_gb ($$) {
348	my $name = shift; my $s = shift;
349	if ($s =~ /[\x80-\xFF]/) {
350	if ($s =~ /
351	(?:\G\|[\x00-\x3F\x7F\x80\xFF])
352	(?:[\xA1-\xA9\xB0-\xFE][\xA1-\xFE]
353	\|[\x40-\x7E])*
354	(?:
355	[\x81-\xA0\xAA-\xAF][\x40-\xFE]
356	\|[\xA1-\xFE][\x40-\xA0]
357	)
358	/x) {
359	(charset => 'gbk');
360	} elsif ($s =~ /
361	(?:\xA2[\xA1-\xAA]
362	\|\xA6[\xE0-\xF5]
363	\|\xA8[\xBB-\xC0]
364	)
365	(?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]\|\z))
366	/x) {
367	(charset => 'gbk');
368	} elsif ($s =~ /
369	(?:\xA3\xE7\|\xA7[\xDD-\xF2]
370	\|\xA8[\xBB-\xC0]
371	\|[\xAA-\xAF\xF8-\xFE][\xA1-\xFE]
372	)
373	(?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]\|\z))
374	/x) {
375	(charset => 'cn-gb-isoir165', 'charset-edition' => 1992);
376	} elsif ($s =~ /\xEF\xF1 ## Typo bug of GB 2312
377	(?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]\|\z))
378	/x) {
379	(charset => 'gb2312');
380	} else {
381	(charset => 'gb2312', 'charset-edition' => 1980);
382	}
383	} elsif ($s =~ /[\x0E\x0F]/) {
384	(charset => 'gb2312'); ## Actually, this is not "gb2312"
385	} else {
386	_name_7bit_iso2022 ($name, $s);
387	}
388	}
389
390	sub _name_euc_japan ($$) {
391	my $name = shift; my $s = shift;
392	if ($s =~ /[\x80-\xFF]/) {
393	if ($s =~ /\x8F[\xA1\xA3-\xA5\xA8\xAC-\xAF\xEE-\xFE][\xA1-\xFE]/) {
394	if ($s =~ /\x8F[\xA2\xA6\xA7\xA9-\xAB\xB0-\xED][\xA1-\xFE]/) {
395	## JIS X 0213 plane 2 + JIS X 0212
396	(charset => 'x-euc-jisx0213-packed');
397	} else {
398	(charset => 'euc-jisx0213');
399	}
400	} elsif ($s =~ m{(?<![\x8E\x8F]) ## Not G2/G3 character
401	(?: ## JIS X 0213:2000
402	[\xA9-\xAF\xF5-\xFE][\xA1-\xFE]
403	\|\xA2[\xAF-\xB9\xC2-\xC9\xD1-\xDB\xE9-\xF1\xFA-\xFD]
404	\|\xA3[\xA1-\xAF\xBA-\xC0\xDB-\xE0\xFB-\xFE]
405	\|\xA4[\xF4-\xFE]\|\xA5[\xF7-\xFE]
406	\|\xA6[\xB9-\xC0\xD9-\xFE]\|\xA7[\xC2-\xD0\xF2-\xFE]
407	\|\xA8[\xC1-\xFE]\|\xCF[\xD4-\xFE]\|\xF4[\xA7-\xFE]
408	)
409	(?=(?:[\xA1-\xFE][\xA1-\xFE])*(?:[\x00-\xA0\xFF]\|\z))}x) {
410	if ($s =~ /\x8F/) { ## JIS X 0213 plane 1 + JIS X 0212
411	(charset => 'x-euc-jisx0213-packed');
412	} else {
413	(charset => 'euc-jisx0213-plane1');
414	}
415	} else {
416	(charset => 'euc-jp');
417	}
418	} elsif ($s =~ /\x0E\|\x0F\|\x1B[\x4E\x4F]/) {
419	(charset => 'euc-jisx0213'); ## Actually, this is not euc-japan
420	} else {
421	_name_7bit_iso2022 ($name, $s);
422	}
423	}
424
425	sub _name_shift_jis ($$) {
426	my $name = shift; my $s = shift;
427	if ($s =~ /[\x80-\xFF]/) {
428	if ($s =~ /[\x0E\x0F\x1B]/) {
429	(charset => 'x-sjis');
430	} elsif ($s =~ /
431	(?:\G\|[\x00-\x3F\x7F])
432	(?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]
433	\|[\x40-\x7E\xA1-\xDF])*
434	[\xF0-\xFC][\x40-\x7E\x80-\xFC]
435	/x) {
436	(charset => 'shift_jisx0213');
437	} elsif ($s =~ /
438	(?:\G\|[\x00-\x3F\x7F])
439	(?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]
440	\|[\x40-\x7E\xA1-\xDF])*
441	(?:
442	[\x85-\x87\xEB-\xEF][\x40-\x7E\x80-\xFC]
443	\|\x81[\xAD-\xB7\xC0-\xC7\xCF-\xD9\xE9-\xEF\xF8-\xFB]
444	\|\x82[\x40-\x4E\x59-\x5F\x7A-\x80\x9B-\x9E\xF2-\xFC]
445	\|\x83[\x97-\x9E\xB7-\xBE\xD7-\xFC]
446	\|\x84[\x61-\x6F\x72-\x9E\xBF-\xFC]
447	\|\x88[\x40-\x9E]\|\x98[\x73-\x9E]\|\xEA[\xA5-\xFC]
448	)
449	/x) {
450	(charset => 'shift_jisx0213-plane1');
451	} else {
452	(charset => 'shift_jis');
453	}
454	} elsif ($s =~ /[\x5C\x7E]/) {
455	if ($s =~ /\x1B\x0E\x0F/) {
456	(charset => 'x-sjis'); ## ISO 2022 with implied "ESC ( J"
457	## BUG: "ESC ( B foobar\aaa ESC ( J aiueo" also matchs this
458	} else {
459	(charset => 'jis_x0201');
460	}
461	} else {
462	_name_7bit_iso2022 ($name, $s);
463	}
464	}
465
466	sub _utf8_on ($) {
467	Encode::_utf8_on ($_[0]) if $Encode::VERSION;
468	}
469	sub _utf8_off ($) {
470	Encode::_utf8_off ($_[0]) if $Encode::VERSION;
471	}
472
473	sub is_mime_text ($) {
474	my $name = lc shift;
475	if (ref $CHARSET{$name}->{mime_text} eq 'CODE') {
476	return &{ $CHARSET{$name}->{mime_text} } ($name);
477	} elsif (defined $CHARSET{$name}->{mime_text}) {
478	return $CHARSET{$name}->{mime_text};
479	} elsif (defined $_IsMimeText{$name}) {
480	return $_IsMimeText{$name};
481	} elsif (ref $CHARSET{'*undef'}->{mime_text} eq 'CODE') {
482	return &{ $CHARSET{'*undef'}->{mime_text} } ($name);
483	}
484	0;
485	}
486
487	=head1 LICENSE
488
489	Copyright 2002 wakaba E<lt>w@suika.fam.cxE<gt>.
490
491	This program is free software; you can redistribute it and/or modify
492	it under the terms of the GNU General Public License as published by
493	the Free Software Foundation; either version 2 of the License, or
494	(at your option) any later version.
495
496	This program is distributed in the hope that it will be useful,
497	but WITHOUT ANY WARRANTY; without even the implied warranty of
498	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
499	GNU General Public License for more details.
500
501	You should have received a copy of the GNU General Public License
502	along with this program; see the file COPYING. If not, write to
503	the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
504	Boston, MA 02111-1307, USA.
505
506	=head1 CHANGE
507
508	See F<ChangeLog>.
509	$Date: 2002/08/18 06:22:36 $
510
511	=cut
512
513	1;