/[suikacvs]/messaging/manakai/lib/Message/MIME/Charset.pm
Suika

Diff of /messaging/manakai/lib/Message/MIME/Charset.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.17 by wakaba, Sun Aug 18 06:22:36 2002 UTC revision 1.18 by wakaba, Sat Dec 28 09:07:05 2002 UTC
# Line 1  Line 1 
1    
2  =head1 NAME  =head1 NAME
3    
4  Message::MIME::Charset Perl module  Message::MIME::Charset --- Message-pm: Coded character sets support
5    
6  =head1 DESCRIPTION  =head1 DESCRIPTION
7    
8  Perl module for MIME charset.  This module provides some abstracted functions to handle string
9    in various character codes with (as-far-as-possiblly-) coding system
10    independent implemention.
11    
12    Note that this module is not only used to implement MIME charset mechanism
13    but also used to support non-MIME schemes of character encoding.
14    
15    This module is part of Message::* Perl Modules.
16    
17  =cut  =cut
18    
# Line 42  $CHARSET{'us-ascii'} = { Line 49  $CHARSET{'us-ascii'} = {
49                    
50          mime_text       => 1,          mime_text       => 1,
51          cte_7bit_preferred      => 'quoted-printable',          cte_7bit_preferred      => 'quoted-printable',
52            
53            divide_string   => \&_divide_string_1,
54            cte_header_preferred    => 'q',
55            
56            is_representable_in     => sub { $_[1] =~ /[^\x00-\x7F]/ ? 0 : 1 },
57  };  };
58    
59  $CHARSET{'iso-2022-int-1'} = {  $CHARSET{'iso-2022-int-1'} = {
# Line 51  $CHARSET{'iso-2022-int-1'} = { Line 63  $CHARSET{'iso-2022-int-1'} = {
63          decoder => sub { $_[1] },          decoder => sub { $_[1] },
64                    
65          mime_text       => 1,          mime_text       => 1,
66            cte_7bit_preferred      => 'quoted-printable',
67            cte_header_preferred    => 'q',
68  };  };
69    
70  $CHARSET{'unknown-8bit'} = {  $CHARSET{'unknown-8bit'} = {
# Line 61  $CHARSET{'unknown-8bit'} = { Line 75  $CHARSET{'unknown-8bit'} = {
75                    
76          mime_text       => 1,          mime_text       => 1,
77          cte_7bit_preferred      => 'base64',          cte_7bit_preferred      => 'base64',
78            
79            divide_string   => \&_divide_string_1,
80  };  };
81    
82  $CHARSET{'x-unknown'} = {  $CHARSET{'x-unknown'} = {
# Line 71  $CHARSET{'x-unknown'} = { Line 87  $CHARSET{'x-unknown'} = {
87                    
88          mime_text       => 0,          mime_text       => 0,
89          cte_7bit_preferred      => 'base64',          cte_7bit_preferred      => 'base64',
90            
91            divide_string   => \&_divide_string_1,
92  };  };
93    
94  $CHARSET{'*undef'} = {  $CHARSET{'*undef'} = {
# Line 83  $CHARSET{'*undef'} = { Line 101  $CHARSET{'*undef'} = {
101          cte_7bit_preferred      => 'base64',          cte_7bit_preferred      => 'base64',
102  };  };
103    
104    $CHARSET{'*internal'} = {
105            preferred_name  => '',
106            
107            #encoder        => sub { $_[1] },
108            #decoder        => sub { $_[1] },
109            
110            mime_text       => 0,
111            cte_7bit_preferred      => 'base64',
112    };
113    
114    $CHARSET{'*default_value'} = {  ## Dummy charset for default property value
115            perl_name       => undef,
116            preferred_name  => undef,
117            mime_text       => 0,
118            cte_7bit_preferred      => 'base64',
119            cte_header_preferred    => '*auto',
120    };
121    
122  }       # /builtin_charset  }       # /builtin_charset
123    
124  my %_MINIMUMIZER = (  my %_MINIMUMIZER = (
# Line 102  my %_MINIMUMIZER = ( Line 138  my %_MINIMUMIZER = (
138          'iso-2022-kr'   => \&_name_8bit_iso2022,          'iso-2022-kr'   => \&_name_8bit_iso2022,
139          'iso-8859-1'    => \&_name_8bit_iso2022,          'iso-8859-1'    => \&_name_8bit_iso2022,
140          jis_x0201       => \&_name_shift_jis,          jis_x0201       => \&_name_shift_jis,
141          junet   => \&_name_8bit_iso2022,          'x-iso-2022-7bit'       => \&_name_8bit_iso2022,
142          'x-junet8'      => \&_name_net_ascii_8bit,          'x-iso-2022-7bit-utf-8' => \&_name_net_ascii_8bit,
143          shift_jis       => \&_name_shift_jis,          shift_jis       => \&_name_shift_jis,
144          shift_jisx0213  => \&_name_shift_jis,          shift_jisx0213  => \&_name_shift_jis,
145          'shift_jisx0213-plane1' => \&_name_shift_jis,          'shift_jisx0213-plane1' => \&_name_shift_jis,
# Line 122  for (qw( Line 158  for (qw(
158          hp-roman8          hp-roman8
159          hz-gb-2312          hz-gb-2312
160          ibm437          ibm437
161          junet   x-junet8        x-iso-2022          x-iso-2022-7bit x-iso-2022-7bit-utf-8   x-iso-2022
162          iso-2022-cn     iso-2022-cn-ext          iso-2022-cn     iso-2022-cn-ext
163          iso-2022-int-1          iso-2022-int-1
164          iso-2022-jp     iso-2022-jp-1   iso-2022-jp-2   iso-2022-jp-3          iso-2022-jp     iso-2022-jp-1   iso-2022-jp-2   iso-2022-jp-3
# Line 219  sub name_normalize ($) { Line 255  sub name_normalize ($) {
255    $name;    $name;
256  }  }
257    
258  sub name_minimumize ($$) {  sub name_minimumize ($$;$) {
259    require Message::MIME::Charset::MinName;    require Message::MIME::Charset::MinName;
260    my ($charset, $s) = (lc shift, shift);    my ($charset, $s, $option) = (lc shift, @_);
261    if (ref $CHARSET{$charset}->{name_minimumizer} eq 'CODE') {    if (ref $CHARSET{$charset}->{name_minimumizer} eq 'CODE') {
262      return &{$CHARSET{$charset}->{name_minimumizer}} ($charset, $s);      return &{$CHARSET{$charset}->{name_minimumizer}} ($charset, $s);
263    } elsif (ref $Message::MIME::Charset::MinName::MIN{$charset}) {    } elsif (ref $Message::MIME::Charset::MinName::MIN{$charset}) {
264      return &{$Message::MIME::Charset::MinName::MIN{$charset}} ($charset, $s);      return &{$Message::MIME::Charset::MinName::MIN{$charset}} ($charset, $s, $option);
265    } elsif (ref $_MINIMUMIZER{$charset}) {    } elsif (ref $_MINIMUMIZER{$charset}) {
266      return &{$_MINIMUMIZER{$charset}} ($charset, $s);      return &{$_MINIMUMIZER{$charset}} ($charset, $s, $option);
267    } elsif (ref $CHARSET{'*undef'}->{name_minimumizer} eq 'CODE') {    } elsif (ref $CHARSET{'*undef'}->{name_minimumizer} eq 'CODE') {
268      return &{$CHARSET{'*undef'}->{name_minimumizer}} ($charset, $s);      return &{$CHARSET{'*undef'}->{name_minimumizer}} ($charset, $s, $option);
269    }    }
270    (charset => $charset);    (charset => $charset);
271  }  }
# Line 293  sub _name_7bit_iso2022 ($$) {shift; Line 329  sub _name_7bit_iso2022 ($$) {shift;
329                     |\x1B\x24\x29[^C]                     |\x1B\x24\x29[^C]
330                     |\x1B\x28[^BJ]                     |\x1B\x28[^BJ]
331                     |\x1B\x2D[^AF]/x;                     |\x1B\x2D[^AF]/x;
332      return (charset => 'junet')      return (charset => 'x-iso-2022-7bit')
333        unless $s =~ /\x1B[^\x24\x28\x2C]        unless $s =~ /\x1B[^\x24\x28\x2C]
334                     |\x1B\x24[^\x28\x2C\x40-\x42]                     |\x1B\x24[^\x28\x2C\x40-\x42]
335                     |\x1B\x24[\x28\x2C][^\x20-\x7E]                     |\x1B\x24[\x28\x2C][^\x20-\x7E]
# Line 311  sub _name_net_ascii_8bit ($) { Line 347  sub _name_net_ascii_8bit ($) {
347    my $name = shift; my $s = shift;    my $name = shift; my $s = shift;
348    return (charset => 'us-ascii') unless $s =~ /[\x1B\x0E\x0F\x80-\xFF]/;    return (charset => 'us-ascii') unless $s =~ /[\x1B\x0E\x0F\x80-\xFF]/;
349    if ($s =~ /[\x80-\xFF]/) {    if ($s =~ /[\x80-\xFF]/) {
350      if ($s =~ /[\xC0-\xFD][\x80-\xBF]*[\x80-\x8F]/) {      if ($s =~ /[\xC0-\xFD][\x80-\xBF]*[\x80-\xBF]/) {
351        if ($s =~ /\x1B/) {        if ($s =~ /\x1B/) {
352          return (charset => 'x-junet8'); ## junet + UTF-8          return (charset => 'x-iso-2022-7bit');  ## iso-2022-7bit + UTF-8
353        } else {        } else {
354          return (charset => 'utf-8');          return (charset => 'utf-8');
355        }        }
# Line 463  sub _name_shift_jis ($$) { Line 499  sub _name_shift_jis ($$) {
499    }    }
500  }  }
501    
502    eval q{require Encode};
503  sub _utf8_on ($) {  sub _utf8_on ($) {
504    Encode::_utf8_on ($_[0]) if $Encode::VERSION;    Encode::_utf8_on ($_[0]) if $Encode::VERSION;
505  }  }
# Line 484  sub is_mime_text ($) { Line 521  sub is_mime_text ($) {
521    0;    0;
522  }  }
523    
524    sub divide_string ($$;%) {
525      my ($charset, $string, %option) = @_;
526      $option{-max} ||= 70;
527      if (ref $CHARSET{$charset}->{divide_string}) {
528        return &{$CHARSET{$charset}->{divide_string}} ($charset, $string, \%option);
529      } else {
530        my @r;      ## 12 = 3*4. Most of stateless codes are 1-4 octets per char.
531        my $l = int ($option{-max} / 12) * 12;
532        for my $i (0..int (length ($string) / $l)) {
533          push @r, substr ($string, $l*$i, $l);
534        }
535        return \@r;
536      }
537    }
538    sub _divide_string_1 ($%) {
539      my (undef, $string, $option) = @_;
540      my @r;
541      for my $i (0..int (length ($string) / $option->{-max})) {
542        push @r, substr ($string, $option->{-max}*$i, $option->{-max});
543      }
544      return \@r;
545    }
546    
547    sub get_property ($$) {
548      my ($property, $charset) = @_;
549      if (defined $CHARSET{$charset}->{$property}) {
550        return $CHARSET{$charset}->{$property};
551      } else {
552        return $CHARSET{'*default_value'}->{$property};
553      }
554    }
555    
556    =head1 {charset => $charset,...} = Message::MIME::Charset::get_interchange_charset ($charset, $string, {%option})
557    
558    Get charset name (for IANA name context) for information interchange.
559    
560    =cut
561    
562    sub get_interchange_charset ($$;$) {
563      my ($charset, $string, $option) = @_;
564      
565      {charset => $charset};
566    }
567    
568    =head1 1/0 = is_representable_in ($charset, $string, {%option})
569    
570    Return whether $string (encoded in *internal charset) is able to be
571    represented in the $charset.
572    
573    Options: Currently no option argument is available.
574    
575    =cut
576    
577    sub is_representable_in ($$;$) {
578      my ($charset, $string, $option) = @_;
579      if (ref $CHARSET{$charset}->{is_representable_in}) {
580        return &{$CHARSET{$charset}->{is_representable_in}} (@_);
581      } else {
582        return 0;
583      }
584    }
585    
586  =head1 LICENSE  =head1 LICENSE
587    
588  Copyright 2002 wakaba E<lt>w@suika.fam.cxE<gt>.  Copyright 2002 Wakaba <w@suika.fam.cx>
589    
590  This program is free software; you can redistribute it and/or modify  This program is free software; you can redistribute it and/or modify
591  it under the terms of the GNU General Public License as published by  it under the terms of the GNU General Public License as published by
# Line 503  along with this program; see the file CO Line 602  along with this program; see the file CO
602  the Free Software Foundation, Inc., 59 Temple Place - Suite 330,  the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
603  Boston, MA 02111-1307, USA.  Boston, MA 02111-1307, USA.
604    
 =head1 CHANGE  
   
 See F<ChangeLog>.  
 $Date$  
   
605  =cut  =cut
606    
607  1;  1; # $Date$

Legend:
Removed from v.1.17  
changed lines
  Added in v.1.18

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24