/[suikacvs]/perl/lib/Encode/SJIS.pm
Suika

Diff of /perl/lib/Encode/SJIS.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3 by wakaba, Mon Oct 14 06:58:35 2002 UTC revision 1.5 by wakaba, Mon Dec 16 10:25:01 2002 UTC
# Line 5  Encode::SJIS --- Shift JIS coding system Line 5  Encode::SJIS --- Shift JIS coding system
5    
6  =head1 ENCODINGS  =head1 ENCODINGS
7    
8  This module defines only two basic version of shift JIS.  This module defines encoding engine for Shift JIS coding systems.
9  Other variants are defined in Encode::SJIS::* modules.  This module only provides general en/decoding parts.  Actual profiles
10    for Shift JISes are included in Encode::SJIS::*.
11    
12  =over 4  =over 4
13    
# Line 19  our $VERSION=do{my @r=(q$Revision$=~/\d+ Line 20  our $VERSION=do{my @r=(q$Revision$=~/\d+
20  require Encode::Charset;  require Encode::Charset;
21  use base qw(Encode::Encoding);  use base qw(Encode::Encoding);
22    
 ### --- Perl Encode module common functions  
   
 sub encode ($$;$) {  
   my ($obj, $str, $chk) = @_;  
   $_[1] = '' if $chk;  
   if (!defined $obj->{_encode_mapping} || $obj->{_encode_mapping}) {  
     require Encode::Table;  
     $str = Encode::Table::convert ($str, $obj->__encode_map,  
       -autoload => defined $obj->{_encode_mapping_autoload} ?  
                    $obj->{_encode_mapping_autoload} : 1);  
   }  
   $str = &internal_to_sjis ($str, $obj->__2022_encode);  
   $str;  
 }  
   
 sub decode ($$;$) {  
   my ($obj, $str, $chk) = @_;  
   $_[1] = '' if $chk;  
   $str = &sjis_to_internal ($str, $obj->__2022_decode);  
   if (!defined $obj->{_decode_mapping} || $obj->{_decode_mapping}) {  
     require Encode::Table;  
     $str = Encode::Table::convert ($str, $obj->__decode_map,  
       -autoload => defined $obj->{_decode_mapping_autoload} ?  
                    $obj->{_decode_mapping_autoload} : 1);  
   }  
   $str;  
 }  
   
 ### --- Encode::SJIS unique functions  
23  *new_object = \&Encode::Charset::new_object_sjis;  *new_object = \&Encode::Charset::new_object_sjis;
24    
25    ## Code extention escape sequence defined by ISO/IEC 2022 is
26    ## not supported in this version of this module.
27    
28  sub sjis_to_internal ($$) {  sub sjis_to_internal ($$) {
29    my ($s, $C) = @_;    my ($s, $C) = @_;
30    $C ||= &new_object;    $C ||= &new_object;
# Line 103  sub internal_to_sjis ($\%) { Line 78  sub internal_to_sjis ($\%) {
78    $C ||= &new_object;    $C ||= &new_object;
79        
80    my $r = '';    my $r = '';
81    for my $c (split //, $s) {    my @c = split //, $s;
82      my $cc = ord $c;    for my $i (0..$#c) {
83        my $c = $c[$i]; my $cc = ord $c;  Encode::_utf8_off ($c);
84      my $t;      my $t;
85        ## CL = C0 control characters
86      if ($cc <= 0x1F) {      if ($cc <= 0x1F) {
87        $t = $c if $C->{ $C->{CL} } eq $Encode::Charset::CHARSET{C0}->{"\x40"};        $t = $c if $C->{ $C->{CL} } eq $Encode::Charset::CHARSET{C0}->{"\x40"};
88        ## 0x20 == SP and 0x7E == DEL
89      } elsif ($cc == 0x20 || $cc == 0x7F) {      } elsif ($cc == 0x20 || $cc == 0x7F) {
       Encode::_utf8_off ($c);  
90        $t = $c;        $t = $c;
91        ## GL = G0 = ISO/IEC 646 graphic character set
92      } elsif ($cc < 0x7F) {      } elsif ($cc < 0x7F) {
       Encode::_utf8_off ($c);  
93        $t = $c if $C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{"\x42"};        $t = $c if $C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{"\x42"};
94        ## 0x80
95      } elsif ($C->{option}->{C1invoke_to_right} && $cc == 0x80) {      } elsif ($C->{option}->{C1invoke_to_right} && $cc == 0x80) {
96        $t = "\x80"        $t = "\x80"
97          if $C->{ $C->{CR} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};          if $C->{ $C->{CR} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};
98        ## ESC Fe = C1 control characters
99      } elsif ($cc <= 0x9F) {      } elsif ($cc <= 0x9F) {
100        $t = "\x1B".pack 'C', ($cc - 0x40)        $t = "\x1B".pack 'C', ($cc - 0x40)
101          if $C->{ $C->{ESC_Fe} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};          if $C->{ $C->{ESC_Fe} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};
102            ## G1 or G3 = 94^2 graphic character set from ISO-IR
103      } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {      } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
104        my $c = $cc - 0xE9F6C0;  my $F = chr (($c / 8836)+0x30);        my $c = $cc - 0xE9F6C0;  my $F = chr (($c / 8836)+0x30);
105        if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {        if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {
# Line 130  sub internal_to_sjis ($\%) { Line 109  sub internal_to_sjis ($\%) {
109        } elsif ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {        } elsif ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {
110          my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);          my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
111          if ($C->{G3}->{Csjis_first}) {          if ($C->{G3}->{Csjis_first}) {
112            $t = pack ('CC', $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 },            my $fb = $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 };
113                       $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));            $t = pack ('CC', $fb, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) if $fb;
114          } else {          } else {
115            $t = pack ('CC', ($c / 188) + 0xF0,            $t = pack ('CC', ($c / 188) + 0xF0,
116                       $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))                       $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))
117                 if ($c / 188) + 0xF0 < 0xFD;                 if ($c / 188) + 0xF0 < 0xFD;
118          }          }
119        }        }
120        ## G1 = JIS X 0208-1990/:1997
121      } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {      } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
122        my $c = $cc - 0xF49D7C;        my $c = $cc - 0xF49D7C;
123        if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{'B@'}) {        if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{'B@'}) {
# Line 145  sub internal_to_sjis ($\%) { Line 125  sub internal_to_sjis ($\%) {
125          $t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1),          $t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1),
126                 $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));                 $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
127        }        }
128            ## GL = G0 = ISO/IEC 646 graphic character set / GR = G2 = JIS X 0201 Katakana set
129      } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {      } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
130        my $c = $cc - 0xE90940;  my $F = chr (($c / 94)+0x30);        my $c = $cc - 0xE90940;  my $F = chr (($c / 94)+0x30);
131        if ($C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {        if ($C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {
# Line 153  sub internal_to_sjis ($\%) { Line 133  sub internal_to_sjis ($\%) {
133        } elsif ($C->{ $C->{GR} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {        } elsif ($C->{ $C->{GR} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {
134          $t = pack 'C', (($c % 94) + 0xA1) if ($c % 94) < 0x3F;          $t = pack 'C', (($c % 94) + 0xA1) if ($c % 94) < 0x3F;
135        }        }
136        ## G1 / G3 = 94^2 graphic character set
137      } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {      } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
138        my $c = $cc % 0x10000;        my $c = $cc % 0x10000;
139        my $F0=$C->{option}->{private_set}->{G94n}->[($cc/0x10000)-0x7042]->[$c/8836];        my $F0=$C->{option}->{private_set}->{G94n}->[($cc/0x10000)-0x7042]->[$c/8836];
# Line 161  sub internal_to_sjis ($\%) { Line 142  sub internal_to_sjis ($\%) {
142         || $C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F1 }) {         || $C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F1 }) {
143          my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);          my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
144          if ($C->{G3}->{Csjis_first}) {          if ($C->{G3}->{Csjis_first}) {
145            $t = pack ('CC', $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 },            my $fb = $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 };
146                       $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));            $t = pack ('CC', $fb, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) if $fb;
147          } else {          } else {
148            $t = pack ('CC', ($c / 188) + 0xF0,            $t = pack ('CC', ($c / 188) + 0xF0,
149                       $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))                       $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))
150                 if ($c / 188) + 0xF0 < 0xFD;                 if ($c / 188) + 0xF0 < 0xFD;
151          }          }
152        }        }
153        ## Other character sets are not supported now (and there is no plan to implement them).
154      }      }
155            
156        ## Output the character itself
157      if (defined $t) {      if (defined $t) {
158        $r .= $t;        $r .= $t;
159        ## Output the character itself with mapping table of special code positions
160      } elsif ($C->{GsmapR}->{ $c }) {      } elsif ($C->{GsmapR}->{ $c }) {
161        $r .= $C->{GsmapR}->{ $c };        $r .= $C->{GsmapR}->{ $c };
162        } elsif ($C->{option}->{fallback_from_ucs} =~ /quiet/) {
163          return ($r, halfway => 1, converted_length => $i,
164                  warn => $C->{option}->{fallback_from_ucs} =~ /warn/ ? 1 : 0,
165                  reason => sprintf (q(U+%04X: There is no character mapped to), $cc));
166        } elsif ($C->{option}->{fallback_from_ucs} eq 'croak') {
167          return ($r, halfway => 1, die => 1,
168                  reason => sprintf (q(U+%04X: There is no character mapped to), $cc));
169        ##
170      } else {      } else {
171        $r .= $C->{option}->{undef_char_sjis} || "\x3F";        ## Try to output with fallback escape sequence (if specified)
172          my $t = Encode::Charset::fallback_escape ($C, $c);
173          if (defined $t) {
174            my %D = (fallback => $C->{option}->{fallback_from_ucs}, reset => $C->{option}->{reset});
175            $C->{option}->{fallback_from_ucs} = 'croak';
176            $C->{option}->{reset} = {Gdesignation => 0, Ginvoke => 0};
177            eval q{$t = $C->{_encoder}->_encode_internal ($t, $C)} or undef $t;
178            $C->{option}->{fallback_from_ucs} = $D{fallback};
179            $C->{option}->{reset} = $D{reset};
180          }
181          if (defined $t) {
182            $r .= $t;
183          } else {  ## Replacement character specified in charset definition
184            $r .= $C->{option}->{undef_char_sjis} || "\x3F";
185          }
186      }      }
187    }    }
188    $r;    $r;
189  }  }
190    
 sub __clone ($) {  
   my $self = shift;  
   bless {%$self}, ref $self;  
 };  
   
 __PACKAGE__->Define (qw!shift_jisx0213 japanese-shift-jisx0213  
 shift-jisx0213 x-shift_jisx0213 shift-jis-3 shift-jis-2000  
 sjis s-jis shift-jis x-sjis x_sjis x-sjis-jp shiftjis x-shiftjis  
 x-shift-jis shift.jis!);  
   
 =item sjis  
   
 "Shift JIS" coding system.  (Alias: shift-jis, shiftjis,  
 shift.jis, x-shiftjis, x-shift-jis, s-jis, x-sjis, x_sjis,  
 x-sjis-jp)  
   
 Since this name is ambiguous (it can now refer all or any  
 of shift JIS coding system family), this name should not  
 be used to address specific coding system.  In this module,  
 this is considered as an alias name to the shift JIS with  
 latest official definition, currently of JIS X 0213:2000  
 Appendix 1 (with implemention level 4).  
   
 Note that the name "Shift_JIS" is not associated with  
 this name, because IANA registry [IANAREG] assignes  
 it to a shift JIS defined by JIS X 0208:1997.  
   
 =item shift_jisx0213  
   
 Shift_JISX0213 coded representation, defined by  
 JIS X 0213:2000 Appendix 1 (implemention level 4).  
 (Alias: shift-jisx0213, x-shift_jisx0213, japanese-shift-jisx0213 (emacsen),  
 shift-jis-3 (Yudit), shift-jis-2000)  
   
 =cut  
   
 sub __2022__common ($) {  
   my $C = Encode::SJIS->new_object;  
   $C->{G0} = $Encode::Charset::CHARSET{G94}->{J};       ## JIS X 0201:1997 Latin  
   $C->{G1} = $Encode::Charset::CHARSET{G94n}->{"\x4F"}; ## JIS X 0213:2000 plane 1  
   $C->{G2} = $Encode::Charset::CHARSET{G94}->{I};       ## JIS X 0201:1997 Katakana  
   $C->{G3} = $Encode::Charset::CHARSET{G94n}->{"\x50"}; ## JIS X 0213:2000 plane 2  
   $C;  
 }  
 sub __2022_encode ($) {  
   my $C = shift->__2022__common;  
   $C;  
 }  
 sub __2022_decode ($) {  
   my $C = shift->__2022__common;  
   $C;  
 }  
 sub __encode_map ($) {  
   [qw/ucs_to_jisx0201_latin ucs_to_jisx0213_2000_1 ucs_to_jisx0213_2000_2 ucs_to_jisx0201_katakana/];  
 }  
 sub __decode_map ($) {  
   [qw/jisx0201_latin_to_ucs jisx0213_2000_1_to_ucs jisx0213_2000_2_to_ucs jisx0201_katakana_to_ucs/];  
 }  
   
 package Encode::SJIS::X0213ASCII;  
 use vars qw/@ISA/;  
 push @ISA, 'Encode::SJIS';  
 __PACKAGE__->Define (qw/shift_jisx0213-ascii shift-jis-2000-ascii  
 sjis-ascii shift-jis-ascii/);  
   
 =item sjis-ascii  
   
 Same as sjis but ASCII (ISO/IEC 646 IRV) instead of  
 JIS X 0201 Roman (or Latin) set.  (Alias: shift-jis-ascii)  
   
 In spite of the history of shift JIS, ASCII is sometimes  
 used instead of JIS X 0201 Roman set, because of compatibility  
 with ASCII world.  
   
 Note that this name is now an alias of shift_jisx0213-ascii,  
 as sjis is of shift_jisx0213.  
   
 =item shift_jisx0213-ascii  
   
 Same as Shift_JISX0213 but ASCII (ISO/IEC 646 IRV)  
 instead of JIS X 0201:1997 Latin character set.  
 (Alias: shift-jis-2000-ascii)  
   
 Note that this coding system does NOT comform to  
 JIS X 0213:2000 Appendix 1.  
   
 =cut  
   
 sub __2022__common ($) {  
   my $C = shift->SUPER::__2022__common;  
   $C->{G0} = $Encode::Charset::CHARSET{G94}->{B};       ## ASCII  
   $C;  
 }  
 sub __encode_map ($) {  
   [qw/ucs_to_ascii ucs_to_jisx0213_2000_1 ucs_to_jisx0213_2000_2 ucs_to_jisx0201_katakana/];  
 }  
 sub __decode_map ($) {  
   [qw/jisx0213_2000_1_to_ucs jisx0213_2000_2_to_ucs jisx0201_katakana_to_ucs/];  
 }  
   
 1;  
 __END__  
   
191  =back  =back
192    
193  =head1 SEE ALSO  =head1 SEE ALSO
# Line 297  JIS X 0213:2000, "7-bit and 8-bit double Line 200  JIS X 0213:2000, "7-bit and 8-bit double
200  sets for information interchange", Japan Industrial Standards  sets for information interchange", Japan Industrial Standards
201  Committee (JISC) <http://www.jisc.go.jp/>, 2000.  Committee (JISC) <http://www.jisc.go.jp/>, 2000.
202    
203  Encode, perlunicode  L<Encode::SJIS::JIS>
204    
205    L<Encode>, L<perlunicode>
206    
207  [IANAREG] "CHARACTER SETS", IANA <http://www.iana.org/>,  L<Encode::Charset>, L<Encode::ISO2022>
 <http://www.iana.org/assignments/character-sets>.  
 The charset registry for IETF <http://www.ietf.org/> standards.  
 (Note that in this registry two shift JISes are registered,  
 "Shift_JIS" and "Windows-31j".  Former is JIS X 0208:1997's  
 definition and later is the Windows standard character set.)  
208    
209  =head1 LICENSE  =head1 LICENSE
210    
211  Copyright 2002 Nanashi-san  Copyright 2002 Nanashi-san <nanashi-san@nanashi.invalid>
212    
213  This library is free software; you can redistribute it  This library is free software; you can redistribute it
214  and/or modify it under the same terms as Perl itself.  and/or modify it under the same terms as Perl itself.
215    
216  =cut  =cut
217    
218  # $Date$  1; # $Date$
 ### SJIS.pm ends here  

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.5

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24