/[suikacvs]/perl/lib/Encode/SJIS.pm
Suika

Diff of /perl/lib/Encode/SJIS.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1 by wakaba, Sat Oct 12 07:27:01 2002 UTC revision 1.6 by wakaba, Wed Dec 18 10:21:09 2002 UTC
# Line 5  Encode::SJIS --- Shift JIS coding system Line 5  Encode::SJIS --- Shift JIS coding system
5    
6  =head1 ENCODINGS  =head1 ENCODINGS
7    
8  This module defines only two basic version of shift JIS.  This module defines encoding engine for Shift JIS coding systems.
9  Other variants are defined in Encode::SJIS::* modules.  This module only provides general en/decoding parts.  Actual profiles
10    for Shift JISes are included in Encode::SJIS::*.
11    
12  =over 4  =over 4
13    
# Line 17  use 5.7.3; Line 18  use 5.7.3;
18  use strict;  use strict;
19  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
20  require Encode::Charset;  require Encode::Charset;
21    use base qw(Encode::Encoding);
22    
 ### --- Perl Encode module common functions  
   
 sub encode ($$;$) {  
   my ($obj, $str, $chk) = @_;  
   $_[1] = '' if $chk;  
   if (!defined $obj->{_encode_mapping} || $obj->{_encode_mapping}) {  
     require Encode::Table;  
     $str = Encode::Table::convert ($str, $obj->__encode_map,  
       -autoload => defined $obj->{_encode_mapping_autoload} ?  
                    $obj->{_encode_mapping_autoload} : 1);  
   }  
   $str = &internal_to_sjis ($str, $obj->__2022_encode);  
   $str;  
 }  
   
 sub decode ($$;$) {  
   my ($obj, $str, $chk) = @_;  
   $_[1] = '' if $chk;  
   $str = &sjis_to_internal ($str, $obj->__2022_decode);  
   if (!defined $obj->{_decode_mapping} || $obj->{_decode_mapping}) {  
     require Encode::Table;  
     $str = Encode::Table::convert ($str, $obj->__decode_map,  
       -autoload => defined $obj->{_decode_mapping_autoload} ?  
                    $obj->{_decode_mapping_autoload} : 1);  
   }  
   $str;  
 }  
   
 ### --- Encode::SJIS unique functions  
23  *new_object = \&Encode::Charset::new_object_sjis;  *new_object = \&Encode::Charset::new_object_sjis;
24    
25    ## Code extention escape sequence defined by ISO/IEC 2022 is
26    ## not supported in this version of this module.
27    
28  sub sjis_to_internal ($$) {  sub sjis_to_internal ($$) {
29    my ($s, $C) = @_;    my ($s, $C) = @_;
30    $C ||= &new_object;    $C ||= &new_object;
# Line 76  sub sjis_to_internal ($$) { Line 52  sub sjis_to_internal ($$) {
52          $f -= $f < 0xA0 ? 0x81 : 0xC1;  $s -= 0x40 + ($s > 0x7F);          $f -= $f < 0xA0 ? 0x81 : 0xC1;  $s -= 0x40 + ($s > 0x7F);
53          chr ($C->{G1}->{ucs} + $f * 188 + $s);          chr ($C->{G1}->{ucs} + $f * 188 + $s);
54        } else {  ## [\xF0-\xFC].        } else {  ## [\xF0-\xFC].
55          my ($f, $s) = unpack ('CC', $c2);          my ($f, $s) = (ord substr ($c2, 0, 1), ord substr ($c2, 1, 1));
56          if ($C->{G3}->{Csjis_kuE}) {          if ($C->{G3}->{Csjis_kuE}) {    ## 94^2 set with first-byte->ku mapping
57            $f = $s > 0x9E ? $C->{G3}->{Csjis_kuE}->{ $f }:            my $F = $s > 0x9E ? $C->{G3}->{Csjis_kuE}->{ $f }:    ## ku of even number
58                             $C->{G3}->{Csjis_kuO}->{ $f };                                $C->{G3}->{Csjis_kuO}->{ $f };    ## ku of odd number
59            $s -= ($s > 0x9E ? 0x9F : $s > 0x7F ? 0x41 : 0x40);            if (defined $F) {
60            chr ($C->{G3}->{ucs} + $f * 94 + $s);              $s -= ($s > 0x9E ? 0x9F : $s > 0x7F ? 0x41 : 0x40);
61          } else {              chr ($C->{G3}->{ucs} + $F * 94 + $s);
62              } else {      ## Mapping is not defined
63                $f -= 0xF0; $s -= 0x40 + ($s > 0x7F);
64                chr ($Encode::Charset::CHARSET{G94n}->{"\x20\x40"}->{ucs} + $f * 188 + $s);
65              }
66            } elsif ($C->{G3}->{Csjis_ku}) {        ## n^2 set with first-byte->ku mapping
67              if (defined $C->{G3}->{Csjis_ku}->{ $f }) {
68                $f = $C->{G3}->{Csjis_ku}->{ $f };
69                $s -= ($s > 0x9E ? 0x9F : $s > 0x7F ? 0x41 : 0x40);
70                chr ($C->{G3}->{ucs} + $f * $C->{G3}->{chars} + $s);
71              } else {      ## Mapping is not defined
72                $f -= 0xF0; $s -= 0x40 + ($s > 0x7F);
73                chr ($Encode::Charset::CHARSET{G94n}->{"\x20\x40"}->{ucs} + $f * 188 + $s);
74              }
75            } else {        ## 94^2 set without special mapping information
76            $f -= 0xF0; $s -= 0x40 + ($s > 0x7F);            $f -= 0xF0; $s -= 0x40 + ($s > 0x7F);
77            chr ($C->{G3}->{ucs} + $f * 188 + $s);            chr ($C->{G3}->{ucs} + $f * 188 + $s);
78          }          }
# Line 96  sub sjis_to_internal ($$) { Line 86  sub sjis_to_internal ($$) {
86    $s;    $s;
87  }  }
88    
89  sub __clone ($) {  sub internal_to_sjis ($\%) {
90    my $self = shift;    use integer;
91    bless {%$self}, ref $self;    my ($s, $C) = @_;
92  };    $C ||= &new_object;
93      
94  use base qw(Encode::Encoding);    my $r = '';
95  __PACKAGE__->Define (qw!shift_jisx0213 japanese-shift-jisx0213    my @c = split //, $s;
96  shift-jisx0213 x-shift_jisx0213 shift-jis-3    for my $i (0..$#c) {
97  sjis shift-jis x-sjis x_sjis x-sjis-jp shiftjis x-shiftjis      my $c = $c[$i]; my $cc = ord $c;  Encode::_utf8_off ($c);
98  x-shift-jis shift.jis!);      my $t;
99        ## CL = C0 control characters
100  =item sjis      if ($cc <= 0x1F) {
101          $t = $c if $C->{ $C->{CL} } eq $Encode::Charset::CHARSET{C0}->{"\x40"};
102  "Shift JIS" coding system.  (Alias: shift-jis, shiftjis,      ## 0x20 == SP and 0x7E == DEL
103  shift.jis, x-shiftjis, x-shift-jis, x-sjis, x_sjis,      } elsif ($cc == 0x20 || $cc == 0x7F) {
104  x-sjis-jp)        $t = $c;
105        ## GL = G0 = ISO/IEC 646 graphic character set
106  Since this name is ambiguous (it can now refer all or any      } elsif ($cc < 0x7F) {
107  of shift JIS coding system family), this name should not        $t = $c if $C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{"\x42"};
108  be used to address specific coding system.  In this module,      ## 0x80
109  this is considered as an alias name to the shift JIS with      } elsif ($C->{option}->{C1invoke_to_right} && $cc == 0x80) {
110  latest official definition, currently of JIS X 0213:2000        $t = "\x80"
111  Appendix 1 (with implemention level 4).          if $C->{ $C->{CR} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};
112        ## ESC Fe = C1 control characters
113  Note that the name "Shift_JIS" is not associated with      } elsif ($cc <= 0x9F) {
114  this name, because IANA registry [IANAREG] assignes        $t = "\x1B".pack 'C', ($cc - 0x40)
115  it to a shift JIS defined by JIS X 0208:1997.          if $C->{ $C->{ESC_Fe} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};
116        ## G1 or G3 = 94^2 graphic character set from ISO-IR
117  =item shift_jisx0213      } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
118          my $c = $cc - 0xE9F6C0;  my $F = chr (($c / 8836)+0x30);
119  Shift_JISX0213 coded representation, defined by        if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {
120  JIS X 0213:2000 Appendix 1 (implemention level 4).          my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
121  (Alias: shift-jisx0213, x-shift_jisx0213, japanese-shift-jisx0213 (emacsen),          $t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1),
122  shift-jis-3 (Yudit))                 $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
123          } elsif ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {
124  =cut          my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
125            if ($C->{G3}->{Csjis_first}) {
126  sub __2022__common ($) {            my $fb = $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 };
127    my $C = Encode::SJIS->new_object;            $t = pack ('CC', $fb, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) if $fb;
128    $C->{G0} = $Encode::Charset::CHARSET{G94}->{J};       ## JIS X 0201:1997 Latin          } else {
129    $C->{G1} = $Encode::Charset::CHARSET{G94n}->{"\x4F"}; ## JIS X 0213:2000 plane 1            $t = pack ('CC', ($c / 188) + 0xF0,
130    $C->{G2} = $Encode::Charset::CHARSET{G94}->{I};       ## JIS X 0201:1997 Katakana                       $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))
131    $C->{G3} = $Encode::Charset::CHARSET{G94n}->{"\x50"}; ## JIS X 0213:2000 plane 2                 if ($c / 188) + 0xF0 < 0xFD;
132    $C;          }
133  }        }
134  sub __2022_encode ($) {      ## G1 = JIS X 0208-1990/:1997
135    my $C = shift->__2022__common;      } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
136    $C;        my $c = $cc - 0xF49D7C;
137  }        if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{'B@'}) {
138  sub __2022_decode ($) {          my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
139    my $C = shift->__2022__common;          $t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1),
140    $C;                 $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
141  }        }
142  sub __encode_map ($) {      ## GL = G0 = ISO/IEC 646 graphic character set / GR = G2 = JIS X 0201 Katakana set
143    [qw/ucs_to_jisx0201_latin ucs_to_jisx0213_2000_1 ucs_to_jisx0213_2000_2 ucs_to_jisx0201_katakana/];      } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
144  }        my $c = $cc - 0xE90940;  my $F = chr (($c / 94)+0x30);
145  sub __decode_map ($) {        if ($C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {
146    [qw/jisx0201_latin_to_ucs jisx0213_2000_1_to_ucs jisx0213_2000_2_to_ucs jisx0201_katakana_to_ucs/];          $t = pack 'C', (($c % 94) + 0x21);
147          } elsif ($C->{ $C->{GR} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {
148            $t = pack 'C', (($c % 94) + 0xA1) if ($c % 94) < 0x3F;
149          }
150        ## G1 / G3 = 94^2 graphic character set
151        } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
152          my $c = $cc % 0x10000;
153          my $F0=$C->{option}->{private_set}->{G94n}->[($cc/0x10000)-0x7042]->[$c/8836];
154          my $F1 = 'P'.(($cc / 0x10000) - 0x7042).'_'.($c / 8836);
155          if ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F0 }
156           || $C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F1 }) {
157            my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
158            if ($C->{G3}->{Csjis_first}) {
159              my $fb = $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 };
160              $t = pack ('CC', $fb, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) if $fb;
161            } else {
162              $t = pack ('CC', ($c / 188) + 0xF0,
163                         $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))
164                   if ($c / 188) + 0xF0 < 0xFD;
165            }
166          }
167        ## Non-ISO/IEC 2022 Coded Character Sets Mapping Area
168        } elsif (0x71000000 <= $cc && $cc <= 0x71FFFFFF) {
169          if ($C->{G3}->{ucs} <= $cc) {
170            my $c = $cc - $C->{G3}->{ucs};
171            my $f = $C->{G3}->{Csjis_first}->{$c / $C->{G3}->{chars}};
172            if ($f) {
173              my $s = $c % $C->{G3}->{chars};
174              $t = pack ('CC', $f, 0x40 + $s + ($s > 62));
175            }
176          }
177        ## Other character sets are not supported now (and there is no plan to implement them).
178        }
179        
180        ## Output the character itself
181        if (defined $t) {
182          $r .= $t;
183        ## Output the character itself with mapping table of special code positions
184        } elsif ($C->{GsmapR}->{ $c }) {
185          $r .= $C->{GsmapR}->{ $c };
186        } elsif ($C->{option}->{fallback_from_ucs} =~ /quiet/) {
187          return ($r, halfway => 1, converted_length => $i,
188                  warn => $C->{option}->{fallback_from_ucs} =~ /warn/ ? 1 : 0,
189                  reason => sprintf (q(U+%04X: There is no character mapped to), $cc));
190        } elsif ($C->{option}->{fallback_from_ucs} eq 'croak') {
191          return ($r, halfway => 1, die => 1,
192                  reason => sprintf (q(U+%04X: There is no character mapped to), $cc));
193        ##
194        } else {
195          ## Try to output with fallback escape sequence (if specified)
196          my $t = Encode::Charset->fallback_escape ($C, $c);
197          if (defined $t) {
198            my %D = (fallback => $C->{option}->{fallback_from_ucs}, reset => $C->{option}->{reset});
199            $C->{option}->{fallback_from_ucs} = 'croak';
200            eval q{$t = $C->{_encoder}->_encode_internal ($t, $C)} or undef $t;
201            $C->{option}->{fallback_from_ucs} = $D{fallback};
202          }
203          if (defined $t) {
204            $r .= $t;
205          } else {  ## Replacement character specified in charset definition
206            $r .= $C->{option}->{undef_char_sjis} || "\x3F";
207          }
208        }
209      }
210      $r;
211  }  }
212    
213  package Encode::SJIS::X0213ASCII;  sub page_to_internal ($$) {
214  use vars qw/@ISA/;    my ($C, $s) = @_;
215  push @ISA, 'Encode::SJIS';    $s = pack ('U*', unpack ('C*', $s));
216  __PACKAGE__->Define (qw/shift_jisx0213-ascii sjis-ascii shift-jis-ascii/);    $s =~ s(\x1B\x24([EFGOPQ])([\x21-\x7E]+)\x0F)(
217        my $page = {qw/E 1 F 2 G 3 O 4 P 5 Q 6/}->{$1};
218  =item sjis-ascii      my $r = '';
219        for my $c (split //, $2) {
220  Same as sjis but ASCII (ISO/IEC 646 IRV) instead of        $r .= chr ($Encode::Charset::CHARSET{G94}->{'CSpictogram_page_'.$page}->{ucs} + ord ($c) - 0x21);
221  JIS X 0201 Roman (or Latin) set.  (Alias: shift-jis-ascii)      }
222        $r;
223  In spite of the history of shift JIS, ASCII is sometimes    )gex;
224  used instead of JIS X 0201 Roman set, because of compatibility    $s;
 with ASCII world.  
   
 Note that this name is now an alias of shift_jisx0213-ascii,  
 as sjis is of shift_jisx0213.  
   
 =item shift_jisx0213-ascii  
   
 Same as Shift_JISX0213 but ASCII (ISO/IEC 646 IRV)  
 instead of JIS X 0201:1997 Latin character set.  
   
 =cut  
   
 sub __2022__common ($) {  
   my $C = shift->SUPER::__2022__common;  
   $C->{G0} = $Encode::Charset::CHARSET{G94}->{B};       ## ASCII  
   $C;  
 }  
 sub __encode_map ($) {  
   [qw/ucs_to_ascii ucs_to_jisx0213_2000_1 ucs_to_jisx0213_2000_2 ucs_to_jisx0201_katakana/];  
225  }  }
226  sub __decode_map ($) {  
227    [qw/jisx0213_2000_1_to_ucs jisx0213_2000_2_to_ucs jisx0201_katakana_to_ucs/];  sub _internal_to_page ($$$%) {
228      my ($yourself, $C, $c, $option) = @_;
229      my $cc = ord $c;
230      for my $page (1..6) {
231        my $cs = $Encode::Charset::CHARSET{G94}->{'CSpictogram_page_'.$page};
232        if ($cs->{ucs} <= $cc && $cc < $cs->{ucs} + $cs->{chars} * $cs->{dimension}) {
233          return "\x1B\x24" . ([qw/_ E F G O P Q/]->[$page])
234                .pack ('C', 0x21 + $cc - $cs->{ucs}) . "\x0F";
235        }
236      }
237      ## $c is not a pictogram
238      $option->{fallback_from_ucs} = $C->{option}->{fallback_from_ucs_2};
239      $yourself->fallback_escape ($C, $c, %$option);
240  }  }
241    
242  1;  =back
 __END__  
243    
244  =head1 SEE ALSO  =head1 SEE ALSO
245    
# Line 205  JIS X 0213:2000, "7-bit and 8-bit double Line 251  JIS X 0213:2000, "7-bit and 8-bit double
251  sets for information interchange", Japan Industrial Standards  sets for information interchange", Japan Industrial Standards
252  Committee (JISC) <http://www.jisc.go.jp/>, 2000.  Committee (JISC) <http://www.jisc.go.jp/>, 2000.
253    
254  Encode, perlunicode  L<Encode::SJIS::JIS>
255    
256    L<Encode>, L<perlunicode>
257    
258  [IANAREG] "CHARACTER SETS", IANA <http://www.iana.org/>,  L<Encode::Charset>, L<Encode::ISO2022>
 <http://www.iana.org/assignments/character-sets>.  
 The charset registry for IETF <http://www.ietf.org/> standards.  
 (Note that in this registry two shift JISes are registered,  
 "Shift_JIS" and "Windows-31j".  Former is JIS X 0208:1997's  
 definition and later is the Windows standard character set.)  
259    
260  =head1 LICENSE  =head1 LICENSE
261    
262  Copyright 2002 Nanashi-san  Copyright 2002 Nanashi-san <nanashi-san@nanashi.invalid>
263    
264  This library is free software; you can redistribute it  This library is free software; you can redistribute it
265  and/or modify it under the same terms as Perl itself.  and/or modify it under the same terms as Perl itself.
266    
267  =cut  =cut
268    
269  # $Date$  1; # $Date$
 ### SJIS.pm ends here  

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.6

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24