/[suikacvs]/perl/lib/Encode/SJIS.pm

Diff of /perl/lib/Encode/SJIS.pm

Parent Directory | Revision Log | View Patch Patch

-revision 1.4 by wakaba,
Thu Dec 12 08:17:16 2002 UTC
+revision 1.6 by wakaba,
Wed Dec 18 10:21:09 2002 UTC
 Line 5 
 Encode::SJIS --- Shift JIS coding system
  =head1 ENCODINGS
- This module defines only two basic versions of shift JIS.
+ This module defines an encoding engine for Shift JIS coding systems.
- Other variants are defined in Encode::SJIS::* modules.
+ This module only provides general en/decoding parts.  Actual profiles
+ for Shift JISes are included in Encode::SJIS::*.
  =over 4
-Line 19 
 our $VERSION=do{my @r=(q$Revision$=~/\d+
+Line 20 
 our $VERSION=do{my @r=(q$Revision$=~/\d+
  require Encode::Charset;
  use base qw(Encode::Encoding);
- ### --- Perl Encode module common functions
- sub encode ($$;$) {
-   my ($obj, $str, $chk) = @_;
-   $_[1] = '' if $chk;
-   if (!defined $obj->{_encode_mapping} || $obj->{_encode_mapping}) {
-     require Encode::Table;
-     $str = Encode::Table::convert ($str, $obj->__encode_map,
-       -autoload => defined $obj->{_encode_mapping_autoload} ?
-                    $obj->{_encode_mapping_autoload} : 1);
-   }
-   $str = &internal_to_sjis ($str, $obj->__2022_encode);
-   $str;
- }
- sub decode ($$;$) {
-   my ($obj, $str, $chk) = @_;
-   $_[1] = '' if $chk;
-   $str = &sjis_to_internal ($str, $obj->__2022_decode);
-   if (!defined $obj->{_decode_mapping} || $obj->{_decode_mapping}) {
-     require Encode::Table;
-     $str = Encode::Table::convert ($str, $obj->__decode_map,
-       -autoload => defined $obj->{_decode_mapping_autoload} ?
-                    $obj->{_decode_mapping_autoload} : 1);
-   }
-   $str;
- }
- ### --- Encode::SJIS unique functions
  *new_object = \&Encode::Charset::new_object_sjis;
+ ## Code extension escape sequences defined by ISO/IEC 2022 are
+ ## not supported in this version of this module.
  sub sjis_to_internal ($$) {
    my ($s, $C) = @_;
    $C ||= &new_object;
-Line 77 
 sub sjis_to_internal ($$) {
+Line 52 
 sub sjis_to_internal ($$) {
          $f -= $f < 0xA0 ? 0x81 : 0xC1;  $s -= 0x40 + ($s > 0x7F);
          chr ($C->{G1}->{ucs} + $f * 188 + $s);
        } else {  ## [\xF0-\xFC].
-         my ($f, $s) = unpack ('CC', $c2);
+         my ($f, $s) = (ord substr ($c2, 0, 1), ord substr ($c2, 1, 1));
-         if ($C->{G3}->{Csjis_kuE}) {
+         if ($C->{G3}->{Csjis_kuE}) {    ## 94^2 set with first-byte->ku mapping
-           $f = $s > 0x9E ? $C->{G3}->{Csjis_kuE}->{ $f }:
+           my $F = $s > 0x9E ? $C->{G3}->{Csjis_kuE}->{ $f }:    ## ku of even number
-                            $C->{G3}->{Csjis_kuO}->{ $f };
+                               $C->{G3}->{Csjis_kuO}->{ $f };    ## ku of odd number
-           $s -= ($s > 0x9E ? 0x9F : $s > 0x7F ? 0x41 : 0x40);
+           if (defined $F) {
-           chr ($C->{G3}->{ucs} + $f * 94 + $s);
+             $s -= ($s > 0x9E ? 0x9F : $s > 0x7F ? 0x41 : 0x40);
-         } else {
+             chr ($C->{G3}->{ucs} + $F * 94 + $s);
+           } else {      ## Mapping is not defined
+             $f -= 0xF0; $s -= 0x40 + ($s > 0x7F);
+             chr ($Encode::Charset::CHARSET{G94n}->{"\x20\x40"}->{ucs} + $f * 188 + $s);
+           }
+         } elsif ($C->{G3}->{Csjis_ku}) {        ## n^2 set with first-byte->ku mapping
+           if (defined $C->{G3}->{Csjis_ku}->{ $f }) {
+             $f = $C->{G3}->{Csjis_ku}->{ $f };
+             $s -= ($s > 0x9E ? 0x9F : $s > 0x7F ? 0x41 : 0x40);
+             chr ($C->{G3}->{ucs} + $f * $C->{G3}->{chars} + $s);
+           } else {      ## Mapping is not defined
+             $f -= 0xF0; $s -= 0x40 + ($s > 0x7F);
+             chr ($Encode::Charset::CHARSET{G94n}->{"\x20\x40"}->{ucs} + $f * 188 + $s);
+           }
+         } else {        ## 94^2 set without special mapping information
            $f -= 0xF0; $s -= 0x40 + ($s > 0x7F);
            chr ($C->{G3}->{ucs} + $f * 188 + $s);
          }
-Line 103 
 sub internal_to_sjis ($\%) {
+Line 92 
 sub internal_to_sjis ($\%) {
    $C ||= &new_object;
    my $r = '';
-   for my $c (split //, $s) {
+   my @c = split //, $s;
-     my $cc = ord $c;
+   for my $i (0..$#c) {
+     my $c = $c[$i]; my $cc = ord $c;  Encode::_utf8_off ($c);
      my $t;
+     ## CL = C0 control characters
      if ($cc <= 0x1F) {
        $t = $c if $C->{ $C->{CL} } eq $Encode::Charset::CHARSET{C0}->{"\x40"};
+     ## 0x20 == SP and 0x7F == DEL
      } elsif ($cc == 0x20 || $cc == 0x7F) {
-       Encode::_utf8_off ($c);
        $t = $c;
+     ## GL = G0 = ISO/IEC 646 graphic character set
      } elsif ($cc < 0x7F) {
-       Encode::_utf8_off ($c);
        $t = $c if $C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{"\x42"};
+     ## 0x80
      } elsif ($C->{option}->{C1invoke_to_right} && $cc == 0x80) {
        $t = "\x80"
          if $C->{ $C->{CR} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};
+     ## ESC Fe = C1 control characters
      } elsif ($cc <= 0x9F) {
        $t = "\x1B".pack 'C', ($cc - 0x40)
          if $C->{ $C->{ESC_Fe} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};
+     ## G1 or G3 = 94^2 graphic character set from ISO-IR
      } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
        my $c = $cc - 0xE9F6C0;  my $F = chr (($c / 8836)+0x30);
        if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {
-Line 130 
 sub internal_to_sjis ($\%) {
+Line 123 
 sub internal_to_sjis ($\%) {
        } elsif ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {
          my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
          if ($C->{G3}->{Csjis_first}) {
-           $t = pack ('CC', $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 },
+           my $fb = $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 };
-                      $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
+           $t = pack ('CC', $fb, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) if $fb;
          } else {
            $t = pack ('CC', ($c / 188) + 0xF0,
                       $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))
                 if ($c / 188) + 0xF0 < 0xFD;
          }
        }
+     ## G1 = JIS X 0208-1990/:1997
      } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
        my $c = $cc - 0xF49D7C;
        if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{'B@'}) {
-Line 145 
 sub internal_to_sjis ($\%) {
+Line 139 
 sub internal_to_sjis ($\%) {
          $t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1),
                 $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
        }
+     ## GL = G0 = ISO/IEC 646 graphic character set / GR = G2 = JIS X 0201 Katakana set
      } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
        my $c = $cc - 0xE90940;  my $F = chr (($c / 94)+0x30);
        if ($C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {
-Line 153 
 sub internal_to_sjis ($\%) {
+Line 147 
 sub internal_to_sjis ($\%) {
        } elsif ($C->{ $C->{GR} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {
          $t = pack 'C', (($c % 94) + 0xA1) if ($c % 94) < 0x3F;
        }
+     ## G1 / G3 = 94^2 graphic character set
      } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
        my $c = $cc % 0x10000;
        my $F0=$C->{option}->{private_set}->{G94n}->[($cc/0x10000)-0x7042]->[$c/8836];
-Line 161 
 sub internal_to_sjis ($\%) {
+Line 156 
 sub internal_to_sjis ($\%) {
         || $C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F1 }) {
          my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
          if ($C->{G3}->{Csjis_first}) {
-           $t = pack ('CC', $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 },
+           my $fb = $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 };
-                      $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
+           $t = pack ('CC', $fb, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) if $fb;
          } else {
            $t = pack ('CC', ($c / 188) + 0xF0,
                       $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))
                 if ($c / 188) + 0xF0 < 0xFD;
          }
        }
+     ## Non-ISO/IEC 2022 Coded Character Sets Mapping Area
+     } elsif (0x71000000 <= $cc && $cc <= 0x71FFFFFF) {
+       if ($C->{G3}->{ucs} <= $cc) {
+         my $c = $cc - $C->{G3}->{ucs};
+         my $f = $C->{G3}->{Csjis_first}->{$c / $C->{G3}->{chars}};
+         if ($f) {
+           my $s = $c % $C->{G3}->{chars};
+           $t = pack ('CC', $f, 0x40 + $s + ($s > 62));
+         }
+       }
+     ## Other character sets are not supported now (and there is no plan to implement them).
      }
+     ## Output the character itself
      if (defined $t) {
        $r .= $t;
+     ## Output the character itself with mapping table of special code positions
      } elsif ($C->{GsmapR}->{ $c }) {
        $r .= $C->{GsmapR}->{ $c };
+     } elsif ($C->{option}->{fallback_from_ucs} =~ /quiet/) {
+       return ($r, halfway => 1, converted_length => $i,
+               warn => $C->{option}->{fallback_from_ucs} =~ /warn/ ? 1 : 0,
+               reason => sprintf (q(U+%04X: There is no character mapped to), $cc));
+     } elsif ($C->{option}->{fallback_from_ucs} eq 'croak') {
+       return ($r, halfway => 1, die => 1,
+               reason => sprintf (q(U+%04X: There is no character mapped to), $cc));
+     ##
      } else {
-       $r .= $C->{option}->{undef_char_sjis} || "\x3F";
+       ## Try to output with fallback escape sequence (if specified)
+       my $t = Encode::Charset->fallback_escape ($C, $c);
+       if (defined $t) {
+         my %D = (fallback => $C->{option}->{fallback_from_ucs}, reset => $C->{option}->{reset});
+         $C->{option}->{fallback_from_ucs} = 'croak';
+         eval q{$t = $C->{_encoder}->_encode_internal ($t, $C)} or undef $t;
+         $C->{option}->{fallback_from_ucs} = $D{fallback};
+       }
+       if (defined $t) {
+         $r .= $t;
+       } else {  ## Replacement character specified in charset definition
+         $r .= $C->{option}->{undef_char_sjis} || "\x3F";
+       }
      }
    }
    $r;
  }
- sub __clone ($) {
+ sub page_to_internal ($$) {
-   my $self = shift;
+   my ($C, $s) = @_;
-   bless {%$self}, ref $self;
+   $s = pack ('U*', unpack ('C*', $s));
- };
+   $s =~ s(\x1B\x24([EFGOPQ])([\x21-\x7E]+)\x0F)(
+     my $page = {qw/E 1 F 2 G 3 O 4 P 5 Q 6/}->{$1};
- __PACKAGE__->Define (qw!shift_jisx0213 japanese-shift-jisx0213
+     my $r = '';
- shift-jisx0213 x-shift_jisx0213 shift-jis-3 shift-jis-2000 sjisx0213
+     for my $c (split //, $2) {
- sjis s-jis shift-jis x-sjis x_sjis x-sjis-jp shiftjis x-shiftjis
+       $r .= chr ($Encode::Charset::CHARSET{G94}->{'CSpictogram_page_'.$page}->{ucs} + ord ($c) - 0x21);
- x-shift-jis shift.jis!);
+     }
+     $r;
- =item sjis
+   )gex;
+   $s;
- "Shift JIS" coding system.  (Alias: shift-jis, shiftjis,
- shift.jis, x-shiftjis, x-shift-jis, s-jis, x-sjis, x_sjis,
- x-sjis-jp)
- Since this name is ambiguous (it can now refer to all or any
- of the shift JIS coding system family), this name should not
- be used to address a specific coding system.  In this module,
- this is considered an alias name for the shift JIS with the
- latest official definition, currently that of JIS X 0213:2000
- Appendix 1 (with implementation level 4).
- Note that the name "Shift_JIS" is not associated with
- this name, because the IANA registry [IANAREG] assigns
- it to a shift JIS defined by JIS X 0208:1997.
- =item shift_jisx0213
- Shift_JISX0213 coded representation, defined by
- JIS X 0213:2000 Appendix 1 (implementation level 4).
- (Alias: shift-jisx0213, x-shift_jisx0213, japanese-shift-jisx0213 (emacsen),
- shift-jis-3 (Yudit), shift-jis-2000, sjisx0213)
- =cut
- sub __2022__common ($) {
-   my $C = Encode::SJIS->new_object;
-   $C->{G0} = $Encode::Charset::CHARSET{G94}->{J};       ## JIS X 0201:1997 Latin
-   $C->{G1} = $Encode::Charset::CHARSET{G94n}->{"\x4F"}; ## JIS X 0213:2000 plane 1
-   $C->{G2} = $Encode::Charset::CHARSET{G94}->{I};       ## JIS X 0201:1997 Katakana
-   $C->{G3} = $Encode::Charset::CHARSET{G94n}->{"\x50"}; ## JIS X 0213:2000 plane 2
-   $C;
- }
- sub __2022_encode ($) {
-   my $C = shift->__2022__common;
-   $C;
- }
- sub __2022_decode ($) {
-   my $C = shift->__2022__common;
-   $C;
- }
- sub __encode_map ($) {
-   [qw/ucs_to_jisx0201_latin ucs_to_jisx0213_2000_1 ucs_to_jisx0213_2000_2 ucs_to_jisx0201_katakana/];
- }
- sub __decode_map ($) {
-   [qw/jisx0201_latin_to_ucs jisx0213_2000_1_to_ucs jisx0213_2000_2_to_ucs jisx0201_katakana_to_ucs/];
  }
- package Encode::SJIS::X0213ASCII;
+ sub _internal_to_page ($$$%) {
- use vars qw/@ISA/;
+   my ($yourself, $C, $c, $option) = @_;
- push @ISA, 'Encode::SJIS';
+   my $cc = ord $c;
- __PACKAGE__->Define (qw/shift_jisx0213-ascii shift-jis-2000-ascii
+   for my $page (1..6) {
- sjis-ascii shift-jis-ascii/);
+     my $cs = $Encode::Charset::CHARSET{G94}->{'CSpictogram_page_'.$page};
+     if ($cs->{ucs} <= $cc && $cc < $cs->{ucs} + $cs->{chars} * $cs->{dimension}) {
- =item sjis-ascii
+       return "\x1B\x24" . ([qw/_ E F G O P Q/]->[$page])
+             .pack ('C', 0x21 + $cc - $cs->{ucs}) . "\x0F";
- Same as sjis but ASCII (ISO/IEC 646 IRV) instead of
+     }
- JIS X 0201 Roman (or Latin) set.  (Alias: shift-jis-ascii)
+   }
+   ## $c is not a pictogram
- In spite of the history of shift JIS, ASCII is sometimes
+   $option->{fallback_from_ucs} = $C->{option}->{fallback_from_ucs_2};
- used instead of JIS X 0201 Roman set, because of compatibility
+   $yourself->fallback_escape ($C, $c, %$option);
- with ASCII world.
- Note that this name is now an alias of shift_jisx0213-ascii,
- as sjis is of shift_jisx0213.
- =item shift_jisx0213-ascii
- Same as Shift_JISX0213 but ASCII (ISO/IEC 646 IRV)
- instead of JIS X 0201:1997 Latin character set.
- (Alias: shift-jis-2000-ascii)
- Note that this coding system does NOT conform to
- JIS X 0213:2000 Appendix 1.
- =cut
- sub __2022__common ($) {
-   my $C = shift->SUPER::__2022__common;
-   $C->{G0} = $Encode::Charset::CHARSET{G94}->{B};       ## ASCII
-   $C;
- }
- sub __encode_map ($) {
-   [qw/ucs_to_ascii ucs_to_jisx0213_2000_1 ucs_to_jisx0213_2000_2 ucs_to_jisx0201_katakana/];
- }
- sub __decode_map ($) {
-   [qw/jisx0213_2000_1_to_ucs jisx0213_2000_2_to_ucs jisx0201_katakana_to_ucs/];
  }
-;
- __END__
  =back
  =head1 SEE ALSO
-Line 297 
 JIS X 0213:2000, "7-bit and 8-bit double
+Line 251 
 JIS X 0213:2000, "7-bit and 8-bit double
  sets for information interchange", Japan Industrial Standards
  Committee (JISC) <http://www.jisc.go.jp/>, 2000.
- Encode, perlunicode
+ L<Encode::SJIS::JIS>
+ L<Encode>, L<perlunicode>
- [IANAREG] "CHARACTER SETS", IANA <http://www.iana.org/>,
+ L<Encode::Charset>, L<Encode::ISO2022>
- <http://www.iana.org/assignments/character-sets>.
- The charset registry for IETF <http://www.ietf.org/> standards.
- (Note that in this registry two shift JISes are registered,
- "Shift_JIS" and "Windows-31j".  The former is JIS X 0208:1997's
- definition and the latter is the Windows standard character set.)
  =head1 LICENSE
- Copyright 2002 Nanashi-san
+ Copyright 2002 Nanashi-san <nanashi-san@nanashi.invalid>
  This library is free software; you can redistribute it
  and/or modify it under the same terms as Perl itself.
  =cut
- # $Date$
+; # $Date$
- ### SJIS.pm ends here

 Legend:



Removed from v.1.4
 


changed lines


 
Added in v.1.6
 Legend:



Removed from v.1.4
 


changed lines


 
Added in v.1.6
-Removed from v.1.4
+Added in v.1.6

admin@suikawiki.org	ViewVC Help
Powered by ViewVC 1.1.24