=head1 NAME Encode::SJIS --- Shift JIS coding systems encoder and decoder =head1 ENCODINGS This module defines only two basic version of shift JIS. Other variants are defined in Encode::SJIS::* modules. =over 4 =cut package Encode::SJIS; use 5.7.3; use strict; our $VERSION=do{my @r=(q$Revision: 1.3 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; require Encode::Charset; use base qw(Encode::Encoding); ### --- Perl Encode module common functions sub encode ($$;$) { my ($obj, $str, $chk) = @_; $_[1] = '' if $chk; if (!defined $obj->{_encode_mapping} || $obj->{_encode_mapping}) { require Encode::Table; $str = Encode::Table::convert ($str, $obj->__encode_map, -autoload => defined $obj->{_encode_mapping_autoload} ? $obj->{_encode_mapping_autoload} : 1); } $str = &internal_to_sjis ($str, $obj->__2022_encode); $str; } sub decode ($$;$) { my ($obj, $str, $chk) = @_; $_[1] = '' if $chk; $str = &sjis_to_internal ($str, $obj->__2022_decode); if (!defined $obj->{_decode_mapping} || $obj->{_decode_mapping}) { require Encode::Table; $str = Encode::Table::convert ($str, $obj->__decode_map, -autoload => defined $obj->{_decode_mapping_autoload} ? $obj->{_decode_mapping_autoload} : 1); } $str; } ### --- Encode::SJIS unique functions *new_object = \&Encode::Charset::new_object_sjis; sub sjis_to_internal ($$) { my ($s, $C) = @_; $C ||= &new_object; $s =~ s{ ([\x00-\x7F\xA1-\xDF]) # ([\xA1-\xDF]) |([\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]) |\x1B([\x40-\x5F]) |([\x80-\xFF]) ## Broken or supplemental 1-byte character }{ my ($c7, $c2, $c1, $c8) = ($1, $2, $3, $4); if (defined $c7) { if ($c7 =~ /([\x21-\x7E])/) { chr ($C->{ $C->{GL} }->{ucs} + ord ($1) - 0x21); } elsif ($c7 =~ /([\x00-\x1F])/) { chr ($C->{ $C->{CL} }->{ucs} + ord ($1)); } elsif ($C->{GR} && $c7 =~ /([\xA1-\xDF])/) { chr ($C->{ $C->{GR} }->{ucs} + ord ($1) - 0xA1); } else { ## 0x20, 0x7F $C->{Gsmap}->{ $c7 } || $c7; } } elsif ($c2) { if ($c2 =~ /([\x81-\xEF])(.)/) { my ($f, $s) = (ord $1, ord $2); $f -= $f < 0xA0 ? 0x81 : 0xC1; $s -= 0x40 + ($s > 0x7F); chr ($C->{G1}->{ucs} + $f * 188 + $s); } else { ## [\xF0-\xFC]. my ($f, $s) = unpack ('CC', $c2); if ($C->{G3}->{Csjis_kuE}) { $f = $s > 0x9E ? $C->{G3}->{Csjis_kuE}->{ $f }: $C->{G3}->{Csjis_kuO}->{ $f }; $s -= ($s > 0x9E ? 0x9F : $s > 0x7F ? 0x41 : 0x40); chr ($C->{G3}->{ucs} + $f * 94 + $s); } else { $f -= 0xF0; $s -= 0x40 + ($s > 0x7F); chr ($C->{G3}->{ucs} + $f * 188 + $s); } } } elsif ($c1) { ## ESC Fe chr ($C->{ $C->{ESC_Fe} }->{ucs} + ord ($c1) - 0x40); } else { # $C8 $C->{Gsmap}->{ $c8 } || $c8; } }gex; $s; } sub internal_to_sjis ($\%) { use integer; my ($s, $C) = @_; $C ||= &new_object; my $r = ''; for my $c (split //, $s) { my $cc = ord $c; my $t; if ($cc <= 0x1F) { $t = $c if $C->{ $C->{CL} } eq $Encode::Charset::CHARSET{C0}->{"\x40"}; } elsif ($cc == 0x20 || $cc == 0x7F) { Encode::_utf8_off ($c); $t = $c; } elsif ($cc < 0x7F) { Encode::_utf8_off ($c); $t = $c if $C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{"\x42"}; } elsif ($C->{option}->{C1invoke_to_right} && $cc == 0x80) { $t = "\x80" if $C->{ $C->{CR} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'}; } elsif ($cc <= 0x9F) { $t = "\x1B".pack 'C', ($cc - 0x40) if $C->{ $C->{ESC_Fe} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'}; } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) { my $c = $cc - 0xE9F6C0; my $F = chr (($c / 8836)+0x30); if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{ $F }) { my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21); $t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1), $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)); } elsif ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F }) { my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21); if ($C->{G3}->{Csjis_first}) { $t = pack ('CC', $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 }, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)); } else { $t = pack ('CC', ($c / 188) + 0xF0, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) if ($c / 188) + 0xF0 < 0xFD; } } } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) { my $c = $cc - 0xF49D7C; if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{'B@'}) { my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21); $t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1), $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)); } } elsif (0xE90940 <= $cc && $cc <= 0xE92641) { my $c = $cc - 0xE90940; my $F = chr (($c / 94)+0x30); if ($C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{ $F }) { $t = pack 'C', (($c % 94) + 0x21); } elsif ($C->{ $C->{GR} } eq $Encode::Charset::CHARSET{G94}->{ $F }) { $t = pack 'C', (($c % 94) + 0xA1) if ($c % 94) < 0x3F; } } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) { my $c = $cc % 0x10000; my $F0=$C->{option}->{private_set}->{G94n}->[($cc/0x10000)-0x7042]->[$c/8836]; my $F1 = 'P'.(($cc / 0x10000) - 0x7042).'_'.($c / 8836); if ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F0 } || $C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F1 }) { my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21); if ($C->{G3}->{Csjis_first}) { $t = pack ('CC', $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 }, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)); } else { $t = pack ('CC', ($c / 188) + 0xF0, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) if ($c / 188) + 0xF0 < 0xFD; } } } if (defined $t) { $r .= $t; } elsif ($C->{GsmapR}->{ $c }) { $r .= $C->{GsmapR}->{ $c }; } else { $r .= $C->{option}->{undef_char_sjis} || "\x3F"; } } $r; } sub __clone ($) { my $self = shift; bless {%$self}, ref $self; }; __PACKAGE__->Define (qw!shift_jisx0213 japanese-shift-jisx0213 shift-jisx0213 x-shift_jisx0213 shift-jis-3 shift-jis-2000 sjis s-jis shift-jis x-sjis x_sjis x-sjis-jp shiftjis x-shiftjis x-shift-jis shift.jis!); =item sjis "Shift JIS" coding system. (Alias: shift-jis, shiftjis, shift.jis, x-shiftjis, x-shift-jis, s-jis, x-sjis, x_sjis, x-sjis-jp) Since this name is ambiguous (it can now refer all or any of shift JIS coding system family), this name should not be used to address specific coding system. In this module, this is considered as an alias name to the shift JIS with latest official definition, currently of JIS X 0213:2000 Appendix 1 (with implemention level 4). Note that the name "Shift_JIS" is not associated with this name, because IANA registry [IANAREG] assignes it to a shift JIS defined by JIS X 0208:1997. =item shift_jisx0213 Shift_JISX0213 coded representation, defined by JIS X 0213:2000 Appendix 1 (implemention level 4). (Alias: shift-jisx0213, x-shift_jisx0213, japanese-shift-jisx0213 (emacsen), shift-jis-3 (Yudit), shift-jis-2000) =cut sub __2022__common ($) { my $C = Encode::SJIS->new_object; $C->{G0} = $Encode::Charset::CHARSET{G94}->{J}; ## JIS X 0201:1997 Latin $C->{G1} = $Encode::Charset::CHARSET{G94n}->{"\x4F"}; ## JIS X 0213:2000 plane 1 $C->{G2} = $Encode::Charset::CHARSET{G94}->{I}; ## JIS X 0201:1997 Katakana $C->{G3} = $Encode::Charset::CHARSET{G94n}->{"\x50"}; ## JIS X 0213:2000 plane 2 $C; } sub __2022_encode ($) { my $C = shift->__2022__common; $C; } sub __2022_decode ($) { my $C = shift->__2022__common; $C; } sub __encode_map ($) { [qw/ucs_to_jisx0201_latin ucs_to_jisx0213_2000_1 ucs_to_jisx0213_2000_2 ucs_to_jisx0201_katakana/]; } sub __decode_map ($) { [qw/jisx0201_latin_to_ucs jisx0213_2000_1_to_ucs jisx0213_2000_2_to_ucs jisx0201_katakana_to_ucs/]; } package Encode::SJIS::X0213ASCII; use vars qw/@ISA/; push @ISA, 'Encode::SJIS'; __PACKAGE__->Define (qw/shift_jisx0213-ascii shift-jis-2000-ascii sjis-ascii shift-jis-ascii/); =item sjis-ascii Same as sjis but ASCII (ISO/IEC 646 IRV) instead of JIS X 0201 Roman (or Latin) set. (Alias: shift-jis-ascii) In spite of the history of shift JIS, ASCII is sometimes used instead of JIS X 0201 Roman set, because of compatibility with ASCII world. Note that this name is now an alias of shift_jisx0213-ascii, as sjis is of shift_jisx0213. =item shift_jisx0213-ascii Same as Shift_JISX0213 but ASCII (ISO/IEC 646 IRV) instead of JIS X 0201:1997 Latin character set. (Alias: shift-jis-2000-ascii) Note that this coding system does NOT comform to JIS X 0213:2000 Appendix 1. =cut sub __2022__common ($) { my $C = shift->SUPER::__2022__common; $C->{G0} = $Encode::Charset::CHARSET{G94}->{B}; ## ASCII $C; } sub __encode_map ($) { [qw/ucs_to_ascii ucs_to_jisx0213_2000_1 ucs_to_jisx0213_2000_2 ucs_to_jisx0201_katakana/]; } sub __decode_map ($) { [qw/jisx0213_2000_1_to_ucs jisx0213_2000_2_to_ucs jisx0201_katakana_to_ucs/]; } 1; __END__ =back =head1 SEE ALSO JIS X 0208:1997, "7-bit and 8-bit double byte coded Kanji set for information interchange", Japan Industrial Standards Committee (JISC) , 1997. JIS X 0213:2000, "7-bit and 8-bit double byte coded extended Kanji sets for information interchange", Japan Industrial Standards Committee (JISC) , 2000. Encode, perlunicode [IANAREG] "CHARACTER SETS", IANA , . The charset registry for IETF standards. (Note that in this registry two shift JISes are registered, "Shift_JIS" and "Windows-31j". Former is JIS X 0208:1997's definition and later is the Windows standard character set.) =head1 LICENSE Copyright 2002 Nanashi-san This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut # $Date: 2002/10/14 06:58:35 $ ### SJIS.pm ends here