| 1 |
wakaba |
1.1 |
|
| 2 |
|
|
=head1 NAME |
| 3 |
|
|
|
| 4 |
|
|
Encode::SJIS --- Shift JIS coding systems encoder and decoder |
| 5 |
|
|
|
| 6 |
|
|
=head1 ENCODINGS |
| 7 |
|
|
|
| 8 |
wakaba |
1.5 |
This module defines encoding engine for Shift JIS coding systems. |
| 9 |
|
|
This module only provides general en/decoding parts. Actual profiles |
| 10 |
|
|
for Shift JISes are included in Encode::SJIS::*. |
| 11 |
wakaba |
1.1 |
|
| 12 |
|
|
=over 4 |
| 13 |
|
|
|
| 14 |
|
|
=cut |
| 15 |
|
|
|
| 16 |
|
|
package Encode::SJIS; |
| 17 |
|
|
use 5.7.3; |
| 18 |
|
|
use strict; |
| 19 |
wakaba |
1.6 |
our $VERSION=do{my @r=(q$Revision: 1.5 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
| 20 |
wakaba |
1.1 |
require Encode::Charset; |
| 21 |
wakaba |
1.2 |
use base qw(Encode::Encoding); |
| 22 |
wakaba |
1.1 |
|
| 23 |
wakaba |
1.5 |
*new_object = \&Encode::Charset::new_object_sjis; |
| 24 |
wakaba |
1.1 |
|
| 25 |
wakaba |
1.5 |
## Code extention escape sequence defined by ISO/IEC 2022 is |
| 26 |
|
|
## not supported in this version of this module. |
| 27 |
wakaba |
1.1 |
|
| 28 |
|
|
sub sjis_to_internal ($$) { |
| 29 |
|
|
my ($s, $C) = @_; |
| 30 |
|
|
$C ||= &new_object; |
| 31 |
|
|
$s =~ s{ |
| 32 |
|
|
([\x00-\x7F\xA1-\xDF]) |
| 33 |
|
|
# ([\xA1-\xDF]) |
| 34 |
|
|
|([\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]) |
| 35 |
|
|
|\x1B([\x40-\x5F]) |
| 36 |
|
|
|([\x80-\xFF]) ## Broken or supplemental 1-byte character |
| 37 |
|
|
}{ |
| 38 |
|
|
my ($c7, $c2, $c1, $c8) = ($1, $2, $3, $4); |
| 39 |
|
|
if (defined $c7) { |
| 40 |
|
|
if ($c7 =~ /([\x21-\x7E])/) { |
| 41 |
|
|
chr ($C->{ $C->{GL} }->{ucs} + ord ($1) - 0x21); |
| 42 |
|
|
} elsif ($c7 =~ /([\x00-\x1F])/) { |
| 43 |
|
|
chr ($C->{ $C->{CL} }->{ucs} + ord ($1)); |
| 44 |
|
|
} elsif ($C->{GR} && $c7 =~ /([\xA1-\xDF])/) { |
| 45 |
|
|
chr ($C->{ $C->{GR} }->{ucs} + ord ($1) - 0xA1); |
| 46 |
|
|
} else { ## 0x20, 0x7F |
| 47 |
|
|
$C->{Gsmap}->{ $c7 } || $c7; |
| 48 |
|
|
} |
| 49 |
|
|
} elsif ($c2) { |
| 50 |
|
|
if ($c2 =~ /([\x81-\xEF])(.)/) { |
| 51 |
|
|
my ($f, $s) = (ord $1, ord $2); |
| 52 |
|
|
$f -= $f < 0xA0 ? 0x81 : 0xC1; $s -= 0x40 + ($s > 0x7F); |
| 53 |
|
|
chr ($C->{G1}->{ucs} + $f * 188 + $s); |
| 54 |
|
|
} else { ## [\xF0-\xFC]. |
| 55 |
wakaba |
1.6 |
my ($f, $s) = (ord substr ($c2, 0, 1), ord substr ($c2, 1, 1)); |
| 56 |
|
|
if ($C->{G3}->{Csjis_kuE}) { ## 94^2 set with first-byte->ku mapping |
| 57 |
|
|
my $F = $s > 0x9E ? $C->{G3}->{Csjis_kuE}->{ $f }: ## ku of even number |
| 58 |
|
|
$C->{G3}->{Csjis_kuO}->{ $f }; ## ku of odd number |
| 59 |
|
|
if (defined $F) { |
| 60 |
|
|
$s -= ($s > 0x9E ? 0x9F : $s > 0x7F ? 0x41 : 0x40); |
| 61 |
|
|
chr ($C->{G3}->{ucs} + $F * 94 + $s); |
| 62 |
|
|
} else { ## Mapping is not defined |
| 63 |
|
|
$f -= 0xF0; $s -= 0x40 + ($s > 0x7F); |
| 64 |
|
|
chr ($Encode::Charset::CHARSET{G94n}->{"\x20\x40"}->{ucs} + $f * 188 + $s); |
| 65 |
|
|
} |
| 66 |
|
|
} elsif ($C->{G3}->{Csjis_ku}) { ## n^2 set with first-byte->ku mapping |
| 67 |
|
|
if (defined $C->{G3}->{Csjis_ku}->{ $f }) { |
| 68 |
|
|
$f = $C->{G3}->{Csjis_ku}->{ $f }; |
| 69 |
|
|
$s -= ($s > 0x9E ? 0x9F : $s > 0x7F ? 0x41 : 0x40); |
| 70 |
|
|
chr ($C->{G3}->{ucs} + $f * $C->{G3}->{chars} + $s); |
| 71 |
|
|
} else { ## Mapping is not defined |
| 72 |
|
|
$f -= 0xF0; $s -= 0x40 + ($s > 0x7F); |
| 73 |
|
|
chr ($Encode::Charset::CHARSET{G94n}->{"\x20\x40"}->{ucs} + $f * 188 + $s); |
| 74 |
|
|
} |
| 75 |
|
|
} else { ## 94^2 set without special mapping information |
| 76 |
wakaba |
1.1 |
$f -= 0xF0; $s -= 0x40 + ($s > 0x7F); |
| 77 |
|
|
chr ($C->{G3}->{ucs} + $f * 188 + $s); |
| 78 |
|
|
} |
| 79 |
|
|
} |
| 80 |
|
|
} elsif ($c1) { ## ESC Fe |
| 81 |
|
|
chr ($C->{ $C->{ESC_Fe} }->{ucs} + ord ($c1) - 0x40); |
| 82 |
|
|
} else { # $C8 |
| 83 |
|
|
$C->{Gsmap}->{ $c8 } || $c8; |
| 84 |
|
|
} |
| 85 |
|
|
}gex; |
| 86 |
|
|
$s; |
| 87 |
|
|
} |
| 88 |
|
|
|
| 89 |
wakaba |
1.2 |
sub internal_to_sjis ($\%) { |
| 90 |
|
|
use integer; |
| 91 |
|
|
my ($s, $C) = @_; |
| 92 |
|
|
$C ||= &new_object; |
| 93 |
|
|
|
| 94 |
|
|
my $r = ''; |
| 95 |
wakaba |
1.5 |
my @c = split //, $s; |
| 96 |
|
|
for my $i (0..$#c) { |
| 97 |
|
|
my $c = $c[$i]; my $cc = ord $c; Encode::_utf8_off ($c); |
| 98 |
wakaba |
1.2 |
my $t; |
| 99 |
wakaba |
1.5 |
## CL = C0 control characters |
| 100 |
wakaba |
1.2 |
if ($cc <= 0x1F) { |
| 101 |
|
|
$t = $c if $C->{ $C->{CL} } eq $Encode::Charset::CHARSET{C0}->{"\x40"}; |
| 102 |
wakaba |
1.5 |
## 0x20 == SP and 0x7E == DEL |
| 103 |
wakaba |
1.2 |
} elsif ($cc == 0x20 || $cc == 0x7F) { |
| 104 |
|
|
$t = $c; |
| 105 |
wakaba |
1.5 |
## GL = G0 = ISO/IEC 646 graphic character set |
| 106 |
wakaba |
1.2 |
} elsif ($cc < 0x7F) { |
| 107 |
|
|
$t = $c if $C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{"\x42"}; |
| 108 |
wakaba |
1.5 |
## 0x80 |
| 109 |
wakaba |
1.2 |
} elsif ($C->{option}->{C1invoke_to_right} && $cc == 0x80) { |
| 110 |
wakaba |
1.3 |
$t = "\x80" |
| 111 |
|
|
if $C->{ $C->{CR} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'}; |
| 112 |
wakaba |
1.5 |
## ESC Fe = C1 control characters |
| 113 |
wakaba |
1.2 |
} elsif ($cc <= 0x9F) { |
| 114 |
wakaba |
1.3 |
$t = "\x1B".pack 'C', ($cc - 0x40) |
| 115 |
wakaba |
1.2 |
if $C->{ $C->{ESC_Fe} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'}; |
| 116 |
wakaba |
1.5 |
## G1 or G3 = 94^2 graphic character set from ISO-IR |
| 117 |
wakaba |
1.2 |
} elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) { |
| 118 |
|
|
my $c = $cc - 0xE9F6C0; my $F = chr (($c / 8836)+0x30); |
| 119 |
|
|
if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{ $F }) { |
| 120 |
|
|
my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21); |
| 121 |
|
|
$t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1), |
| 122 |
|
|
$c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)); |
| 123 |
|
|
} elsif ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F }) { |
| 124 |
|
|
my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21); |
| 125 |
|
|
if ($C->{G3}->{Csjis_first}) { |
| 126 |
wakaba |
1.5 |
my $fb = $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 }; |
| 127 |
|
|
$t = pack ('CC', $fb, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) if $fb; |
| 128 |
wakaba |
1.2 |
} else { |
| 129 |
|
|
$t = pack ('CC', ($c / 188) + 0xF0, |
| 130 |
|
|
$c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) |
| 131 |
|
|
if ($c / 188) + 0xF0 < 0xFD; |
| 132 |
|
|
} |
| 133 |
|
|
} |
| 134 |
wakaba |
1.5 |
## G1 = JIS X 0208-1990/:1997 |
| 135 |
wakaba |
1.2 |
} elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) { |
| 136 |
|
|
my $c = $cc - 0xF49D7C; |
| 137 |
|
|
if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{'B@'}) { |
| 138 |
|
|
my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21); |
| 139 |
|
|
$t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1), |
| 140 |
|
|
$c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)); |
| 141 |
|
|
} |
| 142 |
wakaba |
1.5 |
## GL = G0 = ISO/IEC 646 graphic character set / GR = G2 = JIS X 0201 Katakana set |
| 143 |
wakaba |
1.2 |
} elsif (0xE90940 <= $cc && $cc <= 0xE92641) { |
| 144 |
|
|
my $c = $cc - 0xE90940; my $F = chr (($c / 94)+0x30); |
| 145 |
|
|
if ($C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{ $F }) { |
| 146 |
wakaba |
1.3 |
$t = pack 'C', (($c % 94) + 0x21); |
| 147 |
wakaba |
1.2 |
} elsif ($C->{ $C->{GR} } eq $Encode::Charset::CHARSET{G94}->{ $F }) { |
| 148 |
wakaba |
1.3 |
$t = pack 'C', (($c % 94) + 0xA1) if ($c % 94) < 0x3F; |
| 149 |
wakaba |
1.2 |
} |
| 150 |
wakaba |
1.5 |
## G1 / G3 = 94^2 graphic character set |
| 151 |
wakaba |
1.2 |
} elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) { |
| 152 |
|
|
my $c = $cc % 0x10000; |
| 153 |
|
|
my $F0=$C->{option}->{private_set}->{G94n}->[($cc/0x10000)-0x7042]->[$c/8836]; |
| 154 |
|
|
my $F1 = 'P'.(($cc / 0x10000) - 0x7042).'_'.($c / 8836); |
| 155 |
|
|
if ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F0 } |
| 156 |
|
|
|| $C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F1 }) { |
| 157 |
|
|
my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21); |
| 158 |
|
|
if ($C->{G3}->{Csjis_first}) { |
| 159 |
wakaba |
1.5 |
my $fb = $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 }; |
| 160 |
|
|
$t = pack ('CC', $fb, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) if $fb; |
| 161 |
wakaba |
1.2 |
} else { |
| 162 |
|
|
$t = pack ('CC', ($c / 188) + 0xF0, |
| 163 |
|
|
$c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) |
| 164 |
|
|
if ($c / 188) + 0xF0 < 0xFD; |
| 165 |
|
|
} |
| 166 |
|
|
} |
| 167 |
wakaba |
1.6 |
## Non-ISO/IEC 2022 Coded Character Sets Mapping Area |
| 168 |
|
|
} elsif (0x71000000 <= $cc && $cc <= 0x71FFFFFF) { |
| 169 |
|
|
if ($C->{G3}->{ucs} <= $cc) { |
| 170 |
|
|
my $c = $cc - $C->{G3}->{ucs}; |
| 171 |
|
|
my $f = $C->{G3}->{Csjis_first}->{$c / $C->{G3}->{chars}}; |
| 172 |
|
|
if ($f) { |
| 173 |
|
|
my $s = $c % $C->{G3}->{chars}; |
| 174 |
|
|
$t = pack ('CC', $f, 0x40 + $s + ($s > 62)); |
| 175 |
|
|
} |
| 176 |
|
|
} |
| 177 |
wakaba |
1.5 |
## Other character sets are not supported now (and there is no plan to implement them). |
| 178 |
wakaba |
1.2 |
} |
| 179 |
|
|
|
| 180 |
wakaba |
1.5 |
## Output the character itself |
| 181 |
wakaba |
1.2 |
if (defined $t) { |
| 182 |
|
|
$r .= $t; |
| 183 |
wakaba |
1.5 |
## Output the character itself with mapping table of special code positions |
| 184 |
wakaba |
1.2 |
} elsif ($C->{GsmapR}->{ $c }) { |
| 185 |
|
|
$r .= $C->{GsmapR}->{ $c }; |
| 186 |
wakaba |
1.5 |
} elsif ($C->{option}->{fallback_from_ucs} =~ /quiet/) { |
| 187 |
|
|
return ($r, halfway => 1, converted_length => $i, |
| 188 |
|
|
warn => $C->{option}->{fallback_from_ucs} =~ /warn/ ? 1 : 0, |
| 189 |
|
|
reason => sprintf (q(U+%04X: There is no character mapped to), $cc)); |
| 190 |
|
|
} elsif ($C->{option}->{fallback_from_ucs} eq 'croak') { |
| 191 |
|
|
return ($r, halfway => 1, die => 1, |
| 192 |
|
|
reason => sprintf (q(U+%04X: There is no character mapped to), $cc)); |
| 193 |
|
|
## |
| 194 |
wakaba |
1.2 |
} else { |
| 195 |
wakaba |
1.5 |
## Try to output with fallback escape sequence (if specified) |
| 196 |
wakaba |
1.6 |
my $t = Encode::Charset->fallback_escape ($C, $c); |
| 197 |
wakaba |
1.5 |
if (defined $t) { |
| 198 |
|
|
my %D = (fallback => $C->{option}->{fallback_from_ucs}, reset => $C->{option}->{reset}); |
| 199 |
|
|
$C->{option}->{fallback_from_ucs} = 'croak'; |
| 200 |
|
|
eval q{$t = $C->{_encoder}->_encode_internal ($t, $C)} or undef $t; |
| 201 |
|
|
$C->{option}->{fallback_from_ucs} = $D{fallback}; |
| 202 |
|
|
} |
| 203 |
|
|
if (defined $t) { |
| 204 |
|
|
$r .= $t; |
| 205 |
|
|
} else { ## Replacement character specified in charset definition |
| 206 |
|
|
$r .= $C->{option}->{undef_char_sjis} || "\x3F"; |
| 207 |
|
|
} |
| 208 |
wakaba |
1.2 |
} |
| 209 |
|
|
} |
| 210 |
|
|
$r; |
| 211 |
|
|
} |
| 212 |
|
|
|
| 213 |
wakaba |
1.6 |
sub page_to_internal ($$) { |
| 214 |
|
|
my ($C, $s) = @_; |
| 215 |
|
|
$s = pack ('U*', unpack ('C*', $s)); |
| 216 |
|
|
$s =~ s(\x1B\x24([EFGOPQ])([\x21-\x7E]+)\x0F)( |
| 217 |
|
|
my $page = {qw/E 1 F 2 G 3 O 4 P 5 Q 6/}->{$1}; |
| 218 |
|
|
my $r = ''; |
| 219 |
|
|
for my $c (split //, $2) { |
| 220 |
|
|
$r .= chr ($Encode::Charset::CHARSET{G94}->{'CSpictogram_page_'.$page}->{ucs} + ord ($c) - 0x21); |
| 221 |
|
|
} |
| 222 |
|
|
$r; |
| 223 |
|
|
)gex; |
| 224 |
|
|
$s; |
| 225 |
|
|
} |
| 226 |
|
|
|
| 227 |
|
|
sub _internal_to_page ($$$%) { |
| 228 |
|
|
my ($yourself, $C, $c, $option) = @_; |
| 229 |
|
|
my $cc = ord $c; |
| 230 |
|
|
for my $page (1..6) { |
| 231 |
|
|
my $cs = $Encode::Charset::CHARSET{G94}->{'CSpictogram_page_'.$page}; |
| 232 |
|
|
if ($cs->{ucs} <= $cc && $cc < $cs->{ucs} + $cs->{chars} * $cs->{dimension}) { |
| 233 |
|
|
return "\x1B\x24" . ([qw/_ E F G O P Q/]->[$page]) |
| 234 |
|
|
.pack ('C', 0x21 + $cc - $cs->{ucs}) . "\x0F"; |
| 235 |
|
|
} |
| 236 |
|
|
} |
| 237 |
|
|
## $c is not a pictogram |
| 238 |
|
|
$option->{fallback_from_ucs} = $C->{option}->{fallback_from_ucs_2}; |
| 239 |
|
|
$yourself->fallback_escape ($C, $c, %$option); |
| 240 |
|
|
} |
| 241 |
|
|
|
| 242 |
wakaba |
1.2 |
=back |
| 243 |
|
|
|
| 244 |
wakaba |
1.1 |
=head1 SEE ALSO |
| 245 |
|
|
|
| 246 |
|
|
JIS X 0208:1997, "7-bit and 8-bit double byte coded Kanji |
| 247 |
|
|
set for information interchange", Japan Industrial Standards |
| 248 |
|
|
Committee (JISC) <http://www.jisc.go.jp/>, 1997. |
| 249 |
|
|
|
| 250 |
|
|
JIS X 0213:2000, "7-bit and 8-bit double byte coded extended Kanji |
| 251 |
|
|
sets for information interchange", Japan Industrial Standards |
| 252 |
|
|
Committee (JISC) <http://www.jisc.go.jp/>, 2000. |
| 253 |
|
|
|
| 254 |
wakaba |
1.5 |
L<Encode::SJIS::JIS> |
| 255 |
|
|
|
| 256 |
|
|
L<Encode>, L<perlunicode> |
| 257 |
wakaba |
1.1 |
|
| 258 |
wakaba |
1.5 |
L<Encode::Charset>, L<Encode::ISO2022> |
| 259 |
wakaba |
1.1 |
|
| 260 |
|
|
=head1 LICENSE |
| 261 |
|
|
|
| 262 |
wakaba |
1.5 |
Copyright 2002 Nanashi-san <nanashi-san@nanashi.invalid> |
| 263 |
wakaba |
1.1 |
|
| 264 |
|
|
This library is free software; you can redistribute it |
| 265 |
|
|
and/or modify it under the same terms as Perl itself. |
| 266 |
|
|
|
| 267 |
|
|
=cut |
| 268 |
|
|
|
| 269 |
wakaba |
1.6 |
1; # $Date: 2002/12/16 10:25:01 $ |