1 |
wakaba |
1.1 |
|
2 |
|
|
=head1 NAME |
3 |
|
|
|
4 |
|
|
Encode::SJIS --- Shift JIS coding systems encoder and decoder |
5 |
|
|
|
6 |
|
|
=head1 ENCODINGS |
7 |
|
|
|
8 |
wakaba |
1.5 |
This module defines encoding engine for Shift JIS coding systems. |
9 |
|
|
This module only provides general en/decoding parts. Actual profiles |
10 |
|
|
for Shift JISes are included in Encode::SJIS::*. |
11 |
wakaba |
1.1 |
|
12 |
|
|
=over 4 |
13 |
|
|
|
14 |
|
|
=cut |
15 |
|
|
|
16 |
|
|
package Encode::SJIS; |
17 |
|
|
use 5.7.3; |
18 |
|
|
use strict; |
19 |
wakaba |
1.5 |
our $VERSION=do{my @r=(q$Revision: 1.4 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
20 |
wakaba |
1.1 |
require Encode::Charset; |
21 |
wakaba |
1.2 |
use base qw(Encode::Encoding); |
22 |
wakaba |
1.1 |
|
23 |
wakaba |
1.5 |
*new_object = \&Encode::Charset::new_object_sjis; |
24 |
wakaba |
1.1 |
|
25 |
wakaba |
1.5 |
## Code extention escape sequence defined by ISO/IEC 2022 is |
26 |
|
|
## not supported in this version of this module. |
27 |
wakaba |
1.1 |
|
28 |
|
|
sub sjis_to_internal ($$) { |
29 |
|
|
my ($s, $C) = @_; |
30 |
|
|
$C ||= &new_object; |
31 |
|
|
$s =~ s{ |
32 |
|
|
([\x00-\x7F\xA1-\xDF]) |
33 |
|
|
# ([\xA1-\xDF]) |
34 |
|
|
|([\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]) |
35 |
|
|
|\x1B([\x40-\x5F]) |
36 |
|
|
|([\x80-\xFF]) ## Broken or supplemental 1-byte character |
37 |
|
|
}{ |
38 |
|
|
my ($c7, $c2, $c1, $c8) = ($1, $2, $3, $4); |
39 |
|
|
if (defined $c7) { |
40 |
|
|
if ($c7 =~ /([\x21-\x7E])/) { |
41 |
|
|
chr ($C->{ $C->{GL} }->{ucs} + ord ($1) - 0x21); |
42 |
|
|
} elsif ($c7 =~ /([\x00-\x1F])/) { |
43 |
|
|
chr ($C->{ $C->{CL} }->{ucs} + ord ($1)); |
44 |
|
|
} elsif ($C->{GR} && $c7 =~ /([\xA1-\xDF])/) { |
45 |
|
|
chr ($C->{ $C->{GR} }->{ucs} + ord ($1) - 0xA1); |
46 |
|
|
} else { ## 0x20, 0x7F |
47 |
|
|
$C->{Gsmap}->{ $c7 } || $c7; |
48 |
|
|
} |
49 |
|
|
} elsif ($c2) { |
50 |
|
|
if ($c2 =~ /([\x81-\xEF])(.)/) { |
51 |
|
|
my ($f, $s) = (ord $1, ord $2); |
52 |
|
|
$f -= $f < 0xA0 ? 0x81 : 0xC1; $s -= 0x40 + ($s > 0x7F); |
53 |
|
|
chr ($C->{G1}->{ucs} + $f * 188 + $s); |
54 |
|
|
} else { ## [\xF0-\xFC]. |
55 |
|
|
my ($f, $s) = unpack ('CC', $c2); |
56 |
|
|
if ($C->{G3}->{Csjis_kuE}) { |
57 |
|
|
$f = $s > 0x9E ? $C->{G3}->{Csjis_kuE}->{ $f }: |
58 |
|
|
$C->{G3}->{Csjis_kuO}->{ $f }; |
59 |
|
|
$s -= ($s > 0x9E ? 0x9F : $s > 0x7F ? 0x41 : 0x40); |
60 |
|
|
chr ($C->{G3}->{ucs} + $f * 94 + $s); |
61 |
|
|
} else { |
62 |
|
|
$f -= 0xF0; $s -= 0x40 + ($s > 0x7F); |
63 |
|
|
chr ($C->{G3}->{ucs} + $f * 188 + $s); |
64 |
|
|
} |
65 |
|
|
} |
66 |
|
|
} elsif ($c1) { ## ESC Fe |
67 |
|
|
chr ($C->{ $C->{ESC_Fe} }->{ucs} + ord ($c1) - 0x40); |
68 |
|
|
} else { # $C8 |
69 |
|
|
$C->{Gsmap}->{ $c8 } || $c8; |
70 |
|
|
} |
71 |
|
|
}gex; |
72 |
|
|
$s; |
73 |
|
|
} |
74 |
|
|
|
75 |
wakaba |
1.2 |
sub internal_to_sjis ($\%) { |
76 |
|
|
use integer; |
77 |
|
|
my ($s, $C) = @_; |
78 |
|
|
$C ||= &new_object; |
79 |
|
|
|
80 |
|
|
my $r = ''; |
81 |
wakaba |
1.5 |
my @c = split //, $s; |
82 |
|
|
for my $i (0..$#c) { |
83 |
|
|
my $c = $c[$i]; my $cc = ord $c; Encode::_utf8_off ($c); |
84 |
wakaba |
1.2 |
my $t; |
85 |
wakaba |
1.5 |
## CL = C0 control characters |
86 |
wakaba |
1.2 |
if ($cc <= 0x1F) { |
87 |
|
|
$t = $c if $C->{ $C->{CL} } eq $Encode::Charset::CHARSET{C0}->{"\x40"}; |
88 |
wakaba |
1.5 |
## 0x20 == SP and 0x7E == DEL |
89 |
wakaba |
1.2 |
} elsif ($cc == 0x20 || $cc == 0x7F) { |
90 |
|
|
$t = $c; |
91 |
wakaba |
1.5 |
## GL = G0 = ISO/IEC 646 graphic character set |
92 |
wakaba |
1.2 |
} elsif ($cc < 0x7F) { |
93 |
|
|
$t = $c if $C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{"\x42"}; |
94 |
wakaba |
1.5 |
## 0x80 |
95 |
wakaba |
1.2 |
} elsif ($C->{option}->{C1invoke_to_right} && $cc == 0x80) { |
96 |
wakaba |
1.3 |
$t = "\x80" |
97 |
|
|
if $C->{ $C->{CR} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'}; |
98 |
wakaba |
1.5 |
## ESC Fe = C1 control characters |
99 |
wakaba |
1.2 |
} elsif ($cc <= 0x9F) { |
100 |
wakaba |
1.3 |
$t = "\x1B".pack 'C', ($cc - 0x40) |
101 |
wakaba |
1.2 |
if $C->{ $C->{ESC_Fe} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'}; |
102 |
wakaba |
1.5 |
## G1 or G3 = 94^2 graphic character set from ISO-IR |
103 |
wakaba |
1.2 |
} elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) { |
104 |
|
|
my $c = $cc - 0xE9F6C0; my $F = chr (($c / 8836)+0x30); |
105 |
|
|
if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{ $F }) { |
106 |
|
|
my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21); |
107 |
|
|
$t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1), |
108 |
|
|
$c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)); |
109 |
|
|
} elsif ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F }) { |
110 |
|
|
my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21); |
111 |
|
|
if ($C->{G3}->{Csjis_first}) { |
112 |
wakaba |
1.5 |
my $fb = $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 }; |
113 |
|
|
$t = pack ('CC', $fb, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) if $fb; |
114 |
wakaba |
1.2 |
} else { |
115 |
|
|
$t = pack ('CC', ($c / 188) + 0xF0, |
116 |
|
|
$c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) |
117 |
|
|
if ($c / 188) + 0xF0 < 0xFD; |
118 |
|
|
} |
119 |
|
|
} |
120 |
wakaba |
1.5 |
## G1 = JIS X 0208-1990/:1997 |
121 |
wakaba |
1.2 |
} elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) { |
122 |
|
|
my $c = $cc - 0xF49D7C; |
123 |
|
|
if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{'B@'}) { |
124 |
|
|
my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21); |
125 |
|
|
$t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1), |
126 |
|
|
$c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)); |
127 |
|
|
} |
128 |
wakaba |
1.5 |
## GL = G0 = ISO/IEC 646 graphic character set / GR = G2 = JIS X 0201 Katakana set |
129 |
wakaba |
1.2 |
} elsif (0xE90940 <= $cc && $cc <= 0xE92641) { |
130 |
|
|
my $c = $cc - 0xE90940; my $F = chr (($c / 94)+0x30); |
131 |
|
|
if ($C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{ $F }) { |
132 |
wakaba |
1.3 |
$t = pack 'C', (($c % 94) + 0x21); |
133 |
wakaba |
1.2 |
} elsif ($C->{ $C->{GR} } eq $Encode::Charset::CHARSET{G94}->{ $F }) { |
134 |
wakaba |
1.3 |
$t = pack 'C', (($c % 94) + 0xA1) if ($c % 94) < 0x3F; |
135 |
wakaba |
1.2 |
} |
136 |
wakaba |
1.5 |
## G1 / G3 = 94^2 graphic character set |
137 |
wakaba |
1.2 |
} elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) { |
138 |
|
|
my $c = $cc % 0x10000; |
139 |
|
|
my $F0=$C->{option}->{private_set}->{G94n}->[($cc/0x10000)-0x7042]->[$c/8836]; |
140 |
|
|
my $F1 = 'P'.(($cc / 0x10000) - 0x7042).'_'.($c / 8836); |
141 |
|
|
if ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F0 } |
142 |
|
|
|| $C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F1 }) { |
143 |
|
|
my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21); |
144 |
|
|
if ($C->{G3}->{Csjis_first}) { |
145 |
wakaba |
1.5 |
my $fb = $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 }; |
146 |
|
|
$t = pack ('CC', $fb, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) if $fb; |
147 |
wakaba |
1.2 |
} else { |
148 |
|
|
$t = pack ('CC', ($c / 188) + 0xF0, |
149 |
|
|
$c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) |
150 |
|
|
if ($c / 188) + 0xF0 < 0xFD; |
151 |
|
|
} |
152 |
|
|
} |
153 |
wakaba |
1.5 |
## Other character sets are not supported now (and there is no plan to implement them). |
154 |
wakaba |
1.2 |
} |
155 |
|
|
|
156 |
wakaba |
1.5 |
## Output the character itself |
157 |
wakaba |
1.2 |
if (defined $t) { |
158 |
|
|
$r .= $t; |
159 |
wakaba |
1.5 |
## Output the character itself with mapping table of special code positions |
160 |
wakaba |
1.2 |
} elsif ($C->{GsmapR}->{ $c }) { |
161 |
|
|
$r .= $C->{GsmapR}->{ $c }; |
162 |
wakaba |
1.5 |
} elsif ($C->{option}->{fallback_from_ucs} =~ /quiet/) { |
163 |
|
|
return ($r, halfway => 1, converted_length => $i, |
164 |
|
|
warn => $C->{option}->{fallback_from_ucs} =~ /warn/ ? 1 : 0, |
165 |
|
|
reason => sprintf (q(U+%04X: There is no character mapped to), $cc)); |
166 |
|
|
} elsif ($C->{option}->{fallback_from_ucs} eq 'croak') { |
167 |
|
|
return ($r, halfway => 1, die => 1, |
168 |
|
|
reason => sprintf (q(U+%04X: There is no character mapped to), $cc)); |
169 |
|
|
## |
170 |
wakaba |
1.2 |
} else { |
171 |
wakaba |
1.5 |
## Try to output with fallback escape sequence (if specified) |
172 |
|
|
my $t = Encode::Charset::fallback_escape ($C, $c); |
173 |
|
|
if (defined $t) { |
174 |
|
|
my %D = (fallback => $C->{option}->{fallback_from_ucs}, reset => $C->{option}->{reset}); |
175 |
|
|
$C->{option}->{fallback_from_ucs} = 'croak'; |
176 |
|
|
$C->{option}->{reset} = {Gdesignation => 0, Ginvoke => 0}; |
177 |
|
|
eval q{$t = $C->{_encoder}->_encode_internal ($t, $C)} or undef $t; |
178 |
|
|
$C->{option}->{fallback_from_ucs} = $D{fallback}; |
179 |
|
|
$C->{option}->{reset} = $D{reset}; |
180 |
|
|
} |
181 |
|
|
if (defined $t) { |
182 |
|
|
$r .= $t; |
183 |
|
|
} else { ## Replacement character specified in charset definition |
184 |
|
|
$r .= $C->{option}->{undef_char_sjis} || "\x3F"; |
185 |
|
|
} |
186 |
wakaba |
1.2 |
} |
187 |
|
|
} |
188 |
|
|
$r; |
189 |
|
|
} |
190 |
|
|
|
191 |
|
|
=back |
192 |
|
|
|
193 |
wakaba |
1.1 |
=head1 SEE ALSO |
194 |
|
|
|
195 |
|
|
JIS X 0208:1997, "7-bit and 8-bit double byte coded Kanji |
196 |
|
|
set for information interchange", Japan Industrial Standards |
197 |
|
|
Committee (JISC) <http://www.jisc.go.jp/>, 1997. |
198 |
|
|
|
199 |
|
|
JIS X 0213:2000, "7-bit and 8-bit double byte coded extended Kanji |
200 |
|
|
sets for information interchange", Japan Industrial Standards |
201 |
|
|
Committee (JISC) <http://www.jisc.go.jp/>, 2000. |
202 |
|
|
|
203 |
wakaba |
1.5 |
L<Encode::SJIS::JIS> |
204 |
|
|
|
205 |
|
|
L<Encode>, L<perlunicode> |
206 |
wakaba |
1.1 |
|
207 |
wakaba |
1.5 |
L<Encode::Charset>, L<Encode::ISO2022> |
208 |
wakaba |
1.1 |
|
209 |
|
|
=head1 LICENSE |
210 |
|
|
|
211 |
wakaba |
1.5 |
Copyright 2002 Nanashi-san <nanashi-san@nanashi.invalid> |
212 |
wakaba |
1.1 |
|
213 |
|
|
This library is free software; you can redistribute it |
214 |
|
|
and/or modify it under the same terms as Perl itself. |
215 |
|
|
|
216 |
|
|
=cut |
217 |
|
|
|
218 |
wakaba |
1.5 |
1; # $Date: 2002/12/12 08:17:16 $ |