1 |
wakaba |
1.1 |
|
2 |
|
|
=head1 NAME |
3 |
|
|
|
4 |
|
|
Encode::SJIS --- Shift JIS coding systems encoder and decoder |
5 |
|
|
|
6 |
|
|
=head1 ENCODINGS |
7 |
|
|
|
8 |
wakaba |
1.5 |
This module defines encoding engine for Shift JIS coding systems. |
9 |
|
|
This module only provides general en/decoding parts. Actual profiles |
10 |
|
|
for Shift JISes are included in Encode::SJIS::*. |
11 |
wakaba |
1.1 |
|
12 |
|
|
=over 4 |
13 |
|
|
|
14 |
|
|
=cut |
15 |
|
|
|
16 |
|
|
package Encode::SJIS; |
17 |
|
|
use 5.7.3; |
18 |
|
|
use strict; |
19 |
wakaba |
1.6 |
our $VERSION=do{my @r=(q$Revision: 1.5 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
20 |
wakaba |
1.1 |
require Encode::Charset; |
21 |
wakaba |
1.2 |
use base qw(Encode::Encoding); |
22 |
wakaba |
1.1 |
|
23 |
wakaba |
1.5 |
*new_object = \&Encode::Charset::new_object_sjis; |
24 |
wakaba |
1.1 |
|
25 |
wakaba |
1.5 |
## Code extention escape sequence defined by ISO/IEC 2022 is |
26 |
|
|
## not supported in this version of this module. |
27 |
wakaba |
1.1 |
|
28 |
|
|
sub sjis_to_internal ($$) { |
29 |
|
|
my ($s, $C) = @_; |
30 |
|
|
$C ||= &new_object; |
31 |
|
|
$s =~ s{ |
32 |
|
|
([\x00-\x7F\xA1-\xDF]) |
33 |
|
|
# ([\xA1-\xDF]) |
34 |
|
|
|([\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]) |
35 |
|
|
|\x1B([\x40-\x5F]) |
36 |
|
|
|([\x80-\xFF]) ## Broken or supplemental 1-byte character |
37 |
|
|
}{ |
38 |
|
|
my ($c7, $c2, $c1, $c8) = ($1, $2, $3, $4); |
39 |
|
|
if (defined $c7) { |
40 |
|
|
if ($c7 =~ /([\x21-\x7E])/) { |
41 |
|
|
chr ($C->{ $C->{GL} }->{ucs} + ord ($1) - 0x21); |
42 |
|
|
} elsif ($c7 =~ /([\x00-\x1F])/) { |
43 |
|
|
chr ($C->{ $C->{CL} }->{ucs} + ord ($1)); |
44 |
|
|
} elsif ($C->{GR} && $c7 =~ /([\xA1-\xDF])/) { |
45 |
|
|
chr ($C->{ $C->{GR} }->{ucs} + ord ($1) - 0xA1); |
46 |
|
|
} else { ## 0x20, 0x7F |
47 |
|
|
$C->{Gsmap}->{ $c7 } || $c7; |
48 |
|
|
} |
49 |
|
|
} elsif ($c2) { |
50 |
|
|
if ($c2 =~ /([\x81-\xEF])(.)/) { |
51 |
|
|
my ($f, $s) = (ord $1, ord $2); |
52 |
|
|
$f -= $f < 0xA0 ? 0x81 : 0xC1; $s -= 0x40 + ($s > 0x7F); |
53 |
|
|
chr ($C->{G1}->{ucs} + $f * 188 + $s); |
54 |
|
|
} else { ## [\xF0-\xFC]. |
55 |
wakaba |
1.6 |
my ($f, $s) = (ord substr ($c2, 0, 1), ord substr ($c2, 1, 1)); |
56 |
|
|
if ($C->{G3}->{Csjis_kuE}) { ## 94^2 set with first-byte->ku mapping |
57 |
|
|
my $F = $s > 0x9E ? $C->{G3}->{Csjis_kuE}->{ $f }: ## ku of even number |
58 |
|
|
$C->{G3}->{Csjis_kuO}->{ $f }; ## ku of odd number |
59 |
|
|
if (defined $F) { |
60 |
|
|
$s -= ($s > 0x9E ? 0x9F : $s > 0x7F ? 0x41 : 0x40); |
61 |
|
|
chr ($C->{G3}->{ucs} + $F * 94 + $s); |
62 |
|
|
} else { ## Mapping is not defined |
63 |
|
|
$f -= 0xF0; $s -= 0x40 + ($s > 0x7F); |
64 |
|
|
chr ($Encode::Charset::CHARSET{G94n}->{"\x20\x40"}->{ucs} + $f * 188 + $s); |
65 |
|
|
} |
66 |
|
|
} elsif ($C->{G3}->{Csjis_ku}) { ## n^2 set with first-byte->ku mapping |
67 |
|
|
if (defined $C->{G3}->{Csjis_ku}->{ $f }) { |
68 |
|
|
$f = $C->{G3}->{Csjis_ku}->{ $f }; |
69 |
|
|
$s -= ($s > 0x9E ? 0x9F : $s > 0x7F ? 0x41 : 0x40); |
70 |
|
|
chr ($C->{G3}->{ucs} + $f * $C->{G3}->{chars} + $s); |
71 |
|
|
} else { ## Mapping is not defined |
72 |
|
|
$f -= 0xF0; $s -= 0x40 + ($s > 0x7F); |
73 |
|
|
chr ($Encode::Charset::CHARSET{G94n}->{"\x20\x40"}->{ucs} + $f * 188 + $s); |
74 |
|
|
} |
75 |
|
|
} else { ## 94^2 set without special mapping information |
76 |
wakaba |
1.1 |
$f -= 0xF0; $s -= 0x40 + ($s > 0x7F); |
77 |
|
|
chr ($C->{G3}->{ucs} + $f * 188 + $s); |
78 |
|
|
} |
79 |
|
|
} |
80 |
|
|
} elsif ($c1) { ## ESC Fe |
81 |
|
|
chr ($C->{ $C->{ESC_Fe} }->{ucs} + ord ($c1) - 0x40); |
82 |
|
|
} else { # $C8 |
83 |
|
|
$C->{Gsmap}->{ $c8 } || $c8; |
84 |
|
|
} |
85 |
|
|
}gex; |
86 |
|
|
$s; |
87 |
|
|
} |
88 |
|
|
|
89 |
wakaba |
1.2 |
sub internal_to_sjis ($\%) { |
90 |
|
|
use integer; |
91 |
|
|
my ($s, $C) = @_; |
92 |
|
|
$C ||= &new_object; |
93 |
|
|
|
94 |
|
|
my $r = ''; |
95 |
wakaba |
1.5 |
my @c = split //, $s; |
96 |
|
|
for my $i (0..$#c) { |
97 |
|
|
my $c = $c[$i]; my $cc = ord $c; Encode::_utf8_off ($c); |
98 |
wakaba |
1.2 |
my $t; |
99 |
wakaba |
1.5 |
## CL = C0 control characters |
100 |
wakaba |
1.2 |
if ($cc <= 0x1F) { |
101 |
|
|
$t = $c if $C->{ $C->{CL} } eq $Encode::Charset::CHARSET{C0}->{"\x40"}; |
102 |
wakaba |
1.5 |
## 0x20 == SP and 0x7E == DEL |
103 |
wakaba |
1.2 |
} elsif ($cc == 0x20 || $cc == 0x7F) { |
104 |
|
|
$t = $c; |
105 |
wakaba |
1.5 |
## GL = G0 = ISO/IEC 646 graphic character set |
106 |
wakaba |
1.2 |
} elsif ($cc < 0x7F) { |
107 |
|
|
$t = $c if $C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{"\x42"}; |
108 |
wakaba |
1.5 |
## 0x80 |
109 |
wakaba |
1.2 |
} elsif ($C->{option}->{C1invoke_to_right} && $cc == 0x80) { |
110 |
wakaba |
1.3 |
$t = "\x80" |
111 |
|
|
if $C->{ $C->{CR} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'}; |
112 |
wakaba |
1.5 |
## ESC Fe = C1 control characters |
113 |
wakaba |
1.2 |
} elsif ($cc <= 0x9F) { |
114 |
wakaba |
1.3 |
$t = "\x1B".pack 'C', ($cc - 0x40) |
115 |
wakaba |
1.2 |
if $C->{ $C->{ESC_Fe} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'}; |
116 |
wakaba |
1.5 |
## G1 or G3 = 94^2 graphic character set from ISO-IR |
117 |
wakaba |
1.2 |
} elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) { |
118 |
|
|
my $c = $cc - 0xE9F6C0; my $F = chr (($c / 8836)+0x30); |
119 |
|
|
if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{ $F }) { |
120 |
|
|
my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21); |
121 |
|
|
$t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1), |
122 |
|
|
$c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)); |
123 |
|
|
} elsif ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F }) { |
124 |
|
|
my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21); |
125 |
|
|
if ($C->{G3}->{Csjis_first}) { |
126 |
wakaba |
1.5 |
my $fb = $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 }; |
127 |
|
|
$t = pack ('CC', $fb, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) if $fb; |
128 |
wakaba |
1.2 |
} else { |
129 |
|
|
$t = pack ('CC', ($c / 188) + 0xF0, |
130 |
|
|
$c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) |
131 |
|
|
if ($c / 188) + 0xF0 < 0xFD; |
132 |
|
|
} |
133 |
|
|
} |
134 |
wakaba |
1.5 |
## G1 = JIS X 0208-1990/:1997 |
135 |
wakaba |
1.2 |
} elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) { |
136 |
|
|
my $c = $cc - 0xF49D7C; |
137 |
|
|
if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{'B@'}) { |
138 |
|
|
my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21); |
139 |
|
|
$t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1), |
140 |
|
|
$c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)); |
141 |
|
|
} |
142 |
wakaba |
1.5 |
## GL = G0 = ISO/IEC 646 graphic character set / GR = G2 = JIS X 0201 Katakana set |
143 |
wakaba |
1.2 |
} elsif (0xE90940 <= $cc && $cc <= 0xE92641) { |
144 |
|
|
my $c = $cc - 0xE90940; my $F = chr (($c / 94)+0x30); |
145 |
|
|
if ($C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{ $F }) { |
146 |
wakaba |
1.3 |
$t = pack 'C', (($c % 94) + 0x21); |
147 |
wakaba |
1.2 |
} elsif ($C->{ $C->{GR} } eq $Encode::Charset::CHARSET{G94}->{ $F }) { |
148 |
wakaba |
1.3 |
$t = pack 'C', (($c % 94) + 0xA1) if ($c % 94) < 0x3F; |
149 |
wakaba |
1.2 |
} |
150 |
wakaba |
1.5 |
## G1 / G3 = 94^2 graphic character set |
151 |
wakaba |
1.2 |
} elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) { |
152 |
|
|
my $c = $cc % 0x10000; |
153 |
|
|
my $F0=$C->{option}->{private_set}->{G94n}->[($cc/0x10000)-0x7042]->[$c/8836]; |
154 |
|
|
my $F1 = 'P'.(($cc / 0x10000) - 0x7042).'_'.($c / 8836); |
155 |
|
|
if ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F0 } |
156 |
|
|
|| $C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F1 }) { |
157 |
|
|
my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21); |
158 |
|
|
if ($C->{G3}->{Csjis_first}) { |
159 |
wakaba |
1.5 |
my $fb = $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 }; |
160 |
|
|
$t = pack ('CC', $fb, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) if $fb; |
161 |
wakaba |
1.2 |
} else { |
162 |
|
|
$t = pack ('CC', ($c / 188) + 0xF0, |
163 |
|
|
$c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) |
164 |
|
|
if ($c / 188) + 0xF0 < 0xFD; |
165 |
|
|
} |
166 |
|
|
} |
167 |
wakaba |
1.6 |
## Non-ISO/IEC 2022 Coded Character Sets Mapping Area |
168 |
|
|
} elsif (0x71000000 <= $cc && $cc <= 0x71FFFFFF) { |
169 |
|
|
if ($C->{G3}->{ucs} <= $cc) { |
170 |
|
|
my $c = $cc - $C->{G3}->{ucs}; |
171 |
|
|
my $f = $C->{G3}->{Csjis_first}->{$c / $C->{G3}->{chars}}; |
172 |
|
|
if ($f) { |
173 |
|
|
my $s = $c % $C->{G3}->{chars}; |
174 |
|
|
$t = pack ('CC', $f, 0x40 + $s + ($s > 62)); |
175 |
|
|
} |
176 |
|
|
} |
177 |
wakaba |
1.5 |
## Other character sets are not supported now (and there is no plan to implement them). |
178 |
wakaba |
1.2 |
} |
179 |
|
|
|
180 |
wakaba |
1.5 |
## Output the character itself |
181 |
wakaba |
1.2 |
if (defined $t) { |
182 |
|
|
$r .= $t; |
183 |
wakaba |
1.5 |
## Output the character itself with mapping table of special code positions |
184 |
wakaba |
1.2 |
} elsif ($C->{GsmapR}->{ $c }) { |
185 |
|
|
$r .= $C->{GsmapR}->{ $c }; |
186 |
wakaba |
1.5 |
} elsif ($C->{option}->{fallback_from_ucs} =~ /quiet/) { |
187 |
|
|
return ($r, halfway => 1, converted_length => $i, |
188 |
|
|
warn => $C->{option}->{fallback_from_ucs} =~ /warn/ ? 1 : 0, |
189 |
|
|
reason => sprintf (q(U+%04X: There is no character mapped to), $cc)); |
190 |
|
|
} elsif ($C->{option}->{fallback_from_ucs} eq 'croak') { |
191 |
|
|
return ($r, halfway => 1, die => 1, |
192 |
|
|
reason => sprintf (q(U+%04X: There is no character mapped to), $cc)); |
193 |
|
|
## |
194 |
wakaba |
1.2 |
} else { |
195 |
wakaba |
1.5 |
## Try to output with fallback escape sequence (if specified) |
196 |
wakaba |
1.6 |
my $t = Encode::Charset->fallback_escape ($C, $c); |
197 |
wakaba |
1.5 |
if (defined $t) { |
198 |
|
|
my %D = (fallback => $C->{option}->{fallback_from_ucs}, reset => $C->{option}->{reset}); |
199 |
|
|
$C->{option}->{fallback_from_ucs} = 'croak'; |
200 |
|
|
eval q{$t = $C->{_encoder}->_encode_internal ($t, $C)} or undef $t; |
201 |
|
|
$C->{option}->{fallback_from_ucs} = $D{fallback}; |
202 |
|
|
} |
203 |
|
|
if (defined $t) { |
204 |
|
|
$r .= $t; |
205 |
|
|
} else { ## Replacement character specified in charset definition |
206 |
|
|
$r .= $C->{option}->{undef_char_sjis} || "\x3F"; |
207 |
|
|
} |
208 |
wakaba |
1.2 |
} |
209 |
|
|
} |
210 |
|
|
$r; |
211 |
|
|
} |
212 |
|
|
|
213 |
wakaba |
1.6 |
sub page_to_internal ($$) { |
214 |
|
|
my ($C, $s) = @_; |
215 |
|
|
$s = pack ('U*', unpack ('C*', $s)); |
216 |
|
|
$s =~ s(\x1B\x24([EFGOPQ])([\x21-\x7E]+)\x0F)( |
217 |
|
|
my $page = {qw/E 1 F 2 G 3 O 4 P 5 Q 6/}->{$1}; |
218 |
|
|
my $r = ''; |
219 |
|
|
for my $c (split //, $2) { |
220 |
|
|
$r .= chr ($Encode::Charset::CHARSET{G94}->{'CSpictogram_page_'.$page}->{ucs} + ord ($c) - 0x21); |
221 |
|
|
} |
222 |
|
|
$r; |
223 |
|
|
)gex; |
224 |
|
|
$s; |
225 |
|
|
} |
226 |
|
|
|
227 |
|
|
sub _internal_to_page ($$$%) { |
228 |
|
|
my ($yourself, $C, $c, $option) = @_; |
229 |
|
|
my $cc = ord $c; |
230 |
|
|
for my $page (1..6) { |
231 |
|
|
my $cs = $Encode::Charset::CHARSET{G94}->{'CSpictogram_page_'.$page}; |
232 |
|
|
if ($cs->{ucs} <= $cc && $cc < $cs->{ucs} + $cs->{chars} * $cs->{dimension}) { |
233 |
|
|
return "\x1B\x24" . ([qw/_ E F G O P Q/]->[$page]) |
234 |
|
|
.pack ('C', 0x21 + $cc - $cs->{ucs}) . "\x0F"; |
235 |
|
|
} |
236 |
|
|
} |
237 |
|
|
## $c is not a pictogram |
238 |
|
|
$option->{fallback_from_ucs} = $C->{option}->{fallback_from_ucs_2}; |
239 |
|
|
$yourself->fallback_escape ($C, $c, %$option); |
240 |
|
|
} |
241 |
|
|
|
242 |
wakaba |
1.2 |
=back |
243 |
|
|
|
244 |
wakaba |
1.1 |
=head1 SEE ALSO |
245 |
|
|
|
246 |
|
|
JIS X 0208:1997, "7-bit and 8-bit double byte coded Kanji |
247 |
|
|
set for information interchange", Japan Industrial Standards |
248 |
|
|
Committee (JISC) <http://www.jisc.go.jp/>, 1997. |
249 |
|
|
|
250 |
|
|
JIS X 0213:2000, "7-bit and 8-bit double byte coded extended Kanji |
251 |
|
|
sets for information interchange", Japan Industrial Standards |
252 |
|
|
Committee (JISC) <http://www.jisc.go.jp/>, 2000. |
253 |
|
|
|
254 |
wakaba |
1.5 |
L<Encode::SJIS::JIS> |
255 |
|
|
|
256 |
|
|
L<Encode>, L<perlunicode> |
257 |
wakaba |
1.1 |
|
258 |
wakaba |
1.5 |
L<Encode::Charset>, L<Encode::ISO2022> |
259 |
wakaba |
1.1 |
|
260 |
|
|
=head1 LICENSE |
261 |
|
|
|
262 |
wakaba |
1.5 |
Copyright 2002 Nanashi-san <nanashi-san@nanashi.invalid> |
263 |
wakaba |
1.1 |
|
264 |
|
|
This library is free software; you can redistribute it |
265 |
|
|
and/or modify it under the same terms as Perl itself. |
266 |
|
|
|
267 |
|
|
=cut |
268 |
|
|
|
269 |
wakaba |
1.6 |
1; # $Date: 2002/12/16 10:25:01 $ |