/[suikacvs]/perl/lib/Encode/SJIS.pm
Suika

Contents of /perl/lib/Encode/SJIS.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (show annotations) (download)
Mon Oct 14 06:58:35 2002 UTC (22 years ago) by wakaba
Branch: MAIN
Changes since 1.2: +11 -8 lines
2002-10-14  Nanashi-san

	* ISO2022.pm, SJIS.pm: Bug fix of utf8 flag control.
	(Committed by Wakaba <w@suika.fam.cx>.)

1
2 =head1 NAME
3
4 Encode::SJIS --- Shift JIS coding systems encoder and decoder
5
6 =head1 ENCODINGS
7
8 This module defines only two basic version of shift JIS.
9 Other variants are defined in Encode::SJIS::* modules.
10
11 =over 4
12
13 =cut
14
15 package Encode::SJIS;
16 use 5.7.3;
17 use strict;
18 our $VERSION=do{my @r=(q$Revision: 1.2 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
19 require Encode::Charset;
20 use base qw(Encode::Encoding);
21
22 ### --- Perl Encode module common functions
23
24 sub encode ($$;$) {
25 my ($obj, $str, $chk) = @_;
26 $_[1] = '' if $chk;
27 if (!defined $obj->{_encode_mapping} || $obj->{_encode_mapping}) {
28 require Encode::Table;
29 $str = Encode::Table::convert ($str, $obj->__encode_map,
30 -autoload => defined $obj->{_encode_mapping_autoload} ?
31 $obj->{_encode_mapping_autoload} : 1);
32 }
33 $str = &internal_to_sjis ($str, $obj->__2022_encode);
34 $str;
35 }
36
37 sub decode ($$;$) {
38 my ($obj, $str, $chk) = @_;
39 $_[1] = '' if $chk;
40 $str = &sjis_to_internal ($str, $obj->__2022_decode);
41 if (!defined $obj->{_decode_mapping} || $obj->{_decode_mapping}) {
42 require Encode::Table;
43 $str = Encode::Table::convert ($str, $obj->__decode_map,
44 -autoload => defined $obj->{_decode_mapping_autoload} ?
45 $obj->{_decode_mapping_autoload} : 1);
46 }
47 $str;
48 }
49
50 ### --- Encode::SJIS unique functions
51 *new_object = \&Encode::Charset::new_object_sjis;
52
53 sub sjis_to_internal ($$) {
54 my ($s, $C) = @_;
55 $C ||= &new_object;
56 $s =~ s{
57 ([\x00-\x7F\xA1-\xDF])
58 # ([\xA1-\xDF])
59 |([\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC])
60 |\x1B([\x40-\x5F])
61 |([\x80-\xFF]) ## Broken or supplemental 1-byte character
62 }{
63 my ($c7, $c2, $c1, $c8) = ($1, $2, $3, $4);
64 if (defined $c7) {
65 if ($c7 =~ /([\x21-\x7E])/) {
66 chr ($C->{ $C->{GL} }->{ucs} + ord ($1) - 0x21);
67 } elsif ($c7 =~ /([\x00-\x1F])/) {
68 chr ($C->{ $C->{CL} }->{ucs} + ord ($1));
69 } elsif ($C->{GR} && $c7 =~ /([\xA1-\xDF])/) {
70 chr ($C->{ $C->{GR} }->{ucs} + ord ($1) - 0xA1);
71 } else { ## 0x20, 0x7F
72 $C->{Gsmap}->{ $c7 } || $c7;
73 }
74 } elsif ($c2) {
75 if ($c2 =~ /([\x81-\xEF])(.)/) {
76 my ($f, $s) = (ord $1, ord $2);
77 $f -= $f < 0xA0 ? 0x81 : 0xC1; $s -= 0x40 + ($s > 0x7F);
78 chr ($C->{G1}->{ucs} + $f * 188 + $s);
79 } else { ## [\xF0-\xFC].
80 my ($f, $s) = unpack ('CC', $c2);
81 if ($C->{G3}->{Csjis_kuE}) {
82 $f = $s > 0x9E ? $C->{G3}->{Csjis_kuE}->{ $f }:
83 $C->{G3}->{Csjis_kuO}->{ $f };
84 $s -= ($s > 0x9E ? 0x9F : $s > 0x7F ? 0x41 : 0x40);
85 chr ($C->{G3}->{ucs} + $f * 94 + $s);
86 } else {
87 $f -= 0xF0; $s -= 0x40 + ($s > 0x7F);
88 chr ($C->{G3}->{ucs} + $f * 188 + $s);
89 }
90 }
91 } elsif ($c1) { ## ESC Fe
92 chr ($C->{ $C->{ESC_Fe} }->{ucs} + ord ($c1) - 0x40);
93 } else { # $C8
94 $C->{Gsmap}->{ $c8 } || $c8;
95 }
96 }gex;
97 $s;
98 }
99
100 sub internal_to_sjis ($\%) {
101 use integer;
102 my ($s, $C) = @_;
103 $C ||= &new_object;
104
105 my $r = '';
106 for my $c (split //, $s) {
107 my $cc = ord $c;
108 my $t;
109 if ($cc <= 0x1F) {
110 $t = $c if $C->{ $C->{CL} } eq $Encode::Charset::CHARSET{C0}->{"\x40"};
111 } elsif ($cc == 0x20 || $cc == 0x7F) {
112 Encode::_utf8_off ($c);
113 $t = $c;
114 } elsif ($cc < 0x7F) {
115 Encode::_utf8_off ($c);
116 $t = $c if $C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{"\x42"};
117 } elsif ($C->{option}->{C1invoke_to_right} && $cc == 0x80) {
118 $t = "\x80"
119 if $C->{ $C->{CR} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};
120 } elsif ($cc <= 0x9F) {
121 $t = "\x1B".pack 'C', ($cc - 0x40)
122 if $C->{ $C->{ESC_Fe} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};
123
124 } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
125 my $c = $cc - 0xE9F6C0; my $F = chr (($c / 8836)+0x30);
126 if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {
127 my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
128 $t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1),
129 $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
130 } elsif ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {
131 my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
132 if ($C->{G3}->{Csjis_first}) {
133 $t = pack ('CC', $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 },
134 $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
135 } else {
136 $t = pack ('CC', ($c / 188) + 0xF0,
137 $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))
138 if ($c / 188) + 0xF0 < 0xFD;
139 }
140 }
141 } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
142 my $c = $cc - 0xF49D7C;
143 if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{'B@'}) {
144 my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
145 $t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1),
146 $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
147 }
148
149 } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
150 my $c = $cc - 0xE90940; my $F = chr (($c / 94)+0x30);
151 if ($C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {
152 $t = pack 'C', (($c % 94) + 0x21);
153 } elsif ($C->{ $C->{GR} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {
154 $t = pack 'C', (($c % 94) + 0xA1) if ($c % 94) < 0x3F;
155 }
156 } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
157 my $c = $cc % 0x10000;
158 my $F0=$C->{option}->{private_set}->{G94n}->[($cc/0x10000)-0x7042]->[$c/8836];
159 my $F1 = 'P'.(($cc / 0x10000) - 0x7042).'_'.($c / 8836);
160 if ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F0 }
161 || $C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F1 }) {
162 my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
163 if ($C->{G3}->{Csjis_first}) {
164 $t = pack ('CC', $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 },
165 $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
166 } else {
167 $t = pack ('CC', ($c / 188) + 0xF0,
168 $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))
169 if ($c / 188) + 0xF0 < 0xFD;
170 }
171 }
172 }
173
174 if (defined $t) {
175 $r .= $t;
176 } elsif ($C->{GsmapR}->{ $c }) {
177 $r .= $C->{GsmapR}->{ $c };
178 } else {
179 $r .= $C->{option}->{undef_char_sjis} || "\x3F";
180 }
181 }
182 $r;
183 }
184
185 sub __clone ($) {
186 my $self = shift;
187 bless {%$self}, ref $self;
188 };
189
190 __PACKAGE__->Define (qw!shift_jisx0213 japanese-shift-jisx0213
191 shift-jisx0213 x-shift_jisx0213 shift-jis-3 shift-jis-2000
192 sjis s-jis shift-jis x-sjis x_sjis x-sjis-jp shiftjis x-shiftjis
193 x-shift-jis shift.jis!);
194
195 =item sjis
196
197 "Shift JIS" coding system. (Alias: shift-jis, shiftjis,
198 shift.jis, x-shiftjis, x-shift-jis, s-jis, x-sjis, x_sjis,
199 x-sjis-jp)
200
201 Since this name is ambiguous (it can now refer all or any
202 of shift JIS coding system family), this name should not
203 be used to address specific coding system. In this module,
204 this is considered as an alias name to the shift JIS with
205 latest official definition, currently of JIS X 0213:2000
206 Appendix 1 (with implemention level 4).
207
208 Note that the name "Shift_JIS" is not associated with
209 this name, because IANA registry [IANAREG] assignes
210 it to a shift JIS defined by JIS X 0208:1997.
211
212 =item shift_jisx0213
213
214 Shift_JISX0213 coded representation, defined by
215 JIS X 0213:2000 Appendix 1 (implemention level 4).
216 (Alias: shift-jisx0213, x-shift_jisx0213, japanese-shift-jisx0213 (emacsen),
217 shift-jis-3 (Yudit), shift-jis-2000)
218
219 =cut
220
221 sub __2022__common ($) {
222 my $C = Encode::SJIS->new_object;
223 $C->{G0} = $Encode::Charset::CHARSET{G94}->{J}; ## JIS X 0201:1997 Latin
224 $C->{G1} = $Encode::Charset::CHARSET{G94n}->{"\x4F"}; ## JIS X 0213:2000 plane 1
225 $C->{G2} = $Encode::Charset::CHARSET{G94}->{I}; ## JIS X 0201:1997 Katakana
226 $C->{G3} = $Encode::Charset::CHARSET{G94n}->{"\x50"}; ## JIS X 0213:2000 plane 2
227 $C;
228 }
229 sub __2022_encode ($) {
230 my $C = shift->__2022__common;
231 $C;
232 }
233 sub __2022_decode ($) {
234 my $C = shift->__2022__common;
235 $C;
236 }
237 sub __encode_map ($) {
238 [qw/ucs_to_jisx0201_latin ucs_to_jisx0213_2000_1 ucs_to_jisx0213_2000_2 ucs_to_jisx0201_katakana/];
239 }
240 sub __decode_map ($) {
241 [qw/jisx0201_latin_to_ucs jisx0213_2000_1_to_ucs jisx0213_2000_2_to_ucs jisx0201_katakana_to_ucs/];
242 }
243
244 package Encode::SJIS::X0213ASCII;
245 use vars qw/@ISA/;
246 push @ISA, 'Encode::SJIS';
247 __PACKAGE__->Define (qw/shift_jisx0213-ascii shift-jis-2000-ascii
248 sjis-ascii shift-jis-ascii/);
249
250 =item sjis-ascii
251
252 Same as sjis but ASCII (ISO/IEC 646 IRV) instead of
253 JIS X 0201 Roman (or Latin) set. (Alias: shift-jis-ascii)
254
255 In spite of the history of shift JIS, ASCII is sometimes
256 used instead of JIS X 0201 Roman set, because of compatibility
257 with ASCII world.
258
259 Note that this name is now an alias of shift_jisx0213-ascii,
260 as sjis is of shift_jisx0213.
261
262 =item shift_jisx0213-ascii
263
264 Same as Shift_JISX0213 but ASCII (ISO/IEC 646 IRV)
265 instead of JIS X 0201:1997 Latin character set.
266 (Alias: shift-jis-2000-ascii)
267
268 Note that this coding system does NOT comform to
269 JIS X 0213:2000 Appendix 1.
270
271 =cut
272
273 sub __2022__common ($) {
274 my $C = shift->SUPER::__2022__common;
275 $C->{G0} = $Encode::Charset::CHARSET{G94}->{B}; ## ASCII
276 $C;
277 }
278 sub __encode_map ($) {
279 [qw/ucs_to_ascii ucs_to_jisx0213_2000_1 ucs_to_jisx0213_2000_2 ucs_to_jisx0201_katakana/];
280 }
281 sub __decode_map ($) {
282 [qw/jisx0213_2000_1_to_ucs jisx0213_2000_2_to_ucs jisx0201_katakana_to_ucs/];
283 }
284
285 1;
286 __END__
287
288 =back
289
290 =head1 SEE ALSO
291
292 JIS X 0208:1997, "7-bit and 8-bit double byte coded Kanji
293 set for information interchange", Japan Industrial Standards
294 Committee (JISC) <http://www.jisc.go.jp/>, 1997.
295
296 JIS X 0213:2000, "7-bit and 8-bit double byte coded extended Kanji
297 sets for information interchange", Japan Industrial Standards
298 Committee (JISC) <http://www.jisc.go.jp/>, 2000.
299
300 Encode, perlunicode
301
302 [IANAREG] "CHARACTER SETS", IANA <http://www.iana.org/>,
303 <http://www.iana.org/assignments/character-sets>.
304 The charset registry for IETF <http://www.ietf.org/> standards.
305 (Note that in this registry two shift JISes are registered,
306 "Shift_JIS" and "Windows-31j". Former is JIS X 0208:1997's
307 definition and later is the Windows standard character set.)
308
309 =head1 LICENSE
310
311 Copyright 2002 Nanashi-san
312
313 This library is free software; you can redistribute it
314 and/or modify it under the same terms as Perl itself.
315
316 =cut
317
318 # $Date: 2002/10/12 11:03:00 $
319 ### SJIS.pm ends here

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24