/[suikacvs]/perl/lib/Encode/SJIS.pm
Suika

Contents of /perl/lib/Encode/SJIS.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (show annotations) (download)
Sat Oct 12 11:03:00 2002 UTC (22 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.1: +95 -6 lines
2002-10-12  Nanashi-san

	* SJIS.pm: New module.
	* SJIS/: New directory.
	(Commited by Wakaba <w@suika.fam.cx>)

1
2 =head1 NAME
3
4 Encode::SJIS --- Shift JIS coding systems encoder and decoder
5
6 =head1 ENCODINGS
7
8 This module defines only two basic version of shift JIS.
9 Other variants are defined in Encode::SJIS::* modules.
10
11 =over 4
12
13 =cut
14
15 package Encode::SJIS;
16 use 5.7.3;
17 use strict;
18 our $VERSION=do{my @r=(q$Revision: 1.1 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
19 require Encode::Charset;
20 use base qw(Encode::Encoding);
21
22 ### --- Perl Encode module common functions
23
24 sub encode ($$;$) {
25 my ($obj, $str, $chk) = @_;
26 $_[1] = '' if $chk;
27 if (!defined $obj->{_encode_mapping} || $obj->{_encode_mapping}) {
28 require Encode::Table;
29 $str = Encode::Table::convert ($str, $obj->__encode_map,
30 -autoload => defined $obj->{_encode_mapping_autoload} ?
31 $obj->{_encode_mapping_autoload} : 1);
32 }
33 $str = &internal_to_sjis ($str, $obj->__2022_encode);
34 $str;
35 }
36
37 sub decode ($$;$) {
38 my ($obj, $str, $chk) = @_;
39 $_[1] = '' if $chk;
40 $str = &sjis_to_internal ($str, $obj->__2022_decode);
41 if (!defined $obj->{_decode_mapping} || $obj->{_decode_mapping}) {
42 require Encode::Table;
43 $str = Encode::Table::convert ($str, $obj->__decode_map,
44 -autoload => defined $obj->{_decode_mapping_autoload} ?
45 $obj->{_decode_mapping_autoload} : 1);
46 }
47 $str;
48 }
49
50 ### --- Encode::SJIS unique functions
51 *new_object = \&Encode::Charset::new_object_sjis;
52
53 sub sjis_to_internal ($$) {
54 my ($s, $C) = @_;
55 $C ||= &new_object;
56 $s =~ s{
57 ([\x00-\x7F\xA1-\xDF])
58 # ([\xA1-\xDF])
59 |([\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC])
60 |\x1B([\x40-\x5F])
61 |([\x80-\xFF]) ## Broken or supplemental 1-byte character
62 }{
63 my ($c7, $c2, $c1, $c8) = ($1, $2, $3, $4);
64 if (defined $c7) {
65 if ($c7 =~ /([\x21-\x7E])/) {
66 chr ($C->{ $C->{GL} }->{ucs} + ord ($1) - 0x21);
67 } elsif ($c7 =~ /([\x00-\x1F])/) {
68 chr ($C->{ $C->{CL} }->{ucs} + ord ($1));
69 } elsif ($C->{GR} && $c7 =~ /([\xA1-\xDF])/) {
70 chr ($C->{ $C->{GR} }->{ucs} + ord ($1) - 0xA1);
71 } else { ## 0x20, 0x7F
72 $C->{Gsmap}->{ $c7 } || $c7;
73 }
74 } elsif ($c2) {
75 if ($c2 =~ /([\x81-\xEF])(.)/) {
76 my ($f, $s) = (ord $1, ord $2);
77 $f -= $f < 0xA0 ? 0x81 : 0xC1; $s -= 0x40 + ($s > 0x7F);
78 chr ($C->{G1}->{ucs} + $f * 188 + $s);
79 } else { ## [\xF0-\xFC].
80 my ($f, $s) = unpack ('CC', $c2);
81 if ($C->{G3}->{Csjis_kuE}) {
82 $f = $s > 0x9E ? $C->{G3}->{Csjis_kuE}->{ $f }:
83 $C->{G3}->{Csjis_kuO}->{ $f };
84 $s -= ($s > 0x9E ? 0x9F : $s > 0x7F ? 0x41 : 0x40);
85 chr ($C->{G3}->{ucs} + $f * 94 + $s);
86 } else {
87 $f -= 0xF0; $s -= 0x40 + ($s > 0x7F);
88 chr ($C->{G3}->{ucs} + $f * 188 + $s);
89 }
90 }
91 } elsif ($c1) { ## ESC Fe
92 chr ($C->{ $C->{ESC_Fe} }->{ucs} + ord ($c1) - 0x40);
93 } else { # $C8
94 $C->{Gsmap}->{ $c8 } || $c8;
95 }
96 }gex;
97 $s;
98 }
99
100 sub internal_to_sjis ($\%) {
101 use integer;
102 my ($s, $C) = @_;
103 $C ||= &new_object;
104
105 my $r = '';
106 for my $c (split //, $s) {
107 my $cc = ord $c;
108 my $t;
109 if ($cc <= 0x1F) {
110 $t = $c if $C->{ $C->{CL} } eq $Encode::Charset::CHARSET{C0}->{"\x40"};
111 } elsif ($cc == 0x20 || $cc == 0x7F) {
112 $t = $c;
113 } elsif ($cc < 0x7F) {
114 $t = $c if $C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{"\x42"};
115 } elsif ($C->{option}->{C1invoke_to_right} && $cc == 0x80) {
116 $t = $c if $C->{ $C->{CR} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};
117 } elsif ($cc <= 0x9F) {
118 $t = "\x1B".chr ($cc - 0x40)
119 if $C->{ $C->{ESC_Fe} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};
120
121 } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
122 my $c = $cc - 0xE9F6C0; my $F = chr (($c / 8836)+0x30);
123 if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {
124 my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
125 $t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1),
126 $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
127 } elsif ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {
128 my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
129 if ($C->{G3}->{Csjis_first}) {
130 $t = pack ('CC', $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 },
131 $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
132 } else {
133 $t = pack ('CC', ($c / 188) + 0xF0,
134 $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))
135 if ($c / 188) + 0xF0 < 0xFD;
136 }
137 }
138 } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
139 my $c = $cc - 0xF49D7C;
140 if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{'B@'}) {
141 my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
142 $t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1),
143 $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
144 }
145
146 } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
147 my $c = $cc - 0xE90940; my $F = chr (($c / 94)+0x30);
148 if ($C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {
149 $t = chr (($c % 94) + 0x21);
150 } elsif ($C->{ $C->{GR} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {
151 $t = chr (($c % 94) + 0xA1) if ($c % 94) < 0x3F;
152 }
153 } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
154 my $c = $cc % 0x10000;
155 my $F0=$C->{option}->{private_set}->{G94n}->[($cc/0x10000)-0x7042]->[$c/8836];
156 my $F1 = 'P'.(($cc / 0x10000) - 0x7042).'_'.($c / 8836);
157 if ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F0 }
158 || $C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F1 }) {
159 my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
160 if ($C->{G3}->{Csjis_first}) {
161 $t = pack ('CC', $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 },
162 $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
163 } else {
164 $t = pack ('CC', ($c / 188) + 0xF0,
165 $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))
166 if ($c / 188) + 0xF0 < 0xFD;
167 }
168 }
169 }
170
171 if (defined $t) {
172 $r .= $t;
173 } elsif ($C->{GsmapR}->{ $c }) {
174 $r .= $C->{GsmapR}->{ $c };
175 } else {
176 $r .= $C->{option}->{undef_char_sjis} || "\x3F";
177 }
178 }
179 $r;
180 }
181
182 sub __clone ($) {
183 my $self = shift;
184 bless {%$self}, ref $self;
185 };
186
187 __PACKAGE__->Define (qw!shift_jisx0213 japanese-shift-jisx0213
188 shift-jisx0213 x-shift_jisx0213 shift-jis-3 shift-jis-2000
189 sjis shift-jis x-sjis x_sjis x-sjis-jp shiftjis x-shiftjis
190 x-shift-jis shift.jis!);
191
192 =item sjis
193
194 "Shift JIS" coding system. (Alias: shift-jis, shiftjis,
195 shift.jis, x-shiftjis, x-shift-jis, x-sjis, x_sjis,
196 x-sjis-jp)
197
198 Since this name is ambiguous (it can now refer all or any
199 of shift JIS coding system family), this name should not
200 be used to address specific coding system. In this module,
201 this is considered as an alias name to the shift JIS with
202 latest official definition, currently of JIS X 0213:2000
203 Appendix 1 (with implemention level 4).
204
205 Note that the name "Shift_JIS" is not associated with
206 this name, because IANA registry [IANAREG] assignes
207 it to a shift JIS defined by JIS X 0208:1997.
208
209 =item shift_jisx0213
210
211 Shift_JISX0213 coded representation, defined by
212 JIS X 0213:2000 Appendix 1 (implemention level 4).
213 (Alias: shift-jisx0213, x-shift_jisx0213, japanese-shift-jisx0213 (emacsen),
214 shift-jis-3 (Yudit), shift-jis-2000)
215
216 =cut
217
218 sub __2022__common ($) {
219 my $C = Encode::SJIS->new_object;
220 $C->{G0} = $Encode::Charset::CHARSET{G94}->{J}; ## JIS X 0201:1997 Latin
221 $C->{G1} = $Encode::Charset::CHARSET{G94n}->{"\x4F"}; ## JIS X 0213:2000 plane 1
222 $C->{G2} = $Encode::Charset::CHARSET{G94}->{I}; ## JIS X 0201:1997 Katakana
223 $C->{G3} = $Encode::Charset::CHARSET{G94n}->{"\x50"}; ## JIS X 0213:2000 plane 2
224 $C;
225 }
226 sub __2022_encode ($) {
227 my $C = shift->__2022__common;
228 $C;
229 }
230 sub __2022_decode ($) {
231 my $C = shift->__2022__common;
232 $C;
233 }
234 sub __encode_map ($) {
235 [qw/ucs_to_jisx0201_latin ucs_to_jisx0213_2000_1 ucs_to_jisx0213_2000_2 ucs_to_jisx0201_katakana/];
236 }
237 sub __decode_map ($) {
238 [qw/jisx0201_latin_to_ucs jisx0213_2000_1_to_ucs jisx0213_2000_2_to_ucs jisx0201_katakana_to_ucs/];
239 }
240
241 package Encode::SJIS::X0213ASCII;
242 use vars qw/@ISA/;
243 push @ISA, 'Encode::SJIS';
244 __PACKAGE__->Define (qw/shift_jisx0213-ascii shift-jis-2000-ascii
245 sjis-ascii shift-jis-ascii/);
246
247 =item sjis-ascii
248
249 Same as sjis but ASCII (ISO/IEC 646 IRV) instead of
250 JIS X 0201 Roman (or Latin) set. (Alias: shift-jis-ascii)
251
252 In spite of the history of shift JIS, ASCII is sometimes
253 used instead of JIS X 0201 Roman set, because of compatibility
254 with ASCII world.
255
256 Note that this name is now an alias of shift_jisx0213-ascii,
257 as sjis is of shift_jisx0213.
258
259 =item shift_jisx0213-ascii
260
261 Same as Shift_JISX0213 but ASCII (ISO/IEC 646 IRV)
262 instead of JIS X 0201:1997 Latin character set.
263 (Alias: shift-jis-2000-ascii)
264
265 Note that this coding system does NOT comform to
266 JIS X 0213:2000 Appendix 1.
267
268 =cut
269
270 sub __2022__common ($) {
271 my $C = shift->SUPER::__2022__common;
272 $C->{G0} = $Encode::Charset::CHARSET{G94}->{B}; ## ASCII
273 $C;
274 }
275 sub __encode_map ($) {
276 [qw/ucs_to_ascii ucs_to_jisx0213_2000_1 ucs_to_jisx0213_2000_2 ucs_to_jisx0201_katakana/];
277 }
278 sub __decode_map ($) {
279 [qw/jisx0213_2000_1_to_ucs jisx0213_2000_2_to_ucs jisx0201_katakana_to_ucs/];
280 }
281
282 1;
283 __END__
284
285 =back
286
287 =head1 SEE ALSO
288
289 JIS X 0208:1997, "7-bit and 8-bit double byte coded Kanji
290 set for information interchange", Japan Industrial Standards
291 Committee (JISC) <http://www.jisc.go.jp/>, 1997.
292
293 JIS X 0213:2000, "7-bit and 8-bit double byte coded extended Kanji
294 sets for information interchange", Japan Industrial Standards
295 Committee (JISC) <http://www.jisc.go.jp/>, 2000.
296
297 Encode, perlunicode
298
299 [IANAREG] "CHARACTER SETS", IANA <http://www.iana.org/>,
300 <http://www.iana.org/assignments/character-sets>.
301 The charset registry for IETF <http://www.ietf.org/> standards.
302 (Note that in this registry two shift JISes are registered,
303 "Shift_JIS" and "Windows-31j". Former is JIS X 0208:1997's
304 definition and later is the Windows standard character set.)
305
306 =head1 LICENSE
307
308 Copyright 2002 Nanashi-san
309
310 This library is free software; you can redistribute it
311 and/or modify it under the same terms as Perl itself.
312
313 =cut
314
315 # $Date: 2002/10/12 07:27:01 $
316 ### SJIS.pm ends here

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24