/[suikacvs]/perl/lib/Encode/SJIS.pm
Suika

Contents of /perl/lib/Encode/SJIS.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (hide annotations) (download)
Mon Oct 14 06:58:35 2002 UTC (22 years ago) by wakaba
Branch: MAIN
Changes since 1.2: +11 -8 lines
2002-10-14  Nanashi-san

	* ISO2022.pm, SJIS.pm: Bug fix of utf8 flag control.
	(Committed by Wakaba <w@suika.fam.cx>.)

1 wakaba 1.1
2     =head1 NAME
3    
4     Encode::SJIS --- Shift JIS coding systems encoder and decoder
5    
6     =head1 ENCODINGS
7    
8     This module defines only two basic version of shift JIS.
9     Other variants are defined in Encode::SJIS::* modules.
10    
11     =over 4
12    
13     =cut
14    
15     package Encode::SJIS;
16     use 5.7.3;
17     use strict;
18 wakaba 1.3 our $VERSION=do{my @r=(q$Revision: 1.2 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
19 wakaba 1.1 require Encode::Charset;
20 wakaba 1.2 use base qw(Encode::Encoding);
21 wakaba 1.1
22     ### --- Perl Encode module common functions
23    
24     sub encode ($$;$) {
25     my ($obj, $str, $chk) = @_;
26     $_[1] = '' if $chk;
27     if (!defined $obj->{_encode_mapping} || $obj->{_encode_mapping}) {
28     require Encode::Table;
29     $str = Encode::Table::convert ($str, $obj->__encode_map,
30     -autoload => defined $obj->{_encode_mapping_autoload} ?
31     $obj->{_encode_mapping_autoload} : 1);
32     }
33     $str = &internal_to_sjis ($str, $obj->__2022_encode);
34     $str;
35     }
36    
37     sub decode ($$;$) {
38     my ($obj, $str, $chk) = @_;
39     $_[1] = '' if $chk;
40     $str = &sjis_to_internal ($str, $obj->__2022_decode);
41     if (!defined $obj->{_decode_mapping} || $obj->{_decode_mapping}) {
42     require Encode::Table;
43     $str = Encode::Table::convert ($str, $obj->__decode_map,
44     -autoload => defined $obj->{_decode_mapping_autoload} ?
45     $obj->{_decode_mapping_autoload} : 1);
46     }
47     $str;
48     }
49    
50     ### --- Encode::SJIS unique functions
51     *new_object = \&Encode::Charset::new_object_sjis;
52    
53     sub sjis_to_internal ($$) {
54     my ($s, $C) = @_;
55     $C ||= &new_object;
56     $s =~ s{
57     ([\x00-\x7F\xA1-\xDF])
58     # ([\xA1-\xDF])
59     |([\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC])
60     |\x1B([\x40-\x5F])
61     |([\x80-\xFF]) ## Broken or supplemental 1-byte character
62     }{
63     my ($c7, $c2, $c1, $c8) = ($1, $2, $3, $4);
64     if (defined $c7) {
65     if ($c7 =~ /([\x21-\x7E])/) {
66     chr ($C->{ $C->{GL} }->{ucs} + ord ($1) - 0x21);
67     } elsif ($c7 =~ /([\x00-\x1F])/) {
68     chr ($C->{ $C->{CL} }->{ucs} + ord ($1));
69     } elsif ($C->{GR} && $c7 =~ /([\xA1-\xDF])/) {
70     chr ($C->{ $C->{GR} }->{ucs} + ord ($1) - 0xA1);
71     } else { ## 0x20, 0x7F
72     $C->{Gsmap}->{ $c7 } || $c7;
73     }
74     } elsif ($c2) {
75     if ($c2 =~ /([\x81-\xEF])(.)/) {
76     my ($f, $s) = (ord $1, ord $2);
77     $f -= $f < 0xA0 ? 0x81 : 0xC1; $s -= 0x40 + ($s > 0x7F);
78     chr ($C->{G1}->{ucs} + $f * 188 + $s);
79     } else { ## [\xF0-\xFC].
80     my ($f, $s) = unpack ('CC', $c2);
81     if ($C->{G3}->{Csjis_kuE}) {
82     $f = $s > 0x9E ? $C->{G3}->{Csjis_kuE}->{ $f }:
83     $C->{G3}->{Csjis_kuO}->{ $f };
84     $s -= ($s > 0x9E ? 0x9F : $s > 0x7F ? 0x41 : 0x40);
85     chr ($C->{G3}->{ucs} + $f * 94 + $s);
86     } else {
87     $f -= 0xF0; $s -= 0x40 + ($s > 0x7F);
88     chr ($C->{G3}->{ucs} + $f * 188 + $s);
89     }
90     }
91     } elsif ($c1) { ## ESC Fe
92     chr ($C->{ $C->{ESC_Fe} }->{ucs} + ord ($c1) - 0x40);
93     } else { # $C8
94     $C->{Gsmap}->{ $c8 } || $c8;
95     }
96     }gex;
97     $s;
98     }
99    
100 wakaba 1.2 sub internal_to_sjis ($\%) {
101     use integer;
102     my ($s, $C) = @_;
103     $C ||= &new_object;
104    
105     my $r = '';
106     for my $c (split //, $s) {
107     my $cc = ord $c;
108     my $t;
109     if ($cc <= 0x1F) {
110     $t = $c if $C->{ $C->{CL} } eq $Encode::Charset::CHARSET{C0}->{"\x40"};
111     } elsif ($cc == 0x20 || $cc == 0x7F) {
112 wakaba 1.3 Encode::_utf8_off ($c);
113 wakaba 1.2 $t = $c;
114     } elsif ($cc < 0x7F) {
115 wakaba 1.3 Encode::_utf8_off ($c);
116 wakaba 1.2 $t = $c if $C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{"\x42"};
117     } elsif ($C->{option}->{C1invoke_to_right} && $cc == 0x80) {
118 wakaba 1.3 $t = "\x80"
119     if $C->{ $C->{CR} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};
120 wakaba 1.2 } elsif ($cc <= 0x9F) {
121 wakaba 1.3 $t = "\x1B".pack 'C', ($cc - 0x40)
122 wakaba 1.2 if $C->{ $C->{ESC_Fe} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};
123    
124     } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
125     my $c = $cc - 0xE9F6C0; my $F = chr (($c / 8836)+0x30);
126     if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {
127     my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
128     $t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1),
129     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
130     } elsif ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {
131     my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
132     if ($C->{G3}->{Csjis_first}) {
133     $t = pack ('CC', $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 },
134     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
135     } else {
136     $t = pack ('CC', ($c / 188) + 0xF0,
137     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))
138     if ($c / 188) + 0xF0 < 0xFD;
139     }
140     }
141     } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
142     my $c = $cc - 0xF49D7C;
143     if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{'B@'}) {
144     my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
145     $t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1),
146     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
147     }
148    
149     } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
150     my $c = $cc - 0xE90940; my $F = chr (($c / 94)+0x30);
151     if ($C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {
152 wakaba 1.3 $t = pack 'C', (($c % 94) + 0x21);
153 wakaba 1.2 } elsif ($C->{ $C->{GR} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {
154 wakaba 1.3 $t = pack 'C', (($c % 94) + 0xA1) if ($c % 94) < 0x3F;
155 wakaba 1.2 }
156     } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
157     my $c = $cc % 0x10000;
158     my $F0=$C->{option}->{private_set}->{G94n}->[($cc/0x10000)-0x7042]->[$c/8836];
159     my $F1 = 'P'.(($cc / 0x10000) - 0x7042).'_'.($c / 8836);
160     if ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F0 }
161     || $C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F1 }) {
162     my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
163     if ($C->{G3}->{Csjis_first}) {
164     $t = pack ('CC', $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 },
165     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
166     } else {
167     $t = pack ('CC', ($c / 188) + 0xF0,
168     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))
169     if ($c / 188) + 0xF0 < 0xFD;
170     }
171     }
172     }
173    
174     if (defined $t) {
175     $r .= $t;
176     } elsif ($C->{GsmapR}->{ $c }) {
177     $r .= $C->{GsmapR}->{ $c };
178     } else {
179     $r .= $C->{option}->{undef_char_sjis} || "\x3F";
180     }
181     }
182     $r;
183     }
184    
185 wakaba 1.1 sub __clone ($) {
186     my $self = shift;
187     bless {%$self}, ref $self;
188     };
189    
190     __PACKAGE__->Define (qw!shift_jisx0213 japanese-shift-jisx0213
191 wakaba 1.2 shift-jisx0213 x-shift_jisx0213 shift-jis-3 shift-jis-2000
192 wakaba 1.3 sjis s-jis shift-jis x-sjis x_sjis x-sjis-jp shiftjis x-shiftjis
193 wakaba 1.1 x-shift-jis shift.jis!);
194    
195     =item sjis
196    
197     "Shift JIS" coding system. (Alias: shift-jis, shiftjis,
198 wakaba 1.3 shift.jis, x-shiftjis, x-shift-jis, s-jis, x-sjis, x_sjis,
199 wakaba 1.1 x-sjis-jp)
200    
201     Since this name is ambiguous (it can now refer all or any
202     of shift JIS coding system family), this name should not
203     be used to address specific coding system. In this module,
204     this is considered as an alias name to the shift JIS with
205     latest official definition, currently of JIS X 0213:2000
206     Appendix 1 (with implemention level 4).
207    
208     Note that the name "Shift_JIS" is not associated with
209     this name, because IANA registry [IANAREG] assignes
210     it to a shift JIS defined by JIS X 0208:1997.
211    
212     =item shift_jisx0213
213    
214     Shift_JISX0213 coded representation, defined by
215     JIS X 0213:2000 Appendix 1 (implemention level 4).
216     (Alias: shift-jisx0213, x-shift_jisx0213, japanese-shift-jisx0213 (emacsen),
217 wakaba 1.2 shift-jis-3 (Yudit), shift-jis-2000)
218 wakaba 1.1
219     =cut
220    
221     sub __2022__common ($) {
222     my $C = Encode::SJIS->new_object;
223     $C->{G0} = $Encode::Charset::CHARSET{G94}->{J}; ## JIS X 0201:1997 Latin
224     $C->{G1} = $Encode::Charset::CHARSET{G94n}->{"\x4F"}; ## JIS X 0213:2000 plane 1
225     $C->{G2} = $Encode::Charset::CHARSET{G94}->{I}; ## JIS X 0201:1997 Katakana
226     $C->{G3} = $Encode::Charset::CHARSET{G94n}->{"\x50"}; ## JIS X 0213:2000 plane 2
227     $C;
228     }
229     sub __2022_encode ($) {
230     my $C = shift->__2022__common;
231     $C;
232     }
233     sub __2022_decode ($) {
234     my $C = shift->__2022__common;
235     $C;
236     }
237     sub __encode_map ($) {
238     [qw/ucs_to_jisx0201_latin ucs_to_jisx0213_2000_1 ucs_to_jisx0213_2000_2 ucs_to_jisx0201_katakana/];
239     }
240     sub __decode_map ($) {
241     [qw/jisx0201_latin_to_ucs jisx0213_2000_1_to_ucs jisx0213_2000_2_to_ucs jisx0201_katakana_to_ucs/];
242     }
243    
244     package Encode::SJIS::X0213ASCII;
245     use vars qw/@ISA/;
246     push @ISA, 'Encode::SJIS';
247 wakaba 1.2 __PACKAGE__->Define (qw/shift_jisx0213-ascii shift-jis-2000-ascii
248     sjis-ascii shift-jis-ascii/);
249 wakaba 1.1
250     =item sjis-ascii
251    
252     Same as sjis but ASCII (ISO/IEC 646 IRV) instead of
253     JIS X 0201 Roman (or Latin) set. (Alias: shift-jis-ascii)
254    
255     In spite of the history of shift JIS, ASCII is sometimes
256     used instead of JIS X 0201 Roman set, because of compatibility
257     with ASCII world.
258    
259     Note that this name is now an alias of shift_jisx0213-ascii,
260     as sjis is of shift_jisx0213.
261    
262     =item shift_jisx0213-ascii
263    
264     Same as Shift_JISX0213 but ASCII (ISO/IEC 646 IRV)
265     instead of JIS X 0201:1997 Latin character set.
266 wakaba 1.2 (Alias: shift-jis-2000-ascii)
267    
268     Note that this coding system does NOT comform to
269     JIS X 0213:2000 Appendix 1.
270 wakaba 1.1
271     =cut
272    
273     sub __2022__common ($) {
274     my $C = shift->SUPER::__2022__common;
275     $C->{G0} = $Encode::Charset::CHARSET{G94}->{B}; ## ASCII
276     $C;
277     }
278     sub __encode_map ($) {
279     [qw/ucs_to_ascii ucs_to_jisx0213_2000_1 ucs_to_jisx0213_2000_2 ucs_to_jisx0201_katakana/];
280     }
281     sub __decode_map ($) {
282     [qw/jisx0213_2000_1_to_ucs jisx0213_2000_2_to_ucs jisx0201_katakana_to_ucs/];
283     }
284    
285     1;
286     __END__
287    
288 wakaba 1.2 =back
289    
290 wakaba 1.1 =head1 SEE ALSO
291    
292     JIS X 0208:1997, "7-bit and 8-bit double byte coded Kanji
293     set for information interchange", Japan Industrial Standards
294     Committee (JISC) <http://www.jisc.go.jp/>, 1997.
295    
296     JIS X 0213:2000, "7-bit and 8-bit double byte coded extended Kanji
297     sets for information interchange", Japan Industrial Standards
298     Committee (JISC) <http://www.jisc.go.jp/>, 2000.
299    
300     Encode, perlunicode
301    
302     [IANAREG] "CHARACTER SETS", IANA <http://www.iana.org/>,
303     <http://www.iana.org/assignments/character-sets>.
304     The charset registry for IETF <http://www.ietf.org/> standards.
305     (Note that in this registry two shift JISes are registered,
306     "Shift_JIS" and "Windows-31j". Former is JIS X 0208:1997's
307     definition and later is the Windows standard character set.)
308    
309     =head1 LICENSE
310    
311     Copyright 2002 Nanashi-san
312    
313     This library is free software; you can redistribute it
314     and/or modify it under the same terms as Perl itself.
315    
316     =cut
317    
318 wakaba 1.3 # $Date: 2002/10/12 11:03:00 $
319 wakaba 1.1 ### SJIS.pm ends here

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24