/[suikacvs]/perl/lib/Encode/SJIS.pm
Suika

Contents of /perl/lib/Encode/SJIS.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (hide annotations) (download)
Sat Oct 12 11:03:00 2002 UTC (22 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.1: +95 -6 lines
2002-10-12  Nanashi-san

	* SJIS.pm: New module.
	* SJIS/: New directory.
	(Commited by Wakaba <w@suika.fam.cx>)

1 wakaba 1.1
2     =head1 NAME
3    
4     Encode::SJIS --- Shift JIS coding systems encoder and decoder
5    
6     =head1 ENCODINGS
7    
8     This module defines only two basic version of shift JIS.
9     Other variants are defined in Encode::SJIS::* modules.
10    
11     =over 4
12    
13     =cut
14    
15     package Encode::SJIS;
16     use 5.7.3;
17     use strict;
18 wakaba 1.2 our $VERSION=do{my @r=(q$Revision: 1.1 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
19 wakaba 1.1 require Encode::Charset;
20 wakaba 1.2 use base qw(Encode::Encoding);
21 wakaba 1.1
22     ### --- Perl Encode module common functions
23    
24     sub encode ($$;$) {
25     my ($obj, $str, $chk) = @_;
26     $_[1] = '' if $chk;
27     if (!defined $obj->{_encode_mapping} || $obj->{_encode_mapping}) {
28     require Encode::Table;
29     $str = Encode::Table::convert ($str, $obj->__encode_map,
30     -autoload => defined $obj->{_encode_mapping_autoload} ?
31     $obj->{_encode_mapping_autoload} : 1);
32     }
33     $str = &internal_to_sjis ($str, $obj->__2022_encode);
34     $str;
35     }
36    
37     sub decode ($$;$) {
38     my ($obj, $str, $chk) = @_;
39     $_[1] = '' if $chk;
40     $str = &sjis_to_internal ($str, $obj->__2022_decode);
41     if (!defined $obj->{_decode_mapping} || $obj->{_decode_mapping}) {
42     require Encode::Table;
43     $str = Encode::Table::convert ($str, $obj->__decode_map,
44     -autoload => defined $obj->{_decode_mapping_autoload} ?
45     $obj->{_decode_mapping_autoload} : 1);
46     }
47     $str;
48     }
49    
50     ### --- Encode::SJIS unique functions
51     *new_object = \&Encode::Charset::new_object_sjis;
52    
53     sub sjis_to_internal ($$) {
54     my ($s, $C) = @_;
55     $C ||= &new_object;
56     $s =~ s{
57     ([\x00-\x7F\xA1-\xDF])
58     # ([\xA1-\xDF])
59     |([\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC])
60     |\x1B([\x40-\x5F])
61     |([\x80-\xFF]) ## Broken or supplemental 1-byte character
62     }{
63     my ($c7, $c2, $c1, $c8) = ($1, $2, $3, $4);
64     if (defined $c7) {
65     if ($c7 =~ /([\x21-\x7E])/) {
66     chr ($C->{ $C->{GL} }->{ucs} + ord ($1) - 0x21);
67     } elsif ($c7 =~ /([\x00-\x1F])/) {
68     chr ($C->{ $C->{CL} }->{ucs} + ord ($1));
69     } elsif ($C->{GR} && $c7 =~ /([\xA1-\xDF])/) {
70     chr ($C->{ $C->{GR} }->{ucs} + ord ($1) - 0xA1);
71     } else { ## 0x20, 0x7F
72     $C->{Gsmap}->{ $c7 } || $c7;
73     }
74     } elsif ($c2) {
75     if ($c2 =~ /([\x81-\xEF])(.)/) {
76     my ($f, $s) = (ord $1, ord $2);
77     $f -= $f < 0xA0 ? 0x81 : 0xC1; $s -= 0x40 + ($s > 0x7F);
78     chr ($C->{G1}->{ucs} + $f * 188 + $s);
79     } else { ## [\xF0-\xFC].
80     my ($f, $s) = unpack ('CC', $c2);
81     if ($C->{G3}->{Csjis_kuE}) {
82     $f = $s > 0x9E ? $C->{G3}->{Csjis_kuE}->{ $f }:
83     $C->{G3}->{Csjis_kuO}->{ $f };
84     $s -= ($s > 0x9E ? 0x9F : $s > 0x7F ? 0x41 : 0x40);
85     chr ($C->{G3}->{ucs} + $f * 94 + $s);
86     } else {
87     $f -= 0xF0; $s -= 0x40 + ($s > 0x7F);
88     chr ($C->{G3}->{ucs} + $f * 188 + $s);
89     }
90     }
91     } elsif ($c1) { ## ESC Fe
92     chr ($C->{ $C->{ESC_Fe} }->{ucs} + ord ($c1) - 0x40);
93     } else { # $C8
94     $C->{Gsmap}->{ $c8 } || $c8;
95     }
96     }gex;
97     $s;
98     }
99    
100 wakaba 1.2 sub internal_to_sjis ($\%) {
101     use integer;
102     my ($s, $C) = @_;
103     $C ||= &new_object;
104    
105     my $r = '';
106     for my $c (split //, $s) {
107     my $cc = ord $c;
108     my $t;
109     if ($cc <= 0x1F) {
110     $t = $c if $C->{ $C->{CL} } eq $Encode::Charset::CHARSET{C0}->{"\x40"};
111     } elsif ($cc == 0x20 || $cc == 0x7F) {
112     $t = $c;
113     } elsif ($cc < 0x7F) {
114     $t = $c if $C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{"\x42"};
115     } elsif ($C->{option}->{C1invoke_to_right} && $cc == 0x80) {
116     $t = $c if $C->{ $C->{CR} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};
117     } elsif ($cc <= 0x9F) {
118     $t = "\x1B".chr ($cc - 0x40)
119     if $C->{ $C->{ESC_Fe} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};
120    
121     } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
122     my $c = $cc - 0xE9F6C0; my $F = chr (($c / 8836)+0x30);
123     if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {
124     my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
125     $t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1),
126     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
127     } elsif ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {
128     my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
129     if ($C->{G3}->{Csjis_first}) {
130     $t = pack ('CC', $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 },
131     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
132     } else {
133     $t = pack ('CC', ($c / 188) + 0xF0,
134     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))
135     if ($c / 188) + 0xF0 < 0xFD;
136     }
137     }
138     } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
139     my $c = $cc - 0xF49D7C;
140     if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{'B@'}) {
141     my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
142     $t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1),
143     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
144     }
145    
146     } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
147     my $c = $cc - 0xE90940; my $F = chr (($c / 94)+0x30);
148     if ($C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {
149     $t = chr (($c % 94) + 0x21);
150     } elsif ($C->{ $C->{GR} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {
151     $t = chr (($c % 94) + 0xA1) if ($c % 94) < 0x3F;
152     }
153     } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
154     my $c = $cc % 0x10000;
155     my $F0=$C->{option}->{private_set}->{G94n}->[($cc/0x10000)-0x7042]->[$c/8836];
156     my $F1 = 'P'.(($cc / 0x10000) - 0x7042).'_'.($c / 8836);
157     if ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F0 }
158     || $C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F1 }) {
159     my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
160     if ($C->{G3}->{Csjis_first}) {
161     $t = pack ('CC', $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 },
162     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
163     } else {
164     $t = pack ('CC', ($c / 188) + 0xF0,
165     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))
166     if ($c / 188) + 0xF0 < 0xFD;
167     }
168     }
169     }
170    
171     if (defined $t) {
172     $r .= $t;
173     } elsif ($C->{GsmapR}->{ $c }) {
174     $r .= $C->{GsmapR}->{ $c };
175     } else {
176     $r .= $C->{option}->{undef_char_sjis} || "\x3F";
177     }
178     }
179     $r;
180     }
181    
182 wakaba 1.1 sub __clone ($) {
183     my $self = shift;
184     bless {%$self}, ref $self;
185     };
186    
187     __PACKAGE__->Define (qw!shift_jisx0213 japanese-shift-jisx0213
188 wakaba 1.2 shift-jisx0213 x-shift_jisx0213 shift-jis-3 shift-jis-2000
189 wakaba 1.1 sjis shift-jis x-sjis x_sjis x-sjis-jp shiftjis x-shiftjis
190     x-shift-jis shift.jis!);
191    
192     =item sjis
193    
194     "Shift JIS" coding system. (Alias: shift-jis, shiftjis,
195     shift.jis, x-shiftjis, x-shift-jis, x-sjis, x_sjis,
196     x-sjis-jp)
197    
198     Since this name is ambiguous (it can now refer all or any
199     of shift JIS coding system family), this name should not
200     be used to address specific coding system. In this module,
201     this is considered as an alias name to the shift JIS with
202     latest official definition, currently of JIS X 0213:2000
203     Appendix 1 (with implemention level 4).
204    
205     Note that the name "Shift_JIS" is not associated with
206     this name, because IANA registry [IANAREG] assignes
207     it to a shift JIS defined by JIS X 0208:1997.
208    
209     =item shift_jisx0213
210    
211     Shift_JISX0213 coded representation, defined by
212     JIS X 0213:2000 Appendix 1 (implemention level 4).
213     (Alias: shift-jisx0213, x-shift_jisx0213, japanese-shift-jisx0213 (emacsen),
214 wakaba 1.2 shift-jis-3 (Yudit), shift-jis-2000)
215 wakaba 1.1
216     =cut
217    
218     sub __2022__common ($) {
219     my $C = Encode::SJIS->new_object;
220     $C->{G0} = $Encode::Charset::CHARSET{G94}->{J}; ## JIS X 0201:1997 Latin
221     $C->{G1} = $Encode::Charset::CHARSET{G94n}->{"\x4F"}; ## JIS X 0213:2000 plane 1
222     $C->{G2} = $Encode::Charset::CHARSET{G94}->{I}; ## JIS X 0201:1997 Katakana
223     $C->{G3} = $Encode::Charset::CHARSET{G94n}->{"\x50"}; ## JIS X 0213:2000 plane 2
224     $C;
225     }
226     sub __2022_encode ($) {
227     my $C = shift->__2022__common;
228     $C;
229     }
230     sub __2022_decode ($) {
231     my $C = shift->__2022__common;
232     $C;
233     }
234     sub __encode_map ($) {
235     [qw/ucs_to_jisx0201_latin ucs_to_jisx0213_2000_1 ucs_to_jisx0213_2000_2 ucs_to_jisx0201_katakana/];
236     }
237     sub __decode_map ($) {
238     [qw/jisx0201_latin_to_ucs jisx0213_2000_1_to_ucs jisx0213_2000_2_to_ucs jisx0201_katakana_to_ucs/];
239     }
240    
241     package Encode::SJIS::X0213ASCII;
242     use vars qw/@ISA/;
243     push @ISA, 'Encode::SJIS';
244 wakaba 1.2 __PACKAGE__->Define (qw/shift_jisx0213-ascii shift-jis-2000-ascii
245     sjis-ascii shift-jis-ascii/);
246 wakaba 1.1
247     =item sjis-ascii
248    
249     Same as sjis but ASCII (ISO/IEC 646 IRV) instead of
250     JIS X 0201 Roman (or Latin) set. (Alias: shift-jis-ascii)
251    
252     In spite of the history of shift JIS, ASCII is sometimes
253     used instead of JIS X 0201 Roman set, because of compatibility
254     with ASCII world.
255    
256     Note that this name is now an alias of shift_jisx0213-ascii,
257     as sjis is of shift_jisx0213.
258    
259     =item shift_jisx0213-ascii
260    
261     Same as Shift_JISX0213 but ASCII (ISO/IEC 646 IRV)
262     instead of JIS X 0201:1997 Latin character set.
263 wakaba 1.2 (Alias: shift-jis-2000-ascii)
264    
265     Note that this coding system does NOT comform to
266     JIS X 0213:2000 Appendix 1.
267 wakaba 1.1
268     =cut
269    
270     sub __2022__common ($) {
271     my $C = shift->SUPER::__2022__common;
272     $C->{G0} = $Encode::Charset::CHARSET{G94}->{B}; ## ASCII
273     $C;
274     }
275     sub __encode_map ($) {
276     [qw/ucs_to_ascii ucs_to_jisx0213_2000_1 ucs_to_jisx0213_2000_2 ucs_to_jisx0201_katakana/];
277     }
278     sub __decode_map ($) {
279     [qw/jisx0213_2000_1_to_ucs jisx0213_2000_2_to_ucs jisx0201_katakana_to_ucs/];
280     }
281    
282     1;
283     __END__
284    
285 wakaba 1.2 =back
286    
287 wakaba 1.1 =head1 SEE ALSO
288    
289     JIS X 0208:1997, "7-bit and 8-bit double byte coded Kanji
290     set for information interchange", Japan Industrial Standards
291     Committee (JISC) <http://www.jisc.go.jp/>, 1997.
292    
293     JIS X 0213:2000, "7-bit and 8-bit double byte coded extended Kanji
294     sets for information interchange", Japan Industrial Standards
295     Committee (JISC) <http://www.jisc.go.jp/>, 2000.
296    
297     Encode, perlunicode
298    
299     [IANAREG] "CHARACTER SETS", IANA <http://www.iana.org/>,
300     <http://www.iana.org/assignments/character-sets>.
301     The charset registry for IETF <http://www.ietf.org/> standards.
302     (Note that in this registry two shift JISes are registered,
303     "Shift_JIS" and "Windows-31j". Former is JIS X 0208:1997's
304     definition and later is the Windows standard character set.)
305    
306     =head1 LICENSE
307    
308     Copyright 2002 Nanashi-san
309    
310     This library is free software; you can redistribute it
311     and/or modify it under the same terms as Perl itself.
312    
313     =cut
314    
315 wakaba 1.2 # $Date: 2002/10/12 07:27:01 $
316 wakaba 1.1 ### SJIS.pm ends here

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24