/[suikacvs]/perl/lib/Encode/SJIS.pm
Suika

Contents of /perl/lib/Encode/SJIS.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (hide annotations) (download)
Mon Dec 16 10:25:01 2002 UTC (21 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.4: +55 -156 lines
*** empty log message ***

1 wakaba 1.1
2     =head1 NAME
3    
4     Encode::SJIS --- Shift JIS coding systems encoder and decoder
5    
6     =head1 ENCODINGS
7    
8 wakaba 1.5 This module defines encoding engine for Shift JIS coding systems.
9     This module only provides general en/decoding parts. Actual profiles
10     for Shift JISes are included in Encode::SJIS::*.
11 wakaba 1.1
12     =over 4
13    
14     =cut
15    
16     package Encode::SJIS;
17     use 5.7.3;
18     use strict;
19 wakaba 1.5 our $VERSION=do{my @r=(q$Revision: 1.4 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
20 wakaba 1.1 require Encode::Charset;
21 wakaba 1.2 use base qw(Encode::Encoding);
22 wakaba 1.1
23 wakaba 1.5 *new_object = \&Encode::Charset::new_object_sjis;
24 wakaba 1.1
25 wakaba 1.5 ## Code extention escape sequence defined by ISO/IEC 2022 is
26     ## not supported in this version of this module.
27 wakaba 1.1
28     sub sjis_to_internal ($$) {
29     my ($s, $C) = @_;
30     $C ||= &new_object;
31     $s =~ s{
32     ([\x00-\x7F\xA1-\xDF])
33     # ([\xA1-\xDF])
34     |([\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC])
35     |\x1B([\x40-\x5F])
36     |([\x80-\xFF]) ## Broken or supplemental 1-byte character
37     }{
38     my ($c7, $c2, $c1, $c8) = ($1, $2, $3, $4);
39     if (defined $c7) {
40     if ($c7 =~ /([\x21-\x7E])/) {
41     chr ($C->{ $C->{GL} }->{ucs} + ord ($1) - 0x21);
42     } elsif ($c7 =~ /([\x00-\x1F])/) {
43     chr ($C->{ $C->{CL} }->{ucs} + ord ($1));
44     } elsif ($C->{GR} && $c7 =~ /([\xA1-\xDF])/) {
45     chr ($C->{ $C->{GR} }->{ucs} + ord ($1) - 0xA1);
46     } else { ## 0x20, 0x7F
47     $C->{Gsmap}->{ $c7 } || $c7;
48     }
49     } elsif ($c2) {
50     if ($c2 =~ /([\x81-\xEF])(.)/) {
51     my ($f, $s) = (ord $1, ord $2);
52     $f -= $f < 0xA0 ? 0x81 : 0xC1; $s -= 0x40 + ($s > 0x7F);
53     chr ($C->{G1}->{ucs} + $f * 188 + $s);
54     } else { ## [\xF0-\xFC].
55     my ($f, $s) = unpack ('CC', $c2);
56     if ($C->{G3}->{Csjis_kuE}) {
57     $f = $s > 0x9E ? $C->{G3}->{Csjis_kuE}->{ $f }:
58     $C->{G3}->{Csjis_kuO}->{ $f };
59     $s -= ($s > 0x9E ? 0x9F : $s > 0x7F ? 0x41 : 0x40);
60     chr ($C->{G3}->{ucs} + $f * 94 + $s);
61     } else {
62     $f -= 0xF0; $s -= 0x40 + ($s > 0x7F);
63     chr ($C->{G3}->{ucs} + $f * 188 + $s);
64     }
65     }
66     } elsif ($c1) { ## ESC Fe
67     chr ($C->{ $C->{ESC_Fe} }->{ucs} + ord ($c1) - 0x40);
68     } else { # $C8
69     $C->{Gsmap}->{ $c8 } || $c8;
70     }
71     }gex;
72     $s;
73     }
74    
75 wakaba 1.2 sub internal_to_sjis ($\%) {
76     use integer;
77     my ($s, $C) = @_;
78     $C ||= &new_object;
79    
80     my $r = '';
81 wakaba 1.5 my @c = split //, $s;
82     for my $i (0..$#c) {
83     my $c = $c[$i]; my $cc = ord $c; Encode::_utf8_off ($c);
84 wakaba 1.2 my $t;
85 wakaba 1.5 ## CL = C0 control characters
86 wakaba 1.2 if ($cc <= 0x1F) {
87     $t = $c if $C->{ $C->{CL} } eq $Encode::Charset::CHARSET{C0}->{"\x40"};
88 wakaba 1.5 ## 0x20 == SP and 0x7E == DEL
89 wakaba 1.2 } elsif ($cc == 0x20 || $cc == 0x7F) {
90     $t = $c;
91 wakaba 1.5 ## GL = G0 = ISO/IEC 646 graphic character set
92 wakaba 1.2 } elsif ($cc < 0x7F) {
93     $t = $c if $C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{"\x42"};
94 wakaba 1.5 ## 0x80
95 wakaba 1.2 } elsif ($C->{option}->{C1invoke_to_right} && $cc == 0x80) {
96 wakaba 1.3 $t = "\x80"
97     if $C->{ $C->{CR} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};
98 wakaba 1.5 ## ESC Fe = C1 control characters
99 wakaba 1.2 } elsif ($cc <= 0x9F) {
100 wakaba 1.3 $t = "\x1B".pack 'C', ($cc - 0x40)
101 wakaba 1.2 if $C->{ $C->{ESC_Fe} } eq $Encode::Charset::CHARSET{C1}->{'64291991C1'};
102 wakaba 1.5 ## G1 or G3 = 94^2 graphic character set from ISO-IR
103 wakaba 1.2 } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
104     my $c = $cc - 0xE9F6C0; my $F = chr (($c / 8836)+0x30);
105     if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {
106     my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
107     $t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1),
108     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
109     } elsif ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F }) {
110     my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
111     if ($C->{G3}->{Csjis_first}) {
112 wakaba 1.5 my $fb = $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 };
113     $t = pack ('CC', $fb, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) if $fb;
114 wakaba 1.2 } else {
115     $t = pack ('CC', ($c / 188) + 0xF0,
116     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))
117     if ($c / 188) + 0xF0 < 0xFD;
118     }
119     }
120 wakaba 1.5 ## G1 = JIS X 0208-1990/:1997
121 wakaba 1.2 } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
122     my $c = $cc - 0xF49D7C;
123     if ($C->{G1} eq $Encode::Charset::CHARSET{G94n}->{'B@'}) {
124     my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
125     $t = pack ('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xB1),
126     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
127     }
128 wakaba 1.5 ## GL = G0 = ISO/IEC 646 graphic character set / GR = G2 = JIS X 0201 Katakana set
129 wakaba 1.2 } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
130     my $c = $cc - 0xE90940; my $F = chr (($c / 94)+0x30);
131     if ($C->{ $C->{GL} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {
132 wakaba 1.3 $t = pack 'C', (($c % 94) + 0x21);
133 wakaba 1.2 } elsif ($C->{ $C->{GR} } eq $Encode::Charset::CHARSET{G94}->{ $F }) {
134 wakaba 1.3 $t = pack 'C', (($c % 94) + 0xA1) if ($c % 94) < 0x3F;
135 wakaba 1.2 }
136 wakaba 1.5 ## G1 / G3 = 94^2 graphic character set
137 wakaba 1.2 } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
138     my $c = $cc % 0x10000;
139     my $F0=$C->{option}->{private_set}->{G94n}->[($cc/0x10000)-0x7042]->[$c/8836];
140     my $F1 = 'P'.(($cc / 0x10000) - 0x7042).'_'.($c / 8836);
141     if ($C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F0 }
142     || $C->{G3} eq $Encode::Charset::CHARSET{G94n}->{ $F1 }) {
143     my ($c1, $c2) = ((($c % 8836) / 94)+0x21, ($c % 94)+0x21);
144     if ($C->{G3}->{Csjis_first}) {
145 wakaba 1.5 my $fb = $C->{G3}->{Csjis_first}->{ ($c % 8836) / 94 };
146     $t = pack ('CC', $fb, $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E)) if $fb;
147 wakaba 1.2 } else {
148     $t = pack ('CC', ($c / 188) + 0xF0,
149     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E))
150     if ($c / 188) + 0xF0 < 0xFD;
151     }
152     }
153 wakaba 1.5 ## Other character sets are not supported now (and there is no plan to implement them).
154 wakaba 1.2 }
155    
156 wakaba 1.5 ## Output the character itself
157 wakaba 1.2 if (defined $t) {
158     $r .= $t;
159 wakaba 1.5 ## Output the character itself with mapping table of special code positions
160 wakaba 1.2 } elsif ($C->{GsmapR}->{ $c }) {
161     $r .= $C->{GsmapR}->{ $c };
162 wakaba 1.5 } elsif ($C->{option}->{fallback_from_ucs} =~ /quiet/) {
163     return ($r, halfway => 1, converted_length => $i,
164     warn => $C->{option}->{fallback_from_ucs} =~ /warn/ ? 1 : 0,
165     reason => sprintf (q(U+%04X: There is no character mapped to), $cc));
166     } elsif ($C->{option}->{fallback_from_ucs} eq 'croak') {
167     return ($r, halfway => 1, die => 1,
168     reason => sprintf (q(U+%04X: There is no character mapped to), $cc));
169     ##
170 wakaba 1.2 } else {
171 wakaba 1.5 ## Try to output with fallback escape sequence (if specified)
172     my $t = Encode::Charset::fallback_escape ($C, $c);
173     if (defined $t) {
174     my %D = (fallback => $C->{option}->{fallback_from_ucs}, reset => $C->{option}->{reset});
175     $C->{option}->{fallback_from_ucs} = 'croak';
176     $C->{option}->{reset} = {Gdesignation => 0, Ginvoke => 0};
177     eval q{$t = $C->{_encoder}->_encode_internal ($t, $C)} or undef $t;
178     $C->{option}->{fallback_from_ucs} = $D{fallback};
179     $C->{option}->{reset} = $D{reset};
180     }
181     if (defined $t) {
182     $r .= $t;
183     } else { ## Replacement character specified in charset definition
184     $r .= $C->{option}->{undef_char_sjis} || "\x3F";
185     }
186 wakaba 1.2 }
187     }
188     $r;
189     }
190    
191     =back
192    
193 wakaba 1.1 =head1 SEE ALSO
194    
195     JIS X 0208:1997, "7-bit and 8-bit double byte coded Kanji
196     set for information interchange", Japan Industrial Standards
197     Committee (JISC) <http://www.jisc.go.jp/>, 1997.
198    
199     JIS X 0213:2000, "7-bit and 8-bit double byte coded extended Kanji
200     sets for information interchange", Japan Industrial Standards
201     Committee (JISC) <http://www.jisc.go.jp/>, 2000.
202    
203 wakaba 1.5 L<Encode::SJIS::JIS>
204    
205     L<Encode>, L<perlunicode>
206 wakaba 1.1
207 wakaba 1.5 L<Encode::Charset>, L<Encode::ISO2022>
208 wakaba 1.1
209     =head1 LICENSE
210    
211 wakaba 1.5 Copyright 2002 Nanashi-san <nanashi-san@nanashi.invalid>
212 wakaba 1.1
213     This library is free software; you can redistribute it
214     and/or modify it under the same terms as Perl itself.
215    
216     =cut
217    
218 wakaba 1.5 1; # $Date: 2002/12/12 08:17:16 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24