/[suikacvs]/test/oldencodeutils/lib/M17N/Code/JA.pm
Suika

Contents of /test/oldencodeutils/lib/M17N/Code/JA.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (hide annotations) (download)
Sat Jul 20 08:49:09 2002 UTC (23 years ago) by wakaba
Branch: MAIN
Changes since 1.2: +373 -118 lines
2002-04-06  wakaba <w@suika.fam.cx>

	* JA.pm (eucjapan_to_internal): Bug fix of
	may not return valid value problem.

1 wakaba 1.1
2 wakaba 1.3 =head1 NAME
3    
4     M17N::Code::JA --- Perl module for convertion between
5     coding systems of string written in Japanese language
6    
7     =cut
8    
9 wakaba 1.1 package M17N::Code::JA;
10     use strict;
11 wakaba 1.3 use vars qw(%re %_cp);
12 wakaba 1.1
13     %re = (
14 wakaba 1.3 94x1 => qr/\x1B\x28[\x40-\x7D](?:[\x21-\x7E])*/,
15     94x2 => qr/(?:\x1B\x26[\x40-\x7E])?\x1B\x24\x28?[\x40-\x5F](?:[\x21-\x7E][\x21-\x7E])*/,
16     96x1 => qr/\x1B\x2C[\x40-\x7D](?:[\x20-\x7F])*/,
17     utf8 => qr/\x1B\x25(?:\x47|\x2F[\x47-\x49])(?:[\x20-\x7F]|(?:[\xC0-\xFD][\x80-\xBF]+))*\x1B\x25\x40/,
18     M_94x1 => qr/\x1B\x28([\x40-\x7D])((?:[\x21-\x7E])*)/,
19     M_94x2 => qr/(?:\x1B\x26([\x40-\x7E]))?\x1B\x24\x28?([\x40-\x5F])((?:[\x21-\x7E][\x21-\x7E])*)/,
20     M_96x1 => qr/\x1B\x2C([\x40-\x7D])((?:[\x20-\x7F])*)/,
21     M_utf8 => qr/\x1B\x25(?:\x47|\x2F[\x47-\x49])((?:[\x20-\x7F]|(?:[\xC0-\xFD][\x80-\xBF]+))*)\x1B\x25\x40/,
22 wakaba 1.1 );
23 wakaba 1.3 %_cp = (
24 wakaba 1.1 94x1 => 0xE90940,
25     94x2 => 0xE9F6C0,
26     96x1 => 0xE926A0,
27     jisx02081990 => 0xE9F6C0 + 94*94*79,
28 wakaba 1.3 ## (0xFC - 0x3F - 1) * (0xFC - 0xEF) = 2444
29     x_cp932 => 0xE000, x_cp932_last => 0xE000 + 2443,
30     x_imode_doti => 0xFE000,
31     x_jsky => 0xFFB00,
32     );
33     %_cp = (%_cp,
34     jisx0201kana => $_cp{94x1} + 94 * (0x49-0x30),
35     jisx0201kana_last1 => $_cp{94x1} + 94 * (0x49-0x30+1) -0x21,
36     jisx0201kana_last => $_cp{94x1} + 94 * (0x49-0x30+1) -1,
37     jisx0201latin => $_cp{94x1} + 94 * (0x4A-0x30),
38     jisx0201latin_last => $_cp{94x1} + 94 * (0x4A-0x30+1) -1,
39     jisx02081978 => $_cp{94x2} + 94*94 * (0x40-0x30),
40     jisx02081978_last => $_cp{94x2} + 94*94 * (0x40-0x30+1) -1,
41     jisx02081983 => $_cp{94x2} + 94*94 * (0x42-0x30),
42     jisx02081983_last => $_cp{94x2} + 94*94 * (0x42-0x30+1) -1,
43     jisx02081990_last => $_cp{jisx02081990} + 94*94 -1,
44     jisx02121990 => $_cp{94x2} + 94*94 * (0x44-0x30),
45     jisx02121990_last => $_cp{94x2} + 94*94 * (0x44-0x30+1) -1,
46     jisx02132000_1 => $_cp{94x2} + 94*94 * (0x4F-0x30),
47     jisx02132000_1_last => $_cp{94x2} + 94*94 * (0x4F-0x30+1) -1,
48     jisx02132000_2 => $_cp{94x2} + 94*94 * (0x50-0x30),
49     jisx02132000_2_last => $_cp{94x2} + 94*94 * (0x50-0x30+1) -1,
50     ## (0xF7 - 0xEF - 1) * (0xFC - 0xEF) = 1316
51     x_doti => $_cp{x_imode_doti},
52     x_doti_last => $_cp{x_imode_doti} + 1316-1,
53     x_imode => $_cp{x_imode_doti} + 1316,
54     x_imode_last => $_cp{x_imode_doti} + 2443,
55 wakaba 1.1 );
56 wakaba 1.3 my %SJIS_G3_to_KUe = ( ## KU-1
57     0xF0 => 7, 0xF1 => 3, 0xF2 => 11, 0xF3 => 13,
58     0xF4 => 77, 0xF5 => 79, 0xF6 => 81, 0xF7 => 83,
59     0xF8 => 85, 0xF9 => 87, 0xFA => 89, 0xFB => 91,
60     0xFC => 93,
61 wakaba 1.1 );
62 wakaba 1.3 my %SJIS_G3_to_KUo = ( ## KU-1
63     0xF0 => 0, 0xF1 => 2, 0xF2 => 4, 0xF3 => 12,
64     0xF4 => 14, 0xF5 => 78, 0xF6 => 80, 0xF7 => 82,
65     0xF8 => 84, 0xF9 => 86, 0xFA => 88, 0xFB => 90,
66     0xFC => 92,
67     ); ## KU-1
68     my %_KU_to_SJIS_G3 = reverse (%SJIS_G3_to_KUe, %SJIS_G3_to_KUo);
69 wakaba 1.1
70     my %_charset_name_n11n_table = (
71 wakaba 1.3 ascii => 'us-ascii',
72     euc => 'euc-jp',
73     'euc-japan' => 'euc-jp',
74     'euc-jisx0208' => 'euc-jp',
75     'euc-jisx0213' => ['euc-jp', 2000],
76     jis => 'junet',
77     'shift-jis' => 'shift_jis',
78     'shift-jisx0213' => ['shift_jis', 2000],
79     shift_jisx0213 => ['shift_jis', 2000],
80     sjis => 'shift_jis',
81     'sjis-doti' => ['shift_jis', 1997, 'doti'],
82     'sjis-imode' => ['shift_jis', 1997, 'imode'],
83     'sjis-jsky' => ['shift_jis', 1997, 'jsky'],
84     'unicode-2-0-utf-8' => 'utf-8',
85     utf8 => 'utf-8',
86     'utf-8n' => 'utf-8',
87     'x-euc-jisx0213' => ['euc-jp', 2000],
88     'x-euc-jp' => 'euc-jp',
89     'x-shift_jisx0213' => ['shift_jis', 2000],
90     'x-sjis' => 'shift_jis',
91     'x-sjis-doti' => ['shift_jis', 1997, 'doti'],
92     'x-sjis-imode' => ['shift_jis', 1997, 'imode'],
93     'x-sjis-jsky' => ['shift_jis', 1997, 'jsky'],
94 wakaba 1.1 );
95     sub _charset_name_n11n ($;$) {
96 wakaba 1.3 my ($name, $year, $ext) = (shift, shift, lc shift);
97     $name = $_charset_name_n11n_table{lc $name} || $name;
98     if (ref $name eq 'ARRAY') {
99     ($name, $year, $ext) = @$name;
100 wakaba 1.1 }
101 wakaba 1.3 (lc $name, $year, $ext);
102 wakaba 1.1 }
103    
104     =head2 convert ($string, $output_code, $input_code, $options)
105    
106     Convert coded charset of string.
107    
108     =cut
109    
110     sub convert ($;$$$) {
111 wakaba 1.3 my ($string, $o_code, $i_code, $options) = @_;
112     my ($i_edition, $o_edition, $i_ext, $o_ext);
113     ($i_code, $i_edition, $i_ext) = _charset_name_n11n ($i_code);
114     ($o_code, $o_edition, $o_ext) = _charset_name_n11n ($o_code || "junet");
115     if ($i_code eq "euc-jp") {
116     eucjapan_to_internal ($string, -edition => $i_edition, -extension => $i_ext);
117     } elsif ($i_code eq "junet" || $i_code =~ /^iso-2022-jp/
118     || $i_code eq 'us-ascii') {
119 wakaba 1.1 junet_to_internal ($string);
120 wakaba 1.3 } elsif ($i_code eq "shift_jis") {
121     shiftjis_to_internal ($string, -edition => $i_edition, -extension => $i_ext);
122 wakaba 1.1 }
123    
124 wakaba 1.3 if ($o_code eq "euc-jp") {
125     internal_to_eucjapan ($string, -edition => $o_edition, -extension => $o_ext);
126     } elsif ($o_code eq "junet" || $o_code =~ /^iso-2022-jp/
127     || $o_code eq 'us-ascii') {
128 wakaba 1.1 internal_to_junet ($string);
129 wakaba 1.3 } elsif ($o_code eq "shift_jis") {
130     internal_to_shiftjis ($string, -edition => $o_edition, -extension => $o_ext);
131 wakaba 1.1 }
132     }
133    
134     =head2 junet_to_internal ($string)
135    
136     Convert junet coded string to internal coded string.
137    
138     =cut
139    
140     sub junet_to_internal ($) {
141     my $s = shift;
142 wakaba 1.3 #$s = \$s unless ref $s;
143     $s =~ s{
144 wakaba 1.1 ($re{94x1} | $re{94x2} | $re{96x1} | $re{utf8})
145     }{
146     my $st = $1;
147 wakaba 1.3 if ($st =~ /$re{M_94x1}/) {
148 wakaba 1.1 my ($f, $str) = ($1, $2);
149     $f = unpack 'C', $f;
150 wakaba 1.3 $f = $f == 0x42 ? 0x21 : $_cp{94x1} + 94 * ($f - 0x30);
151 wakaba 1.1 $str =~ s{([\x21-\x7E])}{
152     _u8($f + unpack('C', $1) - 0x21);
153     }goesx;
154     $st = $str;
155 wakaba 1.3 } elsif ($st =~ /$re{M_94x2}/) {
156 wakaba 1.1 my ($rev, $f, $str) = ($1, $2, $3);
157     if ($rev eq '@' && $f eq 'B') {
158 wakaba 1.3 $f = $_cp{jisx02081990};
159 wakaba 1.1 } else {
160     $f = unpack 'C', $f;
161 wakaba 1.3 $f = $_cp{94x2} + 94*94 * ($f - 0x30);
162 wakaba 1.1 }
163     $str =~ s{([\x21-\x7E])([\x21-\x7E])}{
164     _u8($f + (unpack('C', $1) - 0x21)*94 + unpack('C', $2) - 0x21);
165     }goesx;
166     $st = $str;
167 wakaba 1.3 } elsif ($st =~ /$re{M_96x1}/) {
168 wakaba 1.1 my ($f, $str) = ($1, $2);
169     $f = unpack 'C', $f;
170 wakaba 1.3 $f = $f == 0x41 ? 0xA0 : $_cp{96x1} + 96 * ($f - 0x30);
171 wakaba 1.1 $str =~ s{([\x20-\x7F])}{
172     _u8($f + unpack('C', $1) - 0x20);
173     }goesx;
174     $st = $str;
175 wakaba 1.3 } elsif ($st =~ /$re{M_utf8}/) {
176 wakaba 1.1 $st = $1;
177     }
178     $st;
179     }goesx;
180 wakaba 1.3 $s;
181 wakaba 1.1 }
182    
183     sub _u8 ($) {
184     my ($ret, $uc);
185     $uc = shift;
186     if ($uc < 0x80) { ## 1 byte
187     $ret .= chr($uc);
188     } elsif ($uc < 0x800) { ## 2 byte
189     $ret .= chr(0xC0 | ($uc >> 6))
190     . chr(0x80 | ($uc & 0x3F));
191     } elsif ($uc < 0x10000) { ## 3 byte
192     $ret .= chr(0xE0 | ($uc >> 12) )
193     . chr(0x80 | (($uc >> 6) & 0x3F))
194     . chr(0x80 | ($uc & 0x3F) );
195     } elsif ($uc < 0x200000) { ## 4 byte
196     $ret .= chr( 240 | ($uc >> 18) )
197     . chr(0x80 | (($uc >> 12) & 0x3F))
198     . chr(0x80 | (($uc >> 6) & 0x3F))
199     . chr(0x80 | ($uc & 0x3F) );
200     } elsif ($uc < 0x4000000) { ## 5 byte
201     $ret .= chr( 248 | ($uc >> 24) )
202     . chr(0x80 | (($uc >> 18) & 0x3F))
203     . chr(0x80 | (($uc >> 12) & 0x3F))
204     . chr(0x80 | (($uc >> 6) & 0x3F))
205     . chr(0x80 | ($uc & 0x3F) );
206     } elsif ($uc < 0x80000000) { ## 6 byte
207     $ret .= chr( 252 | ($uc >> 30) )
208     . chr(0x80 | (($uc >> 24) & 0x3F))
209     . chr(0x80 | (($uc >> 18) & 0x3F))
210     . chr(0x80 | (($uc >> 12) & 0x3F))
211     . chr(0x80 | (($uc >> 6) & 0x3F))
212     . chr(0x80 | ($uc & 0x3F) );
213     }
214     $ret;
215     }
216    
217     sub _ucode ($) {
218     my $s = shift;
219     return unpack("C", $s) if length($s) < 2;
220     my ($iterations, $c, @c)
221     = (0, unpack("C", substr($s, 0, 1)), unpack('C*', substr($s, 1)));
222     if (($c & 0xFE) == 0xFC) {
223     $c = ($c & 0x01);
224     $iterations = 5;
225     } elsif (($c & 0xFC) == 0xF8) {
226     $c = ($c & 0x03);
227     $iterations = 4;
228     } elsif (($c & 0xF8) == 0xF0) {
229     $c = ($c & 0x07);
230     $iterations = 3;
231     } elsif (($c & 0xF0) == 0xE0) {
232     $c = ($c & 0x0F);
233     $iterations = 2;
234     } elsif (($c & 0xE0) == 0xC0) {
235     $c = ($c & 0x1F);
236     $iterations = 1;
237     }
238     if ($iterations == $#c+1) {
239     for (my $i = 0; $i < $iterations; $i++) {
240     $c = ($c << 6);
241     $c = ($c | ($c[$i] & 0x3F));
242     }
243     }
244     $c;
245     }
246    
247     =head2 internal_to_eucjapan ($string, [%options])
248    
249     Convert internal coded string to euc-japan coded string.
250    
251     =head3 Example
252    
253     internal_to_eucjapan (\$s, jisx0201kana => 1);
254    
255     =cut
256    
257     sub internal_to_eucjapan ($;%) {
258     my $s = shift;
259 wakaba 1.3 #$s = \$s unless ref $s;
260 wakaba 1.1 my %output = @_;
261     my $year = $output{edition} || -1; ## -1 (= all unify), 1983, 1990 = 1997, 2000
262     $output{jisx02081978} = 1 if $year < 1983;
263     $output{jisx02081983} = 1 if $year == -1 || (1983 <= $year && $year < 1990);
264     $output{jisx02081990} = 1 if $year == -1 || (1990 <= $year && $year < 2000);
265     $output{jisx02121990} = 1 if $year == -1 || (1990 <= $year && $year < 2000);
266     $output{jisx02132000_1} = 1 if $year == -1 || (2000 <= $year);
267     $output{jisx02132000_2} = 1 if $year == -1 || (2000 <= $year);
268     $output{undefchar} ||= "\xA2\xAE";
269 wakaba 1.3 $s =~ s{([\xC0-\xFF][\x80-\xBF]*)}{
270 wakaba 1.1 my $char = $1; my $code = _ucode($char); my $ret = "";
271     if ($code < 0x7F) { ## G0 = ASCII
272     $ret = pack("C", $code);
273 wakaba 1.3 } elsif ($output{jisx0201latin} && $_cp{jisx0201latin} < $code
274     && $code < $_cp{jisx0201latin_last}) { ## G0 = JIS X 0201 latin
275     $ret = pack("C", $code - $_cp{jisx0201latin} + 0x21);
276 wakaba 1.1 } elsif ($output{jisx02081978}
277 wakaba 1.3 && ($_cp{jisx02081978} < $code && $code < $_cp{jisx02081978_last})
278 wakaba 1.1 ) { ## G1 = JIS X 0208-1978
279 wakaba 1.3 my $ku = $code - $_cp{jisx02081978};
280 wakaba 1.1 my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
281     $ret = pack("CC", $ku, $ten);
282     } elsif ($output{jisx02081983}
283 wakaba 1.3 && ($_cp{jisx02081983} < $code && $code < $_cp{jisx02081983_last})
284 wakaba 1.1 ) { ## G1 = JIS X 0208-1983
285 wakaba 1.3 my $ku = $code - $_cp{jisx02081983};
286 wakaba 1.1 my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
287     $ret = pack("CC", $ku, $ten);
288     } elsif ($output{jisx02081990}
289 wakaba 1.3 && ($_cp{jisx02081990} < $code && $code < $_cp{jisx02081990_last})
290 wakaba 1.1 ) { ## G1 = JIS X 0208-1990 / JIS X 0208:1997
291 wakaba 1.3 my $ku = $code - $_cp{jisx02081990};
292 wakaba 1.1 my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
293     $ret = pack("CC", $ku, $ten);
294     } elsif ($output{jisx02132000_1}
295 wakaba 1.3 && ($_cp{jisx02132000_1} < $code && $code < $_cp{jisx02132000_1_last})
296 wakaba 1.1 ) { ## G1 = JIS X 0213:2000 plane 1
297 wakaba 1.3 my $ku = $code - $_cp{jisx02132000_1};
298 wakaba 1.1 my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
299     $ret = pack("CC", $ku, $ten);
300 wakaba 1.3 } elsif ($output{jisx0201kana} && $_cp{jisx0201kana} < $code
301     && $code < $_cp{jisx0201kana_last}) { ## G2 = JIS X 0201 katakana
302     $ret = "\x8E" . pack("C", $code - $_cp{jisx0201kana} + 0xA1);
303 wakaba 1.1 } elsif ($output{jisx02121990}
304 wakaba 1.3 && ($_cp{jisx02121990} < $code && $code < $_cp{jisx02121990_last})
305 wakaba 1.1 ) { ## G3 = JIS X 0213:2000 plane 2
306 wakaba 1.3 my $ku = $code - $_cp{jisx02121990};
307 wakaba 1.1 my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
308     $ret = "\x8F" . pack("CC", $ku, $ten);
309     } elsif ($output{jisx02132000_2}
310 wakaba 1.3 && ($_cp{jisx02132000_2} < $code && $code < $_cp{jisx02132000_2_last})
311 wakaba 1.1 ) { ## G3 = JIS X 0213:2000 plane 2
312 wakaba 1.3 my $ku = $code - $_cp{jisx02132000_2};
313 wakaba 1.1 my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
314     $ret = "\x8F" . pack("CC", $ku, $ten);
315     } else {
316     $ret = $output{undefchar};
317     }
318     $ret;
319     }goex;
320 wakaba 1.3 $s;
321 wakaba 1.1 }
322    
323     =head2 internal_to_junet ($sering, [%option])
324    
325     Convert internal coded string to junet coded string.
326    
327     =head3 options
328    
329     * g0_96
330     * utf8
331    
332     =cut
333    
334     sub internal_to_junet ($;%) {
335     my $s = shift;
336 wakaba 1.3 #$s = \$s unless ref $s;
337 wakaba 1.1 my %output = @_; my $mode = "\x1B\x28\x42";
338     $output{undefchar} ||= 0x3F-0x21;
339     $output{undefcharset} ||= "\x1B\x28\x42";
340 wakaba 1.3 $s =~ s{([\x00-\x7F]|(?:[\xC0-\xFF][\x80-\xBF]*))}{
341 wakaba 1.1 my $char = $1; my $code = _ucode($char); my $ret = "";
342     if ($code < 0x9F) { ## ASCII
343     $ret = _2022_putchar(\$mode => "\x1B\x28\x42", $code-0x21);
344     } elsif ($code < 0xFF) { ## ISO 8859-1 right half
345     $ret = _2022_putchar(\$mode => "\x1B\x2C\x41", $code-0xA0);
346 wakaba 1.3 } elsif ($_cp{94x1} < $code && $code < $_cp{94x1} + 94*78 -1) {
347 wakaba 1.1 ## 94 charsets
348 wakaba 1.3 my $final = pack("C", int(($code - $_cp{94x1}) / 94) + 0x30);
349 wakaba 1.1 $ret = _2022_putchar(\$mode => "\x1B\x28".$final,
350 wakaba 1.3 ($code - $_cp{94x1}) % 94);
351 wakaba 1.1 } elsif ($output{g0_96}
352 wakaba 1.3 && $_cp{96x1} < $code && $code < $_cp{96x1} + 96*78 -1) {
353 wakaba 1.1 ## 96 charsets
354 wakaba 1.3 my $final = pack("C", int(($code - $_cp{96x1}) / 96) + 0x30);
355 wakaba 1.1 $ret = _2022_putchar(\$mode => "\x1B\x2C".$final,
356 wakaba 1.3 ($code - $_cp{96x1}) % 96);
357     } elsif ($_cp{94x2} < $code && $code < $_cp{94x2} + 94*94*78 -1) {
358 wakaba 1.1 ## 94x2 charsets
359 wakaba 1.3 my $final = pack("C", int(($code - $_cp{94x2}) / (94*94)) + 0x30);
360 wakaba 1.1 $ret = _2022_putchar(\$mode => "\x1B\x24\x28".$final,
361 wakaba 1.3 ($code - $_cp{94x2}) % (94*94));
362     } elsif ($_cp{jisx02081990} < $code
363     && $code < $_cp{jisx02081990} + 94*94 -1) {
364 wakaba 1.1 $ret = _2022_putchar(\$mode => "\x1B\x26\x40\x1B\x24B",
365 wakaba 1.3 ($code - $_cp{94x2}) % (94*94));
366 wakaba 1.1 } elsif ($output{utf8}) {
367     $ret = _2022_putchar(\$mode => "\x1B\x25G", $char)
368     } else {
369     $ret = _2022_putchar(\$mode => $output{undefcharset} => $output{undefchar});
370     }
371     $ret;
372     }goesx;
373 wakaba 1.3 $s .= _2022_putchar (\$mode => "\x1B\x28\x42" => '');
374     $s;
375 wakaba 1.1 }
376     sub _2022_putchar ($$$) {
377     my ($mode, $newmode, $char) = @_;
378     my $ret = ""; my %is;
379     $is{multibyte} = 1 if $newmode =~ /\x24/;
380     $is{set96} = 1 if $newmode =~ /\x2C/;
381     if ($$mode ne $newmode) {
382     if ($$mode eq "\x1B\x25G") {
383     $ret = "\x1B\x25\x40\x1B\x28\x42";
384     }
385 wakaba 1.3 if (($is{multibyte} && $newmode ne "\x1B\x24\x28\x42"
386     && $newmode ne "\x1B\x24\x28\x40")
387     || $is{set96} || $newmode eq "\x1B\x25G") {
388     ## If it is not well-known escape sequence(s),
389     ## designate ASCII for safe.
390     $ret .= "\x1B\x28\x42" unless $$mode eq "\x1B\x28\x42";
391 wakaba 1.1 }
392     if ($newmode =~ /\x1B\x24\x28([\x40-\x42])/) {
393     $ret .= "\x1B\x24$1";
394     } else {
395     $ret .= $newmode;
396     }
397     $$mode = $newmode;
398     }
399 wakaba 1.3 if (length $char) {
400 wakaba 1.1 if ($is{multibyte} && $is{set96}) { ## 96x2
401     $ret .= pack("CC", int($char / 96) + 0x20, ($char % 96) + 0x20);
402     } elsif ($is{multibyte}) { ## 94x2
403     $ret .= pack("CC", int($char / 94) + 0x21, ($char % 94) + 0x21);
404     } elsif ($is{set96}) { ## 96x1
405     $ret .= pack("C", $char + 0x20);
406     } elsif ($newmode eq "\x1B\x25G") { ## utf-8
407     $ret .= $char; ## if utf-8, $char is char itself!
408     } else { ## 94x1
409     $ret .= pack("C", $char + 0x21);
410     }
411 wakaba 1.3 }
412 wakaba 1.1 $ret;
413     }
414    
415     sub eucjapan_to_internal ($;%) {
416     my $s = shift;
417     my %option = @_;
418     my $year = $option{edition} || 2000; ## 1978, 1983, 1990=1997, 2000
419 wakaba 1.3 $option{G0start} ||= 0x21; $option{G2start} ||= $_cp{jisx0201kana};
420 wakaba 1.1 if ($option{G1start} && $option{G3start}) {}
421     elsif ($year < 1983)
422 wakaba 1.3 {$option{G1start} ||= $_cp{jisx02081978}; $option{G3start} ||= -1}
423 wakaba 1.1 elsif (1983 <= $year && $year < 1990)
424 wakaba 1.3 {$option{G1start} ||= $_cp{jisx02081983}; $option{G3start} ||= -1}
425 wakaba 1.1 elsif (1990 <= $year && $year < 2000)
426 wakaba 1.3 {$option{G1start} ||= $_cp{jisx02081990}; $option{G3start} ||= -1}
427 wakaba 1.1 else #elsif (2000 <= $year)
428 wakaba 1.3 {$option{G1start} ||= $_cp{jisx02132000_1}; $option{G3start} ||= -1}
429     $s =~ s{([\x21-\x7E]|\x8E[\xA1-\xFE]|\x8F?[\xA1-\xFE][\xA1-\xFE])}{
430 wakaba 1.1 my $char = $1; my $ret = "";
431     if ($char =~ /[\x21-\x7E]/) {
432     $ret = _u8(unpack("C", $char) - 0x21 + $option{G0start});
433     } elsif ($char =~ /\x8F/) {
434     my $ku = unpack("C", substr($char,1,1)) - 0xA1;
435     my $ten = unpack("C", substr($char,2,1)) - 0xA1;
436     if ($option{G3start} == -1) {
437     if (77 <= $ku || $ku == 0 || $ku == 2 || $ku == 3 || $ku == 4
438     || $ku == 7 || $ku == 11 || $ku == 12 || $ku == 13 || $ku == 14) {
439 wakaba 1.3 $ret = _u8($ku * 94 + $ten + $_cp{jisx02132000_2});
440 wakaba 1.1 } else {
441 wakaba 1.3 $ret = _u8($ku * 94 + $ten + $_cp{jisx02121990});
442 wakaba 1.1 }
443     } else {
444     $ret = _u8($ku * 94 + $ten + $option{G3start});
445     }
446     } elsif ($char =~ /\x8E/) {
447     $ret = _u8(unpack("C", substr($char,1,1)) - 0xA1 + $option{G2start});
448     } else {
449     my $ku = unpack("C", substr($char,0,1)) - 0xA1;
450     my $ten = unpack("C", substr($char,1,1)) - 0xA1;
451     $ret = _u8($ku * 94 + $ten + $option{G1start});
452     }
453     $ret;
454     }goex;
455 wakaba 1.3 $s;
456 wakaba 1.1 }
457    
458 wakaba 1.3 =item M17N::Code::JA::shiftjis_to_internal ($string, [%options])
459    
460     Converts string from shift-jis to internal code.
461    
462     Available options:
463    
464     =over 4
465    
466     =item -edition => 1978 / 1983 / 1990 / 2000
467    
468     Edition of double byte character set (Default: 2000)
469    
470     =item -extension => [none] / cp932 / mac / imode / doti / jsky
471    
472     Type of private (non official) extension (Default: no private extensions)
473    
474     =back
475    
476     =cut
477    
478     ## G0 = 0x21-0x7E
479     ## G1 = 0xA1-0xDE
480     ## G2 = 0x8141-0x9FFC, 0xE041-0xEFFC
481     ## G3 = 0xF041-0xFCFC
482     my %S2I; ## cache
483     sub shiftjis_to_internal ($;%) {
484     my $s = shift; my %o = @_;
485     my $year = $o{-edition} || 2000;
486     my $legacy = 0; ## legacy variants before X0213 G3
487     $o{-G0start} ||= 0x21; $o{-G1start} ||= $_cp{jisx0201kana};
488     if ($o{-G2start} && $o{-G3start}) {}
489     elsif ($year < 1983)
490     {$o{-G2start} ||= $_cp{jisx02081978}; $o{-G3start} ||= $_cp{jisx02132000_2}}
491     elsif (1983 <= $year && $year < 1990)
492     {$o{-G2start} ||= $_cp{jisx02081983}; $o{-G3start} ||= $_cp{jisx02132000_2}}
493     elsif (1990 <= $year && $year < 2000)
494     {$o{-G2start} ||= $_cp{jisx02081990}; $o{-G3start} ||= $_cp{jisx02132000_2}}
495     else #elsif (2000 <= $year)
496     {$o{-G2start} ||= $_cp{jisx02132000_1}; $o{-G3start} ||= $_cp{jisx02132000_2}}
497     if ($o{-extension} eq 'imode' || $o{-extension} eq 'doti') {
498     $o{-G3start} = $_cp{x_imode_doti}; $legacy = 1;
499     }
500     $s =~ s{
501     ([\x21-\x7E])
502     | ([\x81-\x9F\xE0-\xEF][\x40-\x7E\x80-\xFC])
503     | ([\xF0-\xFC][\x40-\x7E\x80-\xFC])
504     | ([\xA1-\xDF])
505     | ([\x80-\xFF])
506     }{
507     my ($g0, $g2, $g3, $g1, $broken) = ($1, $2, $3, $4, $5);
508     my $ret = '';
509     if ($g0) {
510     $S2I{$g0}->{$o{-G0start}} = _u8(unpack('C', $g0) - 0x21 + $o{-G0start})
511     unless $S2I{$g0}->{$o{-G0start}};
512     $ret = $S2I{$g0}->{$o{-G0start}};
513     } elsif ($g2) {
514     unless ($S2I{$g2}->{$o{-G2start}}) {
515     my ($f, $s) = unpack ('CC', $g2);
516     if (0x9E < $s) {
517     $f = $f * 2 - ($f >= 0xE0 ? 0x181 : 0x101); $s -= 0x9F;
518     } else {
519     $f = $f * 2 - ($f >= 0xE0 ? 0x182 : 0x102); $s -= 0x40 + ($s > 0x7F);
520     }
521     $S2I{$g2}->{$o{-G2start}} = _u8 ($f * 94 + $s + $o{-G2start});
522     }
523     $ret = $S2I{$g2}->{$o{-G2start}};
524     } elsif ($g3) {
525     unless ($S2I{$g3}->{$o{-G3start}}) {
526     my ($f, $s) = unpack ('CC', $g3);
527     if ($legacy) {
528     $s -= 0x40 + ($s > 0x7F);
529     $S2I{$g3}->{$o{-G3start}} = _u8 (($f - 0xF0) * 188 + $s + $o{-G3start});
530     } else { ## X0213
531     if (0x9F <= $s) {
532     $f = $SJIS_G3_to_KUe{$f}; $s -= 0x9F;
533     } else {
534     $f = $SJIS_G3_to_KUo{$f}; $s -= 0x40 + ($s > 0x7F);
535     }
536     $S2I{$g3}->{$o{-G3start}} = _u8 ($f * 94 + $s + $o{-G3start});
537     }
538     }
539     $ret = $S2I{$g3}->{$o{-G3start}};
540     } elsif ($g1) {
541     $S2I{$g1}->{$o{-G1start}} = _u8(unpack('C', $g1) - 0xA1 + $o{-G1start})
542     unless $S2I{$g1}->{$o{-G1start}};
543     $ret = $S2I{$g1}->{$o{-G1start}};
544     }
545     $ret;
546     }gex;
547     $s;
548     }
549    
550     =item $s = M17N::Code::JA::internal_to_shiftjis ($s, [%options])
551    
552     Converts internal coded string to shift-jis coded string.
553    
554     =cut
555    
556     my (%_I2S_1, %_I2S_2, %_I2S_2L, %_I2S); ## caches
557     sub internal_to_shiftjis ($;%) {
558     my $s = shift; my %o = @_;
559     my $year = $o{-edition}; ## 0 (= all unify), 1983, 1990 = 1997, 2000
560     $o{-x0201_latin} ||= 1; $o{-x0201_katakana} ||= 1;
561     $o{-x0208_78} = 1 if $year < 1983;
562     $o{-x0208_83} = 1 if !$year || (1983 <= $year && $year < 1990);
563     $o{-x0208_90} = 1 if !$year || (1990 <= $year && $year < 2000);
564     if (!$year || ($year >= 2000)) {
565     $o{-x0213_A0_1} = 1; $o{-x0213_A0_2} = 1;
566     }
567     if ($o{-extension}) { ## one of private extensions
568     $o{-x0213_A0_2} = 0; $o{ '-' . $o{-extension} } = 1;
569     }
570     $o{-undefchar} ||= "\x81\xAC";
571     $s =~ s{([\xC0-\xFF][\x80-\xBF]*)}{
572     my ($code, $ret) = (_ucode($1), '');
573     #if ($code < 0x7F) { ## G0 = ASCII
574     # $ret = pack("C", $code);
575     #} els
576     if ($o{-x0201_latin}
577     && $_cp{jisx0201latin} <= $code && $code < $_cp{jisx0201latin_last}) {
578     $_I2S{$code} = pack("C", $code - $_cp{jisx0201latin} + 0x21)
579     unless $_I2S{$code};
580     $ret = $_I2S{$code};
581     } elsif ($o{-x0208_78}
582     && $_cp{jisx02081978} <= $code && $code < $_cp{jisx02081978_last}) {
583     $code -= $_cp{jisx02081978};
584     unless ($_I2S_1{$code}) {
585     my ($c1, $c2) = (int($code / 94) + 0x21, ($code % 94) + 0x21);
586     $_I2S_1{$code} = pack('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xb1),
587     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
588     }
589     $ret = $_I2S_1{$code};
590     } elsif ($o{-x0208_83}
591     && $_cp{jisx02081983} <= $code && $code < $_cp{jisx02081983_last}) {
592     $code -= $_cp{jisx02081983};
593     unless ($_I2S_1{$code}) {
594     my ($c1, $c2) = (int($code / 94) + 0x21, ($code % 94) + 0x21);
595     $_I2S_1{$code} = pack('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xb1),
596     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
597     }
598     $ret = $_I2S_1{$code};
599     } elsif ($o{-x0208_90}
600     && $_cp{jisx02081990} <= $code && $code < $_cp{jisx02081990_last}) {
601     $code -= $_cp{jisx02081990};
602     unless ($_I2S_1{$code}) {
603     my ($c1, $c2) = (int($code / 94) + 0x21, ($code % 94) + 0x21);
604     $_I2S_1{$code} = pack('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xb1),
605     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
606     }
607     $ret = $_I2S_1{$code};
608     } elsif ($o{-x0213_A0_1}
609     && $_cp{jisx02132000_1} <=$code && $code < $_cp{jisx02132000_1_last}) {
610     $code -= $_cp{jisx02132000_1};
611     unless ($_I2S_1{$code}) {
612     my ($c1, $c2) = (int($code / 94) + 0x21, ($code % 94) + 0x21);
613     $_I2S_1{$code} = pack('CC', (($c1 - 1) >> 1) + ($c1 < 0x5F ? 0x71 : 0xb1),
614     $c2 + (($c1 & 1) ? ($c2 < 0x60 ? 0x1F : 0x20) : 0x7E));
615     }
616     $ret = $_I2S_1{$code};
617     } elsif ($o{-x0201_katakana}
618     && $_cp{jisx0201kana} <= $code && $code < $_cp{jisx0201kana_last1}) {
619     $_I2S{$code} = pack("C", $code - $_cp{jisx0201kana} + 0xA1)
620     unless $_I2S{$code};
621     $ret = $_I2S{$code};
622     } elsif ($o{-x0213_A0_2}
623     && ($_cp{jisx02132000_2} <= $code && $code < $_cp{jisx02132000_2_last})) {
624     $code -= $_cp{jisx02132000_2};
625     unless ($_I2S_2{$code}) {
626     my ($c1, $c2) = (int($code / 94), ($code % 94) + 0x21);
627     $_I2S_2{$code} = pack('CC', $_KU_to_SJIS_G3{ $c1 },
628     $c2 + (($c1 & 1) ? 0x7E : ($c2 < 0x60 ? 0x1F : 0x20)));
629     }
630     $ret = $_I2S_2{$code};
631     } elsif ($o{-imode} && $_cp{x_imode} <= $code && $code < $_cp{x_imode_last}) {
632     $code -= $_cp{x_imode_doti};
633     unless ($_I2S_2L{$code}) {
634     my $c2 = $code % 188; $_I2S_2L{$code} = pack('CC',
635     int($code / 188) + 0xF0, $c2 + ($c2 > 0x3E ? 0x41 : 0x40));
636     }
637     $ret = $_I2S_2L{$code};
638     } elsif ($o{-doti} && $_cp{x_doti} <= $code && $code < $_cp{x_doti_last}) {
639     $code -= $_cp{x_imode_doti};
640     unless ($_I2S_2L{$code}) {
641     my $c2 = $code % 188; $_I2S_2L{$code} = pack('CC',
642     int($code / 188) + 0xF0, $c2 + ($c2 > 0x3E ? 0x41 : 0x40));
643     }
644     $ret = $_I2S_2L{$code};
645     } else {
646     $ret = $o{-undefchar};
647     }
648     $ret;
649     }goex;
650     $s;
651     }
652    
653     =head1 SEE ALSO
654    
655     =over 4
656    
657     =item ASTEL Emoji
658    
659     <http://www.ttnet.co.jp/tokyodenwa_astel/doti/siyou/emoji.htm>
660 wakaba 1.1
661 wakaba 1.3 =back
662 wakaba 1.1
663     =head1 LICENSE
664    
665 wakaba 1.3 This program is free software; you can redistribute it and/or
666     modify it under the same terms as Perl itself.
667 wakaba 1.1
668 wakaba 1.3 =head1 CHANGE
669 wakaba 1.1
670 wakaba 1.3 See F<ChangeLog>.
671     $Date: JA.pm,v 1.2 2001/12/24 08:13:56 wakaba Exp $
672 wakaba 1.2
673 wakaba 1.1 =cut
674    
675     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24