/[pub]/test/oldencodeutils/lib/M17N/Code/JA.pm
Suika

Contents of /test/oldencodeutils/lib/M17N/Code/JA.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (show annotations) (download)
Mon Dec 24 07:55:51 2001 UTC (24 years ago) by wakaba
Branch: MAIN
2001-12-24  wakaba <wakaba@suika.fam.cx>

	* JA.pm: New file.
	
	* ChangeLog: New file.

1
2 package M17N::Code::JA;
3 use strict;
4 use vars qw(%re %internalo);
5
6 %re = (
7 94x1 => qr/\x1B\x28([\x40-\x7D])((?:[\x21-\x7E])*)/,
8 94x2 => qr/(?:\x1B\x26([\x40-\x7E]))?\x1B\x24\x28?([\x40-\x5F])((?:[\x21-\x7E][\x21-\x7E])*)/,
9 96x1 => qr/\x1B\x2C([\x40-\x7D])((?:[\x20-\x7F])*)/,
10 utf8 => qr/\x1B\x25(?:\x47|\x2F[\x47-\x49])((?:[\x20-\x7F]|(?:[\xC0-\xFD][\x80-\xBF]+))*)\x1B\x25\x40/,
11 );
12 %internalo = (
13 94x1 => 0xE90940,
14 94x2 => 0xE9F6C0,
15 96x1 => 0xE926A0,
16 jisx02081990 => 0xE9F6C0 + 94*94*79,
17 );
18 %internalo = (%internalo,
19 jisx0201kana => $internalo{94x1} + 94 * (0x49-0x30),
20 jisx0201kana_end => $internalo{94x1} + 94 * (0x49-0x30+1) -1,
21 jisx0201latin => $internalo{94x1} + 94 * (0x4A-0x30),
22 jisx0201latin_end => $internalo{94x1} + 94 * (0x4A-0x30+1) -1,
23 jisx02081978 => $internalo{94x2} + 94*94 * (0x40-0x30),
24 jisx02081978_end => $internalo{94x2} + 94*94 * (0x40-0x30+1) -1,
25 jisx02081983 => $internalo{94x2} + 94*94 * (0x42-0x30),
26 jisx02081983_end => $internalo{94x2} + 94*94 * (0x42-0x30+1) -1,
27 jisx02081990_end => $internalo{jisx02081990} + 94*94 -1,
28 jisx02121990 => $internalo{94x2} + 94*94 * (0x44-0x30),
29 jisx02121990_end => $internalo{94x2} + 94*94 * (0x44-0x30+1) -1,
30 jisx02132000_1 => $internalo{94x2} + 94*94 * (0x4F-0x30),
31 jisx02132000_1_end => $internalo{94x2} + 94*94 * (0x4F-0x30+1) -1,
32 jisx02132000_2 => $internalo{94x2} + 94*94 * (0x50-0x30),
33 jisx02132000_2_end => $internalo{94x2} + 94*94 * (0x50-0x30+1) -1,
34 );
35
36 my %_charset_name_n11n_table = (
37 euc => "euc-jp",
38 "euc-japan" => "euc-jp",
39 "euc-jisx0208" => "euc-jp",
40 "euc-jisx0213" => "euc-jp:2000",
41 jis => "junet",
42 "shift-jis" => "shift_jis",
43 "shift-jisx0213" => "shift_jis:2000",
44 shift_jisx0213 => "shift_jis:2000",
45 sjis => "shift_jis",
46 "x-euc-jisx0213" => "euc-jp:2000",
47 "x-euc-jp" => "euc-jp",
48 "x-shift_jisx0213" => "shift_jis:2000",
49 "x-sjis" => "shift_jis",
50 );
51 sub _charset_name_n11n ($;$) {
52 my $name = lc shift; my $year = shift;
53 $name = $_charset_name_n11n_table{$name} || $name;
54 if ($name =~ /^(.+):(\d+)$/) {
55 $name = $1;
56 $year ||= $2;
57 }
58 ($name, $year);
59 }
60
61 =head2 convert ($string, $output_code, $input_code, $options)
62
63 Convert coded charset of string.
64
65 =cut
66
67 sub convert ($;$$$) {
68 my ($string, $output_code, $input_code, $options) = @_;
69 $string = \$string unless ref $string;
70 my ($input_edition, $output_edition);
71 ($input_code, $input_edition) = _charset_name_n11n ($input_code);
72 ($output_code, $output_edition) = _charset_name_n11n ($output_code || "junet");
73 if ($input_code eq "euc-jp") {
74 eucjapan_to_internal ($string, edition => $input_edition);
75 } elsif ($input_code eq "junet" || $input_code =~ /^iso-2022-jp/) {
76 junet_to_internal ($string);
77 }
78
79 if ($output_code eq "euc-jp") {
80 internal_to_eucjapan ($string, edition => $output_edition);
81 } elsif ($output_code eq "junet" || $output_code =~ /^iso-2022-jp/) {
82 internal_to_junet ($string);
83 }
84 }
85
86 =head2 junet_to_internal ($string)
87
88 Convert junet coded string to internal coded string.
89
90 =cut
91
92 sub junet_to_internal ($) {
93 my $s = shift;
94 $s = \$s unless ref $s;
95 $$s =~ s{
96 ($re{94x1} | $re{94x2} | $re{96x1} | $re{utf8})
97 }{
98 my $st = $1;
99 if ($st =~ /$re{94x1}/) {
100 my ($f, $str) = ($1, $2);
101 $f = unpack 'C', $f;
102 $f = $f == 0x42 ? 0x21 : $internalo{94x1} + 94 * ($f - 0x30);
103 $str =~ s{([\x21-\x7E])}{
104 _u8($f + unpack('C', $1) - 0x21);
105 }goesx;
106 $st = $str;
107 } elsif ($st =~ /$re{94x2}/) {
108 my ($rev, $f, $str) = ($1, $2, $3);
109 if ($rev eq '@' && $f eq 'B') {
110 $f = $internalo{jisx02081990};
111 } else {
112 $f = unpack 'C', $f;
113 $f = $internalo{94x2} + 94*94 * ($f - 0x30);
114 }
115 $str =~ s{([\x21-\x7E])([\x21-\x7E])}{
116 _u8($f + (unpack('C', $1) - 0x21)*94 + unpack('C', $2) - 0x21);
117 }goesx;
118 $st = $str;
119 } elsif ($st =~ /$re{96x1}/) {
120 my ($f, $str) = ($1, $2);
121 $f = unpack 'C', $f;
122 $f = $f == 0x41 ? 0xA0 : $internalo{96x1} + 96 * ($f - 0x30);
123 $str =~ s{([\x20-\x7F])}{
124 _u8($f + unpack('C', $1) - 0x20);
125 }goesx;
126 $st = $str;
127 } elsif ($st =~ /$re{utf8}/) {
128 $st = $1;
129 }
130 $st;
131 }goesx;
132 $$s;
133 }
134
135 sub _u8 ($) {
136 my ($ret, $uc);
137 $uc = shift;
138 if ($uc < 0x80) { ## 1 byte
139 $ret .= chr($uc);
140 } elsif ($uc < 0x800) { ## 2 byte
141 $ret .= chr(0xC0 | ($uc >> 6))
142 . chr(0x80 | ($uc & 0x3F));
143 } elsif ($uc < 0x10000) { ## 3 byte
144 $ret .= chr(0xE0 | ($uc >> 12) )
145 . chr(0x80 | (($uc >> 6) & 0x3F))
146 . chr(0x80 | ($uc & 0x3F) );
147 } elsif ($uc < 0x200000) { ## 4 byte
148 $ret .= chr( 240 | ($uc >> 18) )
149 . chr(0x80 | (($uc >> 12) & 0x3F))
150 . chr(0x80 | (($uc >> 6) & 0x3F))
151 . chr(0x80 | ($uc & 0x3F) );
152 } elsif ($uc < 0x4000000) { ## 5 byte
153 $ret .= chr( 248 | ($uc >> 24) )
154 . chr(0x80 | (($uc >> 18) & 0x3F))
155 . chr(0x80 | (($uc >> 12) & 0x3F))
156 . chr(0x80 | (($uc >> 6) & 0x3F))
157 . chr(0x80 | ($uc & 0x3F) );
158 } elsif ($uc < 0x80000000) { ## 6 byte
159 $ret .= chr( 252 | ($uc >> 30) )
160 . chr(0x80 | (($uc >> 24) & 0x3F))
161 . chr(0x80 | (($uc >> 18) & 0x3F))
162 . chr(0x80 | (($uc >> 12) & 0x3F))
163 . chr(0x80 | (($uc >> 6) & 0x3F))
164 . chr(0x80 | ($uc & 0x3F) );
165 }
166 $ret;
167 }
168
169 sub _ucode ($) {
170 my $s = shift;
171 return unpack("C", $s) if length($s) < 2;
172 my ($iterations, $c, @c)
173 = (0, unpack("C", substr($s, 0, 1)), unpack('C*', substr($s, 1)));
174 if (($c & 0xFE) == 0xFC) {
175 $c = ($c & 0x01);
176 $iterations = 5;
177 } elsif (($c & 0xFC) == 0xF8) {
178 $c = ($c & 0x03);
179 $iterations = 4;
180 } elsif (($c & 0xF8) == 0xF0) {
181 $c = ($c & 0x07);
182 $iterations = 3;
183 } elsif (($c & 0xF0) == 0xE0) {
184 $c = ($c & 0x0F);
185 $iterations = 2;
186 } elsif (($c & 0xE0) == 0xC0) {
187 $c = ($c & 0x1F);
188 $iterations = 1;
189 }
190 if ($iterations == $#c+1) {
191 for (my $i = 0; $i < $iterations; $i++) {
192 $c = ($c << 6);
193 $c = ($c | ($c[$i] & 0x3F));
194 }
195 }
196 $c;
197 }
198
199 =head2 internal_to_eucjapan ($string, [%options])
200
201 Convert internal coded string to euc-japan coded string.
202
203 =head3 Example
204
205 internal_to_eucjapan (\$s, jisx0201kana => 1);
206
207 =cut
208
209 sub internal_to_eucjapan ($;%) {
210 my $s = shift;
211 $s = \$s unless ref $s;
212 my %output = @_;
213 my $year = $output{edition} || -1; ## -1 (= all unify), 1983, 1990 = 1997, 2000
214 $output{jisx02081978} = 1 if $year < 1983;
215 $output{jisx02081983} = 1 if $year == -1 || (1983 <= $year && $year < 1990);
216 $output{jisx02081990} = 1 if $year == -1 || (1990 <= $year && $year < 2000);
217 $output{jisx02121990} = 1 if $year == -1 || (1990 <= $year && $year < 2000);
218 $output{jisx02132000_1} = 1 if $year == -1 || (2000 <= $year);
219 $output{jisx02132000_2} = 1 if $year == -1 || (2000 <= $year);
220 $output{undefchar} ||= "\xA2\xAE";
221 $$s =~ s{([\xC0-\xFF][\x80-\xBF]*)}{
222 my $char = $1; my $code = _ucode($char); my $ret = "";
223 if ($code < 0x7F) { ## G0 = ASCII
224 $ret = pack("C", $code);
225 } elsif ($output{jisx0201latin} && $internalo{jisx0201latin} < $code
226 && $code < $internalo{jisx0201latin_end}) { ## G0 = JIS X 0201 latin
227 $ret = pack("C", $code - $internalo{jisx0201latin} + 0x21);
228 } elsif ($output{jisx02081978}
229 && ($internalo{jisx02081978} < $code && $code < $internalo{jisx02081978_end})
230 ) { ## G1 = JIS X 0208-1978
231 my $ku = $code - $internalo{jisx02081978};
232 my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
233 $ret = pack("CC", $ku, $ten);
234 } elsif ($output{jisx02081983}
235 && ($internalo{jisx02081983} < $code && $code < $internalo{jisx02081983_end})
236 ) { ## G1 = JIS X 0208-1983
237 my $ku = $code - $internalo{jisx02081983};
238 my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
239 $ret = pack("CC", $ku, $ten);
240 } elsif ($output{jisx02081990}
241 && ($internalo{jisx02081990} < $code && $code < $internalo{jisx02081990_end})
242 ) { ## G1 = JIS X 0208-1990 / JIS X 0208:1997
243 my $ku = $code - $internalo{jisx02081990};
244 my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
245 $ret = pack("CC", $ku, $ten);
246 } elsif ($output{jisx02132000_1}
247 && ($internalo{jisx02132000_1} < $code && $code < $internalo{jisx02132000_1_end})
248 ) { ## G1 = JIS X 0213:2000 plane 1
249 my $ku = $code - $internalo{jisx02132000_1};
250 my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
251 $ret = pack("CC", $ku, $ten);
252 } elsif ($output{jisx0201kana} && $internalo{jisx0201kana} < $code
253 && $code < $internalo{jisx0201kana_end}) { ## G2 = JIS X 0201 katakana
254 $ret = "\x8E" . pack("C", $code - $internalo{jisx0201kana} + 0xA1);
255 } elsif ($output{jisx02121990}
256 && ($internalo{jisx02121990} < $code && $code < $internalo{jisx02121990_end})
257 ) { ## G3 = JIS X 0213:2000 plane 2
258 my $ku = $code - $internalo{jisx02121990};
259 my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
260 $ret = "\x8F" . pack("CC", $ku, $ten);
261 } elsif ($output{jisx02132000_2}
262 && ($internalo{jisx02132000_2} < $code && $code < $internalo{jisx02132000_2_end})
263 ) { ## G3 = JIS X 0213:2000 plane 2
264 my $ku = $code - $internalo{jisx02132000_2};
265 my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
266 $ret = "\x8F" . pack("CC", $ku, $ten);
267 } else {
268 $ret = $output{undefchar};
269 }
270 $ret;
271 }goex;
272 $$s;
273 }
274
275 =head2 internal_to_junet ($sering, [%option])
276
277 Convert internal coded string to junet coded string.
278
279 =head3 options
280
281 * g0_96
282 * utf8
283
284 =cut
285
286 sub internal_to_junet ($;%) {
287 my $s = shift;
288 $s = \$s unless ref $s;
289 my %output = @_; my $mode = "\x1B\x28\x42";
290 $output{undefchar} ||= 0x3F-0x21;
291 $output{undefcharset} ||= "\x1B\x28\x42";
292 $$s =~ s{([\x00-\x7F]|(?:[\xC0-\xFF][\x80-\xBF]*))}{
293 my $char = $1; my $code = _ucode($char); my $ret = "";
294 if ($code < 0x9F) { ## ASCII
295 $ret = _2022_putchar(\$mode => "\x1B\x28\x42", $code-0x21);
296 } elsif ($code < 0xFF) { ## ISO 8859-1 right half
297 $ret = _2022_putchar(\$mode => "\x1B\x2C\x41", $code-0xA0);
298 } elsif ($internalo{94x1} < $code && $code < $internalo{94x1} + 94*78 -1) {
299 ## 94 charsets
300 my $final = pack("C", int(($code - $internalo{94x1}) / 94) + 0x30);
301 $ret = _2022_putchar(\$mode => "\x1B\x28".$final,
302 ($code - $internalo{94x1}) % 94);
303 } elsif ($output{g0_96}
304 && $internalo{96x1} < $code && $code < $internalo{96x1} + 96*78 -1) {
305 ## 96 charsets
306 my $final = pack("C", int(($code - $internalo{96x1}) / 96) + 0x30);
307 $ret = _2022_putchar(\$mode => "\x1B\x2C".$final,
308 ($code - $internalo{96x1}) % 96);
309 } elsif ($internalo{94x2} < $code && $code < $internalo{94x2} + 94*94*78 -1) {
310 ## 94x2 charsets
311 my $final = pack("C", int(($code - $internalo{94x2}) / (94*94)) + 0x30);
312 $ret = _2022_putchar(\$mode => "\x1B\x24\x28".$final,
313 ($code - $internalo{94x2}) % (94*94));
314 } elsif ($internalo{jisx02081990} < $code
315 && $code < $internalo{jisx02081990} + 94*94 -1) {
316 $ret = _2022_putchar(\$mode => "\x1B\x26\x40\x1B\x24B",
317 ($code - $internalo{94x2}) % (94*94));
318 } elsif ($output{utf8}) {
319 $ret = _2022_putchar(\$mode => "\x1B\x25G", $char)
320 } else {
321 $ret = _2022_putchar(\$mode => $output{undefcharset} => $output{undefchar});
322 }
323 $ret;
324 }goesx;
325 $$s;
326 }
327 sub _2022_putchar ($$$) {
328 my ($mode, $newmode, $char) = @_;
329 my $ret = ""; my %is;
330 $is{multibyte} = 1 if $newmode =~ /\x24/;
331 $is{set96} = 1 if $newmode =~ /\x2C/;
332 if ($$mode ne $newmode) {
333 if ($$mode eq "\x1B\x25G") {
334 $ret = "\x1B\x25\x40\x1B\x28\x42";
335 }
336 if ($is{multibyte} || $is{set96} || $newmode eq "\x1B\x25G") {
337 $ret .= "\x1B\x28\x42";
338 }
339 if ($newmode =~ /\x1B\x24\x28([\x40-\x42])/) {
340 $ret .= "\x1B\x24$1";
341 } else {
342 $ret .= $newmode;
343 }
344 $$mode = $newmode;
345 }
346 if ($is{multibyte} && $is{set96}) { ## 96x2
347 $ret .= pack("CC", int($char / 96) + 0x20, ($char % 96) + 0x20);
348 } elsif ($is{multibyte}) { ## 94x2
349 $ret .= pack("CC", int($char / 94) + 0x21, ($char % 94) + 0x21);
350 } elsif ($is{set96}) { ## 96x1
351 $ret .= pack("C", $char + 0x20);
352 } elsif ($newmode eq "\x1B\x25G") { ## utf-8
353 $ret .= $char; ## if utf-8, $char is char itself!
354 } else { ## 94x1
355 $ret .= pack("C", $char + 0x21);
356 }
357 $ret;
358 }
359
360 sub eucjapan_to_internal ($;%) {
361 my $s = shift;
362 $s = \$s unless ref $s;
363 my %option = @_;
364 my $year = $option{edition} || 2000; ## 1978, 1983, 1990=1997, 2000
365 $option{G0start} ||= 0x21; $option{G2start} ||= $internalo{jisx0201kana};
366 if ($option{G1start} && $option{G3start}) {}
367 elsif ($year < 1983)
368 {$option{G1start} ||= $internalo{jisx02081978}; $option{G3start} ||= -1}
369 elsif (1983 <= $year && $year < 1990)
370 {$option{G1start} ||= $internalo{jisx02081983}; $option{G3start} ||= -1}
371 elsif (1990 <= $year && $year < 2000)
372 {$option{G1start} ||= $internalo{jisx02081990}; $option{G3start} ||= -1}
373 else #elsif (2000 <= $year)
374 {$option{G1start} ||= $internalo{jisx02132000_1}; $option{G3start} ||= -1}
375 $$s =~ s{([\x21-\x7E]|\x8E[\xA1-\xFE]|\x8F?[\xA1-\xFE][\xA1-\xFE])}{
376 my $char = $1; my $ret = "";
377 if ($char =~ /[\x21-\x7E]/) {
378 $ret = _u8(unpack("C", $char) - 0x21 + $option{G0start});
379 } elsif ($char =~ /\x8F/) {
380 my $ku = unpack("C", substr($char,1,1)) - 0xA1;
381 my $ten = unpack("C", substr($char,2,1)) - 0xA1;
382 if ($option{G3start} == -1) {
383 if (77 <= $ku || $ku == 0 || $ku == 2 || $ku == 3 || $ku == 4
384 || $ku == 7 || $ku == 11 || $ku == 12 || $ku == 13 || $ku == 14) {
385 $ret = _u8($ku * 94 + $ten + $internalo{jisx02132000_2});
386 } else {
387 $ret = _u8($ku * 94 + $ten + $internalo{jisx02121990});
388 }
389 } else {
390 $ret = _u8($ku * 94 + $ten + $option{G3start});
391 }
392 } elsif ($char =~ /\x8E/) {
393 $ret = _u8(unpack("C", substr($char,1,1)) - 0xA1 + $option{G2start});
394 } else {
395 my $ku = unpack("C", substr($char,0,1)) - 0xA1;
396 my $ten = unpack("C", substr($char,1,1)) - 0xA1;
397 $ret = _u8($ku * 94 + $ten + $option{G1start});
398 }
399 $ret;
400 }goex;
401 }
402
403 =head1 NAME
404
405 M17N::Code::JA --- Japanese string coding system convertion
406
407 =head1 LICENSE
408
409 This program is free software; you can redistribute it and/or
410 modify it under the same terms as Perl itself.
411
412 =head1 AUTHOR
413
414 wakaba <wakaba@suika.fam.cx>
415
416 =cut
417
418 1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24