/[suikacvs]/test/oldencodeutils/lib/M17N/Code/JA.pm
Suika

Contents of /test/oldencodeutils/lib/M17N/Code/JA.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (hide annotations) (download)
Mon Dec 24 07:55:51 2001 UTC (23 years, 6 months ago) by wakaba
Branch: MAIN
2001-12-24  wakaba <wakaba@suika.fam.cx>

	* JA.pm: New file.
	
	* ChangeLog: New file.

1 wakaba 1.1
2     package M17N::Code::JA;
3     use strict;
4     use vars qw(%re %internalo);
5    
6     %re = (
7     94x1 => qr/\x1B\x28([\x40-\x7D])((?:[\x21-\x7E])*)/,
8     94x2 => qr/(?:\x1B\x26([\x40-\x7E]))?\x1B\x24\x28?([\x40-\x5F])((?:[\x21-\x7E][\x21-\x7E])*)/,
9     96x1 => qr/\x1B\x2C([\x40-\x7D])((?:[\x20-\x7F])*)/,
10     utf8 => qr/\x1B\x25(?:\x47|\x2F[\x47-\x49])((?:[\x20-\x7F]|(?:[\xC0-\xFD][\x80-\xBF]+))*)\x1B\x25\x40/,
11     );
12     %internalo = (
13     94x1 => 0xE90940,
14     94x2 => 0xE9F6C0,
15     96x1 => 0xE926A0,
16     jisx02081990 => 0xE9F6C0 + 94*94*79,
17     );
18     %internalo = (%internalo,
19     jisx0201kana => $internalo{94x1} + 94 * (0x49-0x30),
20     jisx0201kana_end => $internalo{94x1} + 94 * (0x49-0x30+1) -1,
21     jisx0201latin => $internalo{94x1} + 94 * (0x4A-0x30),
22     jisx0201latin_end => $internalo{94x1} + 94 * (0x4A-0x30+1) -1,
23     jisx02081978 => $internalo{94x2} + 94*94 * (0x40-0x30),
24     jisx02081978_end => $internalo{94x2} + 94*94 * (0x40-0x30+1) -1,
25     jisx02081983 => $internalo{94x2} + 94*94 * (0x42-0x30),
26     jisx02081983_end => $internalo{94x2} + 94*94 * (0x42-0x30+1) -1,
27     jisx02081990_end => $internalo{jisx02081990} + 94*94 -1,
28     jisx02121990 => $internalo{94x2} + 94*94 * (0x44-0x30),
29     jisx02121990_end => $internalo{94x2} + 94*94 * (0x44-0x30+1) -1,
30     jisx02132000_1 => $internalo{94x2} + 94*94 * (0x4F-0x30),
31     jisx02132000_1_end => $internalo{94x2} + 94*94 * (0x4F-0x30+1) -1,
32     jisx02132000_2 => $internalo{94x2} + 94*94 * (0x50-0x30),
33     jisx02132000_2_end => $internalo{94x2} + 94*94 * (0x50-0x30+1) -1,
34     );
35    
36     my %_charset_name_n11n_table = (
37     euc => "euc-jp",
38     "euc-japan" => "euc-jp",
39     "euc-jisx0208" => "euc-jp",
40     "euc-jisx0213" => "euc-jp:2000",
41     jis => "junet",
42     "shift-jis" => "shift_jis",
43     "shift-jisx0213" => "shift_jis:2000",
44     shift_jisx0213 => "shift_jis:2000",
45     sjis => "shift_jis",
46     "x-euc-jisx0213" => "euc-jp:2000",
47     "x-euc-jp" => "euc-jp",
48     "x-shift_jisx0213" => "shift_jis:2000",
49     "x-sjis" => "shift_jis",
50     );
51     sub _charset_name_n11n ($;$) {
52     my $name = lc shift; my $year = shift;
53     $name = $_charset_name_n11n_table{$name} || $name;
54     if ($name =~ /^(.+):(\d+)$/) {
55     $name = $1;
56     $year ||= $2;
57     }
58     ($name, $year);
59     }
60    
61     =head2 convert ($string, $output_code, $input_code, $options)
62    
63     Convert coded charset of string.
64    
65     =cut
66    
67     sub convert ($;$$$) {
68     my ($string, $output_code, $input_code, $options) = @_;
69     $string = \$string unless ref $string;
70     my ($input_edition, $output_edition);
71     ($input_code, $input_edition) = _charset_name_n11n ($input_code);
72     ($output_code, $output_edition) = _charset_name_n11n ($output_code || "junet");
73     if ($input_code eq "euc-jp") {
74     eucjapan_to_internal ($string, edition => $input_edition);
75     } elsif ($input_code eq "junet" || $input_code =~ /^iso-2022-jp/) {
76     junet_to_internal ($string);
77     }
78    
79     if ($output_code eq "euc-jp") {
80     internal_to_eucjapan ($string, edition => $output_edition);
81     } elsif ($output_code eq "junet" || $output_code =~ /^iso-2022-jp/) {
82     internal_to_junet ($string);
83     }
84     }
85    
86     =head2 junet_to_internal ($string)
87    
88     Convert junet coded string to internal coded string.
89    
90     =cut
91    
92     sub junet_to_internal ($) {
93     my $s = shift;
94     $s = \$s unless ref $s;
95     $$s =~ s{
96     ($re{94x1} | $re{94x2} | $re{96x1} | $re{utf8})
97     }{
98     my $st = $1;
99     if ($st =~ /$re{94x1}/) {
100     my ($f, $str) = ($1, $2);
101     $f = unpack 'C', $f;
102     $f = $f == 0x42 ? 0x21 : $internalo{94x1} + 94 * ($f - 0x30);
103     $str =~ s{([\x21-\x7E])}{
104     _u8($f + unpack('C', $1) - 0x21);
105     }goesx;
106     $st = $str;
107     } elsif ($st =~ /$re{94x2}/) {
108     my ($rev, $f, $str) = ($1, $2, $3);
109     if ($rev eq '@' && $f eq 'B') {
110     $f = $internalo{jisx02081990};
111     } else {
112     $f = unpack 'C', $f;
113     $f = $internalo{94x2} + 94*94 * ($f - 0x30);
114     }
115     $str =~ s{([\x21-\x7E])([\x21-\x7E])}{
116     _u8($f + (unpack('C', $1) - 0x21)*94 + unpack('C', $2) - 0x21);
117     }goesx;
118     $st = $str;
119     } elsif ($st =~ /$re{96x1}/) {
120     my ($f, $str) = ($1, $2);
121     $f = unpack 'C', $f;
122     $f = $f == 0x41 ? 0xA0 : $internalo{96x1} + 96 * ($f - 0x30);
123     $str =~ s{([\x20-\x7F])}{
124     _u8($f + unpack('C', $1) - 0x20);
125     }goesx;
126     $st = $str;
127     } elsif ($st =~ /$re{utf8}/) {
128     $st = $1;
129     }
130     $st;
131     }goesx;
132     $$s;
133     }
134    
135     sub _u8 ($) {
136     my ($ret, $uc);
137     $uc = shift;
138     if ($uc < 0x80) { ## 1 byte
139     $ret .= chr($uc);
140     } elsif ($uc < 0x800) { ## 2 byte
141     $ret .= chr(0xC0 | ($uc >> 6))
142     . chr(0x80 | ($uc & 0x3F));
143     } elsif ($uc < 0x10000) { ## 3 byte
144     $ret .= chr(0xE0 | ($uc >> 12) )
145     . chr(0x80 | (($uc >> 6) & 0x3F))
146     . chr(0x80 | ($uc & 0x3F) );
147     } elsif ($uc < 0x200000) { ## 4 byte
148     $ret .= chr( 240 | ($uc >> 18) )
149     . chr(0x80 | (($uc >> 12) & 0x3F))
150     . chr(0x80 | (($uc >> 6) & 0x3F))
151     . chr(0x80 | ($uc & 0x3F) );
152     } elsif ($uc < 0x4000000) { ## 5 byte
153     $ret .= chr( 248 | ($uc >> 24) )
154     . chr(0x80 | (($uc >> 18) & 0x3F))
155     . chr(0x80 | (($uc >> 12) & 0x3F))
156     . chr(0x80 | (($uc >> 6) & 0x3F))
157     . chr(0x80 | ($uc & 0x3F) );
158     } elsif ($uc < 0x80000000) { ## 6 byte
159     $ret .= chr( 252 | ($uc >> 30) )
160     . chr(0x80 | (($uc >> 24) & 0x3F))
161     . chr(0x80 | (($uc >> 18) & 0x3F))
162     . chr(0x80 | (($uc >> 12) & 0x3F))
163     . chr(0x80 | (($uc >> 6) & 0x3F))
164     . chr(0x80 | ($uc & 0x3F) );
165     }
166     $ret;
167     }
168    
169     sub _ucode ($) {
170     my $s = shift;
171     return unpack("C", $s) if length($s) < 2;
172     my ($iterations, $c, @c)
173     = (0, unpack("C", substr($s, 0, 1)), unpack('C*', substr($s, 1)));
174     if (($c & 0xFE) == 0xFC) {
175     $c = ($c & 0x01);
176     $iterations = 5;
177     } elsif (($c & 0xFC) == 0xF8) {
178     $c = ($c & 0x03);
179     $iterations = 4;
180     } elsif (($c & 0xF8) == 0xF0) {
181     $c = ($c & 0x07);
182     $iterations = 3;
183     } elsif (($c & 0xF0) == 0xE0) {
184     $c = ($c & 0x0F);
185     $iterations = 2;
186     } elsif (($c & 0xE0) == 0xC0) {
187     $c = ($c & 0x1F);
188     $iterations = 1;
189     }
190     if ($iterations == $#c+1) {
191     for (my $i = 0; $i < $iterations; $i++) {
192     $c = ($c << 6);
193     $c = ($c | ($c[$i] & 0x3F));
194     }
195     }
196     $c;
197     }
198    
199     =head2 internal_to_eucjapan ($string, [%options])
200    
201     Convert internal coded string to euc-japan coded string.
202    
203     =head3 Example
204    
205     internal_to_eucjapan (\$s, jisx0201kana => 1);
206    
207     =cut
208    
209     sub internal_to_eucjapan ($;%) {
210     my $s = shift;
211     $s = \$s unless ref $s;
212     my %output = @_;
213     my $year = $output{edition} || -1; ## -1 (= all unify), 1983, 1990 = 1997, 2000
214     $output{jisx02081978} = 1 if $year < 1983;
215     $output{jisx02081983} = 1 if $year == -1 || (1983 <= $year && $year < 1990);
216     $output{jisx02081990} = 1 if $year == -1 || (1990 <= $year && $year < 2000);
217     $output{jisx02121990} = 1 if $year == -1 || (1990 <= $year && $year < 2000);
218     $output{jisx02132000_1} = 1 if $year == -1 || (2000 <= $year);
219     $output{jisx02132000_2} = 1 if $year == -1 || (2000 <= $year);
220     $output{undefchar} ||= "\xA2\xAE";
221     $$s =~ s{([\xC0-\xFF][\x80-\xBF]*)}{
222     my $char = $1; my $code = _ucode($char); my $ret = "";
223     if ($code < 0x7F) { ## G0 = ASCII
224     $ret = pack("C", $code);
225     } elsif ($output{jisx0201latin} && $internalo{jisx0201latin} < $code
226     && $code < $internalo{jisx0201latin_end}) { ## G0 = JIS X 0201 latin
227     $ret = pack("C", $code - $internalo{jisx0201latin} + 0x21);
228     } elsif ($output{jisx02081978}
229     && ($internalo{jisx02081978} < $code && $code < $internalo{jisx02081978_end})
230     ) { ## G1 = JIS X 0208-1978
231     my $ku = $code - $internalo{jisx02081978};
232     my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
233     $ret = pack("CC", $ku, $ten);
234     } elsif ($output{jisx02081983}
235     && ($internalo{jisx02081983} < $code && $code < $internalo{jisx02081983_end})
236     ) { ## G1 = JIS X 0208-1983
237     my $ku = $code - $internalo{jisx02081983};
238     my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
239     $ret = pack("CC", $ku, $ten);
240     } elsif ($output{jisx02081990}
241     && ($internalo{jisx02081990} < $code && $code < $internalo{jisx02081990_end})
242     ) { ## G1 = JIS X 0208-1990 / JIS X 0208:1997
243     my $ku = $code - $internalo{jisx02081990};
244     my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
245     $ret = pack("CC", $ku, $ten);
246     } elsif ($output{jisx02132000_1}
247     && ($internalo{jisx02132000_1} < $code && $code < $internalo{jisx02132000_1_end})
248     ) { ## G1 = JIS X 0213:2000 plane 1
249     my $ku = $code - $internalo{jisx02132000_1};
250     my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
251     $ret = pack("CC", $ku, $ten);
252     } elsif ($output{jisx0201kana} && $internalo{jisx0201kana} < $code
253     && $code < $internalo{jisx0201kana_end}) { ## G2 = JIS X 0201 katakana
254     $ret = "\x8E" . pack("C", $code - $internalo{jisx0201kana} + 0xA1);
255     } elsif ($output{jisx02121990}
256     && ($internalo{jisx02121990} < $code && $code < $internalo{jisx02121990_end})
257     ) { ## G3 = JIS X 0213:2000 plane 2
258     my $ku = $code - $internalo{jisx02121990};
259     my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
260     $ret = "\x8F" . pack("CC", $ku, $ten);
261     } elsif ($output{jisx02132000_2}
262     && ($internalo{jisx02132000_2} < $code && $code < $internalo{jisx02132000_2_end})
263     ) { ## G3 = JIS X 0213:2000 plane 2
264     my $ku = $code - $internalo{jisx02132000_2};
265     my $ten = ($ku % 94) + 0xA1; $ku = int($ku / 94) + 0xA1;
266     $ret = "\x8F" . pack("CC", $ku, $ten);
267     } else {
268     $ret = $output{undefchar};
269     }
270     $ret;
271     }goex;
272     $$s;
273     }
274    
275     =head2 internal_to_junet ($sering, [%option])
276    
277     Convert internal coded string to junet coded string.
278    
279     =head3 options
280    
281     * g0_96
282     * utf8
283    
284     =cut
285    
286     sub internal_to_junet ($;%) {
287     my $s = shift;
288     $s = \$s unless ref $s;
289     my %output = @_; my $mode = "\x1B\x28\x42";
290     $output{undefchar} ||= 0x3F-0x21;
291     $output{undefcharset} ||= "\x1B\x28\x42";
292     $$s =~ s{([\x00-\x7F]|(?:[\xC0-\xFF][\x80-\xBF]*))}{
293     my $char = $1; my $code = _ucode($char); my $ret = "";
294     if ($code < 0x9F) { ## ASCII
295     $ret = _2022_putchar(\$mode => "\x1B\x28\x42", $code-0x21);
296     } elsif ($code < 0xFF) { ## ISO 8859-1 right half
297     $ret = _2022_putchar(\$mode => "\x1B\x2C\x41", $code-0xA0);
298     } elsif ($internalo{94x1} < $code && $code < $internalo{94x1} + 94*78 -1) {
299     ## 94 charsets
300     my $final = pack("C", int(($code - $internalo{94x1}) / 94) + 0x30);
301     $ret = _2022_putchar(\$mode => "\x1B\x28".$final,
302     ($code - $internalo{94x1}) % 94);
303     } elsif ($output{g0_96}
304     && $internalo{96x1} < $code && $code < $internalo{96x1} + 96*78 -1) {
305     ## 96 charsets
306     my $final = pack("C", int(($code - $internalo{96x1}) / 96) + 0x30);
307     $ret = _2022_putchar(\$mode => "\x1B\x2C".$final,
308     ($code - $internalo{96x1}) % 96);
309     } elsif ($internalo{94x2} < $code && $code < $internalo{94x2} + 94*94*78 -1) {
310     ## 94x2 charsets
311     my $final = pack("C", int(($code - $internalo{94x2}) / (94*94)) + 0x30);
312     $ret = _2022_putchar(\$mode => "\x1B\x24\x28".$final,
313     ($code - $internalo{94x2}) % (94*94));
314     } elsif ($internalo{jisx02081990} < $code
315     && $code < $internalo{jisx02081990} + 94*94 -1) {
316     $ret = _2022_putchar(\$mode => "\x1B\x26\x40\x1B\x24B",
317     ($code - $internalo{94x2}) % (94*94));
318     } elsif ($output{utf8}) {
319     $ret = _2022_putchar(\$mode => "\x1B\x25G", $char)
320     } else {
321     $ret = _2022_putchar(\$mode => $output{undefcharset} => $output{undefchar});
322     }
323     $ret;
324     }goesx;
325     $$s;
326     }
327     sub _2022_putchar ($$$) {
328     my ($mode, $newmode, $char) = @_;
329     my $ret = ""; my %is;
330     $is{multibyte} = 1 if $newmode =~ /\x24/;
331     $is{set96} = 1 if $newmode =~ /\x2C/;
332     if ($$mode ne $newmode) {
333     if ($$mode eq "\x1B\x25G") {
334     $ret = "\x1B\x25\x40\x1B\x28\x42";
335     }
336     if ($is{multibyte} || $is{set96} || $newmode eq "\x1B\x25G") {
337     $ret .= "\x1B\x28\x42";
338     }
339     if ($newmode =~ /\x1B\x24\x28([\x40-\x42])/) {
340     $ret .= "\x1B\x24$1";
341     } else {
342     $ret .= $newmode;
343     }
344     $$mode = $newmode;
345     }
346     if ($is{multibyte} && $is{set96}) { ## 96x2
347     $ret .= pack("CC", int($char / 96) + 0x20, ($char % 96) + 0x20);
348     } elsif ($is{multibyte}) { ## 94x2
349     $ret .= pack("CC", int($char / 94) + 0x21, ($char % 94) + 0x21);
350     } elsif ($is{set96}) { ## 96x1
351     $ret .= pack("C", $char + 0x20);
352     } elsif ($newmode eq "\x1B\x25G") { ## utf-8
353     $ret .= $char; ## if utf-8, $char is char itself!
354     } else { ## 94x1
355     $ret .= pack("C", $char + 0x21);
356     }
357     $ret;
358     }
359    
360     sub eucjapan_to_internal ($;%) {
361     my $s = shift;
362     $s = \$s unless ref $s;
363     my %option = @_;
364     my $year = $option{edition} || 2000; ## 1978, 1983, 1990=1997, 2000
365     $option{G0start} ||= 0x21; $option{G2start} ||= $internalo{jisx0201kana};
366     if ($option{G1start} && $option{G3start}) {}
367     elsif ($year < 1983)
368     {$option{G1start} ||= $internalo{jisx02081978}; $option{G3start} ||= -1}
369     elsif (1983 <= $year && $year < 1990)
370     {$option{G1start} ||= $internalo{jisx02081983}; $option{G3start} ||= -1}
371     elsif (1990 <= $year && $year < 2000)
372     {$option{G1start} ||= $internalo{jisx02081990}; $option{G3start} ||= -1}
373     else #elsif (2000 <= $year)
374     {$option{G1start} ||= $internalo{jisx02132000_1}; $option{G3start} ||= -1}
375     $$s =~ s{([\x21-\x7E]|\x8E[\xA1-\xFE]|\x8F?[\xA1-\xFE][\xA1-\xFE])}{
376     my $char = $1; my $ret = "";
377     if ($char =~ /[\x21-\x7E]/) {
378     $ret = _u8(unpack("C", $char) - 0x21 + $option{G0start});
379     } elsif ($char =~ /\x8F/) {
380     my $ku = unpack("C", substr($char,1,1)) - 0xA1;
381     my $ten = unpack("C", substr($char,2,1)) - 0xA1;
382     if ($option{G3start} == -1) {
383     if (77 <= $ku || $ku == 0 || $ku == 2 || $ku == 3 || $ku == 4
384     || $ku == 7 || $ku == 11 || $ku == 12 || $ku == 13 || $ku == 14) {
385     $ret = _u8($ku * 94 + $ten + $internalo{jisx02132000_2});
386     } else {
387     $ret = _u8($ku * 94 + $ten + $internalo{jisx02121990});
388     }
389     } else {
390     $ret = _u8($ku * 94 + $ten + $option{G3start});
391     }
392     } elsif ($char =~ /\x8E/) {
393     $ret = _u8(unpack("C", substr($char,1,1)) - 0xA1 + $option{G2start});
394     } else {
395     my $ku = unpack("C", substr($char,0,1)) - 0xA1;
396     my $ten = unpack("C", substr($char,1,1)) - 0xA1;
397     $ret = _u8($ku * 94 + $ten + $option{G1start});
398     }
399     $ret;
400     }goex;
401     }
402    
403     =head1 NAME
404    
405     M17N::Code::JA --- Japanese string coding system convertion
406    
407     =head1 LICENSE
408    
409     This program is free software; you can redistribute it and/or
410     modify it under the same terms as Perl itself.
411    
412     =head1 AUTHOR
413    
414     wakaba <wakaba@suika.fam.cx>
415    
416     =cut
417    
418     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24