/[suikacvs]/perl/lib/Encode/ISO2022.pm
Suika

Contents of /perl/lib/Encode/ISO2022.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (show annotations) (download)
Mon Oct 14 06:58:35 2002 UTC (23 years, 3 months ago) by wakaba
Branch: MAIN
Changes since 1.8: +32 -29 lines
2002-10-14  Nanashi-san

	* ISO2022.pm, SJIS.pm: Bug fix of utf8 flag control.
	(Committed by Wakaba <w@suika.fam.cx>.)

1
2 =head1 NAME
3
4 Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder
5
6 =head1 ENCODINGS
7
8 =over 4
9
10 =item iso2022
11
12 ISO/IEC 2022:1994. Default status is:
13
14 =over 2
15
16 =item CL = C0 = ISO/IEC 6429:1991 C0 set
17
18 =item CR = C1 = ISO/IEC 6429:1991 C1 set
19
20 =item GL = G0 = ISO/IEC 646:1991 IRV GL(G0) set
21
22 =item GR = G1 = empty set
23
24 =item G2 = empty set
25
26 =item G3 = empty set
27
28 =back
29
30 (Alias: iso/iec2022, iso-2022, 2022, cp2022)
31
32 =back
33
34 Note that ISO/IEC 2022 based encodings are found in
35 Encode::ISO2022::* modules. This module, Encode::ISO2022
36 only provides a general ISO/IEC 2022 encoder/decoder.
37
38 =cut
39
40 require v5.7.3;
41 package Encode::ISO2022;
42 use strict;
43 use vars qw(%CHARSET %CODING_SYSTEM $VERSION);
44 $VERSION=do{my @r=(q$Revision: 1.8 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
45 use base qw(Encode::Encoding);
46 __PACKAGE__->Define (qw!iso-2022 iso/iec2022 iso2022 2022 cp2022!);
47 require Encode::Charset;
48 *CHARSET = \%Encode::Charset::CHARSET;
49 *CODING_SYSTEM = \%Encode::Charset::CODING_SYSTEM;
50
51 ### --- Perl Encode module common functions
52
53 sub encode ($$;$) {
54 my ($obj, $str, $chk) = @_;
55 $_[1] = '' if $chk;
56 $str = &internal_to_iso2022 ($str);
57 return $str;
58 }
59
60 sub decode ($$;$) {
61 my ($obj, $str, $chk) = @_;
62 $_[1] = '' if $chk;
63 return &iso2022_to_internal ($str);
64 }
65
66 ### --- Encode::ISO2022 unique functions
67 *new_object = \&Encode::Charset::new_object;
68
69 sub iso2022_to_internal ($;\%) {
70 my ($s, $C) = @_;
71 $C ||= &new_object;
72 my $t = '';
73 $s =~ s{^((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)}{
74 my $i2 = $1;
75 $t = _iso2022_to_internal ($i2, $C);
76 '';
77 }es;
78 my $pad = '';
79 use re 'eval';
80 $s =~ s{
81 ## ISO/IEC 2022
82 (??{"$pad\x1B$pad\x25$pad\x40"})((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
83 ## UTF-8
84 |(??{"$pad\x1B$pad\x25$pad(?:\x47|\x2F$pad"."[\x47-\x49])"})
85 ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
86 ## UCS-2, UTF-16
87 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x40\x43\x45\x4A-\x4C])
88 ((?:(?!\x00\x1B\x00\x25(?:\x00\x2F)?\x00[\x30-\x7E])..)*)
89 ## UCS-4
90 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})[\x41\x44\x46]
91 ((?:(?!\x00\x00\x00\x1B\x00\x00\x00\x25(?:\x00\x00\x00\x2F)?
92 \x00\x00\x00[\x30-\x7E])....)*)
93 ## with standard return
94 |(??{"$pad\x1B$pad\x25$pad"})([\x30-\x7E])
95 ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
96 ## without standard return
97 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x30-\x7E])(.*)
98 }{
99 my ($i2,$u8,$Fu2,$u2,$u4,$Fsr,$sr,$Fnsr,$nsr) = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
100 my $r = '';
101 if (defined $i2) {
102 $r = _iso2022_to_internal ($i2, $C); $pad = '';
103 } elsif (defined $u8) {
104 $r = Encode::decode ('utf8', $u8); $pad = '';
105 } elsif ($Fu2) {
106 if (ord ($Fu2) > 0x49) {
107 $r = Encode::decode ('utf-16be', $u2);
108 } else {
109 $r = Encode::decode ('ucs-2be', $u2);
110 }
111 $pad = "\x00";
112 } elsif (defined $u4) {
113 $r = Encode::decode ('ucs-4be', $u2); $pad = "\x00\x00\x00";
114 } elsif (defined $Fsr && $CODING_SYSTEM{$Fsr}->{perl_name}) {
115 $r = Encode::decode ($CODING_SYSTEM{$Fsr}->{perl_name}, $sr); $pad = '';
116 } elsif (defined $Fnsr && $CODING_SYSTEM{$Fnsr}->{perl_name}) {
117 $r = Encode::decode ($CODING_SYSTEM{$Fnsr}->{perl_name}, $nsr); $pad = '';
118 } else { ## temporary
119 $r = '?' x length ($sr.$nsr); $pad = '';
120 }
121 $r;
122 }gesx;
123 $t . $s;
124 }
125
126 # this is very very trickey. my perl 5.8.0 does not process
127 # regex with eval except the first time (i think it's a bug
128 # of perl), so we redefine this function whenever being called!
129 # when this unexpected behavior is fixed or someone finds
130 # better way to avoid it, we will rewrite this code.
131 &_iso2022_to_internal (undef);
132 sub _iso2022_to_internal ($;\%) {
133 eval q{ sub __iso2022_to_internal ($;\%) { 0 } };
134 eval q{
135 sub __iso2022_to_internal ($;\%) {
136 use re 'eval';
137 my ($s, $C) = @_;
138 my %_GB_to_GN = (
139 "\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3',
140 "\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3',
141 );
142 my %_CHARS_to_RANGE = (
143 l94 => q/[\x21-\x7E]/, l96 => q/[\x20-\x7F]/,
144 l128 => q/[\x00-\x7F]/, l256 => q/[\x00-\xFF]/,
145 r94 => q/[\xA1-\xFE]/, r96 => q/[\xA0-\xFF]/,
146 r128 => q/[\x80-\xFF]/, r256 => q/[\x80-\xFF]/,
147 b94 => q/[\x21-\x7E\xA1-\xFE]/, b96 => q/[\x20-\x7F\xA0-\xFF]/,
148 b128 => q/[\x00-\xFF]/, b256 => q/[\x00-\xFF]/,
149 );
150
151 $s =~ s{
152 ((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}}
153 . qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ }))
154 |((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GR}}->{chars}}
155 . qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ }))
156 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)')
157 . ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ?
158 qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '')
159 . ($C->{$C->{CL}}->{r_SS2} ? qq/|$C->{$C->{CL}}->{r_SS2}/ : '') . q/)/
160 . ( $C->{$C->{CL}}->{r_LS0}
161 ||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
162 qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
163 })
164 ((??{ $_CHARS_to_RANGE{'b'.$C->{G2}->{chars}}
165 . qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ }))
166 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)')
167 . ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ?
168 qq/|$C->{$C->{ESC_Fe}}->{r_SS3_ESC}/ : '')
169 . ($C->{$C->{CL}}->{r_SS3} ? qq/|$C->{$C->{CL}}->{r_SS3}/ : '') . q/)/
170 . ( $C->{$C->{CL}}->{r_LS0}
171 || $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
172 qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
173 })
174 ((??{ $_CHARS_to_RANGE{'b'.$C->{G3}->{chars}}
175 . qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ }))
176
177 ## Locking shift
178 |( (??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })
179 |(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })
180 )
181
182 ## Control sequence
183 |(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)')
184 .($C->{$C->{ESC_Fe}}->{r_CSI_ESC} ?
185 qq/|$C->{$C->{ESC_Fe}}->{r_CSI_ESC}/: '')
186 .')'
187 })
188 ((??{ qq/[\x30-\x3F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xB0-\xBF]*/
189 .qq/[\x20-\x2F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xA0-\xAF]*/
190 }) [\x40-\x7E\xD0-\xFE])
191
192 ## Other escape sequence
193 |(\x1B[\x20-\x2F]*[\x30-\x7E])
194
195 ## Misc. sequence (SP, control, or broken data)
196 |([\x00-\xFF])
197 }{
198 my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc)
199 = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
200 $C->{_irr} = undef unless defined $esc;
201 ## GL graphic character
202 if (defined $gl) {
203 my $c = 0;
204 my $m = $C->{$C->{GL}}->{chars}==94?0x21:$C->{$C->{GL}}->{chars}==96?0x20:0;
205 for (split //, $gl) {
206 $c = $c * $C->{$C->{GL}}->{chars} + unpack ('C', $_) - $m;
207 }
208 chr ($C->{$C->{GL}}->{ucs} + $c);
209 ## Control, SP, or broken data
210 ## TODO: support control sets other than ISO/IEC 6429's
211 } elsif (defined $misc) {
212 $misc;
213 ## GR graphic character
214 } elsif ($gr) {
215 my $c = 0;
216 my $m = $C->{$C->{GR}}->{chars}==94?0xA1:$C->{$C->{GR}}->{chars}==96?0xA0:0x80;
217 for (split //, $gr) {
218 $c = $c * $C->{$C->{GR}}->{chars} + unpack ('C', $_) - $m;
219 }
220 chr ($C->{$C->{GR}}->{ucs} + $c);
221 ## Graphic character with SS2
222 } elsif ($ss2) {
223 $ss2 =~ tr/\x80-\xFF/\x00-\x7F/;
224 my $c = 0; my $m = $C->{G2}->{chars}==94?0x21:$C->{G2}->{chars}==96?0x20:0;
225 for (split //, $ss2) {
226 $c = $c * $C->{G2}->{chars} + unpack ('C', $_) - $m;
227 }
228 chr ($C->{G2}->{ucs} + $c);
229 ## Graphic character with SS3
230 } elsif ($ss3) {
231 $ss3 =~ tr/\x80-\xFF/\x00-\x7F/;
232 my $c = 0; my $m = $C->{G3}->{chars}==94?0x21:$C->{G3}->{chars}==96?0x20:0;
233 for (split //, $ss3) {
234 $c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m;
235 }
236 chr ($C->{G3}->{ucs} + $c);
237 ## Escape sequence
238 } elsif ($esc) {
239 ## IRR (revision number)
240 if ($esc =~ /\x1B\x26([\x40-\x7E])/) {
241 $C->{_irr} = $1; $esc = '';
242 } else {
243 $esc =~ s{
244 \x1B([\x28-\x2B])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^1
245 |\x1B\x24([\x28-\x2B]?)(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^n
246
247 |\x1B([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^1
248 |\x1B\x24([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^n
249
250 |\x1B([\x40-\x5F]) ## ESC Fe
251
252 |\x1B\x21([\x21-\x23]?[\x30-\x7E]) ## CL = C0
253 |\x1B\x22([\x21-\x23]?[\x30-\x7E]) ## CR & ESC Fe = C1
254
255 |\x1B([\x60-\x7E]) ## Single control functions
256 |\x1B\x23([\x21-\x23]?)([\x30-\x7E])
257
258 |\x1B\x20([\x40-\x7E]) ## Announcer
259 }{
260 my ($g94_g,$g94_f,$g94n_g,$g94n_f,$g96_g,$g96_f,$g96n_g,$g96n_f,$Fe,
261 $CZD, $C1D, $Fs, $sI, $sF,$ACS)
262 = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15);
263 my $rev = $C->{_irr} || '';
264 if ($g94_g) {
265 $C->{ $_GB_to_GN{ $g94_g } } = $CHARSET{G94}->{ $g94_f.$rev }
266 || $CHARSET{G94}->{ $g94_f }
267 || $CHARSET{G94}->{ "\x7E" }; '';
268 } elsif (defined $g94n_f) {
269 $C->{ $_GB_to_GN{ $g94n_g } || 'G0' } = $CHARSET{G94n}->{ $g94n_f.$rev }
270 || $CHARSET{G94n}->{ $g94n_f }
271 || $CHARSET{G94n}->{ "\x7E" }; '';
272 } elsif ($g96_g) {
273 $C->{ $_GB_to_GN{ $g96_g } } = $CHARSET{G96}->{ $g96_f.$rev }
274 || $CHARSET{G96}->{ $g96_f }
275 || $CHARSET{G96}->{ "\x7E" }; '';
276 } elsif (defined $g96n_f) {
277 $C->{ $_GB_to_GN{ $g96n_g } } = $CHARSET{G96n}->{ $g96n_f.$rev }
278 || $CHARSET{G96n}->{ $g96n_f }
279 || $CHARSET{G96n}->{ "\x7E" }; '';
280 } elsif ($Fe) { ## ESC Fe => C1
281 chr ($C->{ $C->{ESC_Fe} }->{ucs} + (ord ($Fe) - 0x40));
282 } elsif ($CZD) {
283 $C->{C0} = $CHARSET{C0}->{ $CZD.$rev }
284 || $CHARSET{C0}->{ $CZD } || $CHARSET{C0}->{ "\x7E" }; '';
285 } elsif ($C1D) {
286 $C->{C1} = $CHARSET{C1}->{ $C1D.$rev }
287 || $CHARSET{C1}->{ $C1D } || $CHARSET{C1}->{ "\x7E" }; '';
288 } elsif ($Fs) {
289 if ($Fs eq "\x6E") { ## LS2
290 $C->{GL} = 'G2'; '';
291 } elsif ($Fs eq "\x6F") { ## LS3
292 $C->{GL} = 'G3'; '';
293 } elsif ($Fs eq "\x7E" || $Fs eq "\x6B") { ## LS1R
294 $C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; '';
295 } elsif ($Fs eq "\x7D" || $Fs eq "\x6C") { ## LS2R
296 $C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; '';
297 } elsif ($Fs eq "\x7C" || $Fs eq "\x6D") { ## LS3R
298 $C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; '';
299 } else {
300 chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60));
301 }
302 } elsif ($sI) {
303 chr ($CHARSET{single_control}->{'3F'.$sI}->{ucs} + (ord ($sF) - 0x30));
304 } elsif ($ACS) { ## Announcer
305 if ($ACS eq "\x4A") { $C->{bit} = 7 }
306 elsif ($ACS eq "\x4B") { $C->{bit} = 8 }
307 '';
308 }
309 }gex;
310 $C->{_irr} = undef;
311 }
312 $esc;
313 ## Locking shifts
314 } elsif ($ls) {
315 if ($ls eq $C->{$C->{CL}}->{LS0}) {
316 $C->{GL} = 'G0'; '';
317 } elsif ($ls eq $C->{$C->{CL}}->{LS1}) {
318 $C->{GL} = 'G1'; '';
319 }
320 ## Control sequence
321 } elsif ($csi) {
322 $csi =~ tr/\xA0-\xFF/\x20-\x7F/d;
323 $csi =~ s/$C->{$C->{CL}}->{LS0}//g if $C->{$C->{CL}}->{LS0};
324 $csi =~ s/$C->{$C->{CL}}->{LS1}//g if $C->{$C->{CL}}->{LS1};
325 "\x9B".$csi;
326 }
327 }gesx;
328 $s;
329 } # __iso2022_to_internal
330
331 };
332 &__iso2022_to_internal (@_) if defined $_[0];
333
334 } # _iso2022_to_internal
335
336 sub internal_to_iso2022 ($\%) {
337 my ($s, $C) = @_;
338 $C ||= &new_object;
339
340 my $r = '';
341 for my $c (split //, $s) {
342 my $cc = ord $c; Encode::_utf8_off ($c);
343 my $t;
344 if ($cc <= 0x1F) {
345 $t = _i2c ($c, $C, type => 'C0', charset => '@');
346 } elsif ($cc == 0x20 || $cc == 0x7F) {
347 $t = _back2ascii ($C) . $c;
348 } elsif ($cc < 0x7F) {
349 $t = _i2g ($c, $C, type => 'G94', charset => 'B');
350 } elsif ($cc <= 0x9F) {
351 $t = _i2c ($c, $C, type => 'C1', charset_id => '64291991C1',
352 charset => $C->{option}->{private_set}->{XC1}->{'64291991C1'});
353 } elsif ($cc <= 0xFF) {
354 $t = _i2g (chr($cc-0x80), $C, type => 'G96', charset => 'A');
355 } elsif ($cc <= 0x24FF) {
356 my $c = $cc - 0x100;
357 my $final = $C->{option}->{private_set}->{U96n}->[0];
358 if (length $final) {
359 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
360 type => 'G96n', charset => $final);
361 }
362 } elsif ($cc <= 0x33FF) {
363 my $c = $cc - 0x2500;
364 my $final = $C->{option}->{private_set}->{U96n}->[1];
365 if (length $final) {
366 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
367 type => 'G96n', charset => $final);
368 }
369 } elsif (0xE000 <= $cc && $cc <= 0xFFFF) {
370 my $c = $cc - 0xE000;
371 my $final = $C->{option}->{private_set}->{U96n}->[2];
372 if (length $final) {
373 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
374 type => 'G96n', charset => $final);
375 }
376
377 } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
378 my $c = $cc - 0xE9F6C0;
379 $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
380 type => 'G94n', charset => chr(($c / 8836)+0x30));
381 } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
382 my $c = $cc - 0xF49D7C;
383 $t = _i2g (chr(($c / 94)+0x21).chr(($c % 94)+0x21), $C,
384 type => 'G94n', charset => 'B', revision => '@');
385
386 } elsif (0xF0000 <= $cc && $cc <= 0x10F7FF) {
387 my $c = $cc - 0xF0000;
388 $t = _i2g (chr((($c % 9216) / 96)+0x20).chr(($c % 96)+0x20), $C,
389 type => 'G96n', charset => "\x20".chr(($c / 9216)+0x40));
390 } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
391 my $c = $cc - 0xE90940;
392 $t = _i2g (chr(($c % 94)+0x21), $C,
393 type => 'G94', charset => chr(($c / 94)+0x30));
394 } elsif (0xE92642 <= $cc && $cc <= 0xE9269F) {
395 my $c = $cc - 0xE92642;
396 $t = _i2g (chr($c+0x21),$C,type => 'G94', charset => "\x64", revision => '@');
397 } elsif (0xE926A0 <= $cc && $cc <= 0xE9443F) {
398 my $c = $cc - 0xE926A0;
399 $t = _i2g (chr(($c % 96)+0x20), $C,
400 type => 'G96', charset => chr(($c / 96)+0x30));
401 } elsif (0xE944A0 <= $cc && $cc <= 0xE961A1) {
402 my $c = $cc - 0xE944A0;
403 $t = _i2g (chr(($c % 94)+0x21), $C,
404 type => 'G94', charset => '!'.chr(($c / 94)+0x30));
405 } elsif (0xE96200 <= $cc && $cc <= 0xE97F9F) {
406 my $c = $cc - 0xE96200;
407 $t = _i2g (chr(($c % 96)+0x20), $C,
408 type => 'G96', charset => '!'.chr(($c / 96)+0x30));
409 } elsif (0xE98000 <= $cc && $cc <= 0xE99D01) {
410 my $c = $cc - 0xE98000;
411 $t = _i2g (chr(($c % 94)+0x21), $C,
412 type => 'G94', charset => '"'.chr(($c / 94)+0x30));
413 } elsif (0xE99D60 <= $cc && $cc <= 0xE9BAFF) {
414 my $c = $cc - 0xE99D60;
415 $t = _i2g (chr(($c % 96)+0x20), $C,
416 type => 'G96', charset => '"'.chr(($c / 96)+0x30));
417 } elsif (0xE9BB60 <= $cc && $cc <= 0xE9D861) {
418 my $c = $cc - 0xE9BB60;
419 $t = _i2g (chr(($c % 94)+0x21), $C,
420 type => 'G94', charset => '#'.chr(($c / 94)+0x30));
421 } elsif (0xE9D8C0 <= $cc && $cc <= 0xE9F65F) {
422 my $c = $cc - 0xE9D8C0;
423 $t = _i2g (chr(($c % 96)+0x20), $C,
424 type => 'G96', charset => '#'.chr(($c / 96)+0x30));
425 } elsif (0x70090940 <= $cc && $cc <= 0x70092641) {
426 my $c = $cc - 0x70090940;
427 $t = _i2g (chr(($c % 94)+0x21), $C,
428 type => 'G94', charset => "\x20".chr(($c / 94)+0x30));
429 } elsif (0x700926A0 <= $cc && $cc <= 0x7009443F) {
430 my $c = $cc - 0x700926A0;
431 $t = _i2g (chr(($c % 96)+0x20), $C,
432 type => 'G96', charset => "\x20".chr(($c / 96)+0x30));
433 ## TODO: DRCS with I byte: U+700944A0-U+7009F6BF
434 } elsif (0x70400000 <= $cc && $cc <= 0x7040FFED) {
435 my $c = $cc - 0x70400000;
436 $t = _i2g (chr(($c % 94)+0x21), $C, charset_id => 'P'.int ($c / 94),
437 type => 'G94', charset => $C->{option}->{private_set}->{G94}->[ $c / 94 ]);
438 } elsif (0x70410000 <= $cc && $cc <= 0x7041FFBF) {
439 my $c = $cc - 0x70410000;
440 $t = _i2g (chr(($c % 96)+0x20), $C, charset_id => 'P'.int ($c / 96),
441 type => 'G96', charset => $C->{option}->{private_set}->{G96}->[ $c / 96 ]);
442 } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
443 my $c = $cc % 0x10000;
444 $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
445 type => 'G94n',
446 charset_id => 'P'.int(($cc / 0x10000) - 0x7042).'_'.int($c / 8836),
447 charset => $C->{option}->{private_set}->{G94n}
448 ->[ ($cc / 0x10000) - 0x7042 ]->[ $c / 8836 ]);
449 }
450 if (defined $t) {
451 $t = _i2o ($t, $C, cs_F => "\x40")
452 if $C->{coding_system} ne $CODING_SYSTEM{"\x40"};
453 } else {
454 my $F; my @F = qw~G /G /H /I B /A /D /F~;
455 push @F, qw~/J /K /L~ if $cc <= 0x10FFFF;
456 push @F, qw~/@ /C /E~ if $cc <= 0xFFFF;
457 for (@F) {
458 if (defined $C->{option}->{designate_to}->{coding_system}->{$_}
459 && $C->{option}->{designate_to}->{coding_system}->{$_} > -1) {
460 $F = $_; last;
461 } elsif ($C->{option}->{designate_to}->{coding_system}->{default} > -1) {
462 $F = $_; last;
463 }
464 }
465 $t = _i2o ($c, $C, cs_F => $F) if $F;
466 }
467 if (defined $t) {
468 $r .= $t;
469 } else {
470 unless ($C->{option}->{undef_char}->[0] eq "\x20") {
471 $t = _i2g ($C->{option}->{undef_char}->[0], $C,
472 %{ $C->{option}->{undef_char}->[1] });
473 } else { ## SP
474 $t = _back2ascii ($C) . "\x20";
475 }
476 $r .= $C->{coding_system} eq $CODING_SYSTEM{"\x40"} ?
477 $t : _i2o ($t, $C, cs_F => "\x40");
478 }
479 }
480 $r . _back2ascii ($C);
481 }
482
483 ## $O{charset} eq undef means that charset is same as the current designated one.
484 sub _i2c ($%%) {
485 my ($s, $C, %O) = @_;
486 my $r = '';
487 if ($O{type} eq 'C0') {
488 if (defined $O{charset}) {
489 if ( $C->{C0} ne $CHARSET{C0}->{$O{charset}}
490 && $C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
491 for ($C->{option}->{designate_to}->{C0}->{$O{charset}},
492 $C->{option}->{designate_to}->{C0}->{default}) {
493 if (defined $_) { return undef if $_ == -1; last }
494 }
495 $r .= "\x1B\x21".$O{charset};
496 $C->{C0} = $CHARSET{C0}->{$O{charset}};
497 }
498 } elsif (defined $O{charset_id}) {
499 if ($C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
500 return undef; ## Control set is not designated nor has F byte
501 }
502 }
503 $r .= _back2ascii ($C, reset_all => $C->{C0}->{reset_all}->{$s});
504 return $r . $s;
505 } elsif ($O{type} eq 'C1') {
506 if (defined $O{charset}) {
507 if ( $C->{C1} ne $CHARSET{C1}->{$O{charset}}
508 && $C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
509 for ($C->{option}->{designate_to}->{C1}->{$O{charset}},
510 $C->{option}->{designate_to}->{C1}->{default}) {
511 if (defined $_) { return undef if $_ == -1; last }
512 }
513 $r .= "\x1B\x22".$O{charset};
514 $C->{C1} = $CHARSET{C1}->{$O{charset}};
515 }
516 } elsif (defined $O{charset_id}) {
517 if ($C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
518 return undef; ## Control set is not designated nor has F byte
519 }
520 }
521 $r .= _back2ascii ($C, reset_all => $C->{C1}->{reset_all}->{$s});
522 unless ($C->{option}->{C1invoke_to_right}) { ## ESC Fe
523 $s =~ s/([\x80-\x9F])/"\x1B" . chr (ord ($1) - 0x40)/ge;
524 }
525 return $r . $s;
526 }
527 }
528 sub _i2g ($%%) {
529 my ($s, $C, %O) = @_;
530 my $r = '';
531 my $set = $CHARSET{$O{type}}->{$O{charset}.
532 ($O{revision}&&$C->{option}->{use_revision}?$O{revision}:'')};
533 my $set0 = $CHARSET{$O{type}}->{$O{charset_id}};
534 ## -- designate character set
535 my $G = 0;
536 if ($C->{G0} eq $set || $C->{G0} eq $set0) { $G = 0 }
537 elsif ($C->{G1} eq $set || $C->{G1} eq $set0) { $G = 1 }
538 elsif ($C->{G2} eq $set || $C->{G2} eq $set0) { $G = 2 }
539 elsif ($C->{G3} eq $set || $C->{G3} eq $set0) { $G = 3 }
540 else {
541 return undef unless $set; ## charset does not have F byte
542 $G = 1 if $O{type} eq 'G96' || $O{type} eq 'G96n';
543 for ($C->{option}->{designate_to}->{$O{type}}->{$O{charset}},
544 $C->{option}->{designate_to}->{$O{type}}->{default}) {
545 if (defined $_) {
546 $G = $_; last;
547 }
548 }
549 if ($G == -1) {
550 return undef;
551 }
552 if ($O{type} eq 'G94') {
553 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
554 ."\x1B".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
555 } elsif ($O{type} eq 'G94n') {
556 if ($G == 0 && !$C->{option}->{G94n_designate_long}
557 && ($O{charset} eq '@' || $O{charset} eq 'A' || $O{charset} eq 'B')) {
558 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
559 ."\x1B\x24".$O{charset};
560 } else {
561 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
562 ."\x1B\x24".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
563 }
564 } elsif ($O{type} eq 'G96') {
565 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
566 ."\x1B".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
567 } elsif ($O{type} eq 'G96n') {
568 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
569 ."\x1B\x24".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
570 }
571 $C->{'G'.$G} = $CHARSET{$O{type}}->{$O{charset}};
572 }
573 ## -- invoke G buffer
574 my $left = $C->{option}->{Ginvoke_to_left}->[$G];
575 if ($C->{GL} eq 'G'.$G) {
576 $left = 1;
577 } elsif ($C->{GR} eq 'G'.$G) {
578 $left = 0;
579 } else {
580 if ($C->{option}->{Ginvoke_by_single_shift}->[$G]) {
581 if ($C->{C1}->{'C_SS'.$G}) {
582 $r .= _i2c ($C->{C1}->{'C_SS'.$G}, $C, type => 'C1') || return undef;
583 } elsif ($C->{C0}->{'C_SS'.$G}) {
584 $r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef;
585 } else { ## Both C0 and C1 set do not have SS2/3.
586 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
587 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
588 }
589 } else {
590 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
591 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
592 }
593 }
594 $s =~ tr/\x00-\x7F/\x80-\xFF/ unless $left;
595 $r . $s;
596 }
597 sub _back2ascii (%) {
598 my ($C, %O) = @_;
599 my $r = '';
600 if ($C->{option}->{reset}->{Gdesignation}) {
601 my $F = $C->{option}->{reset}->{Gdesignation}; # \x42
602 $r .= "\x1B\x28".$F unless $C->{G0} eq $CHARSET{G94}->{$F};
603 $C->{G0} = $CHARSET{G94}->{$F};
604 if ($O{reset_all}) {
605 $C->{G1} = $CHARSET{G94}->{"\x7E"};
606 $C->{G2} = $CHARSET{G94}->{"\x7E"};
607 $C->{G3} = $CHARSET{G94}->{"\x7E"};
608 }
609 }
610 if ($C->{option}->{reset}->{Ginvoke}) {
611 if ($C->{GL} ne 'G0') {
612 $r .= $C->{C0}->{C_LS0} || ($C->{C0} = $CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F");
613 $C->{GL} = 'G0';
614 }
615 $C->{GR} = undef if $O{reset_all};
616 }
617 $r;
618 }
619 ## __invoke (\%C, $G, $left_or_right)
620 sub __invoke (\%$$) {
621 my ($C, $G) = @_;
622 if ($_[2]) {
623 $C->{GL} = 'G'.$G;
624 return ($C->{C0}->{C_LS0}
625 || scalar ($C->{C0}=$CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F"),
626 $C->{C0}->{C_LS1}, "\x1B\x6E", "\x1B\x6F")[$G];
627 } else {
628 $C->{GR} = 'G'.$G;
629 return ("", "\x1B\x7E", "\x1B\x7D", "\x1B\x7C")[$G];
630 }
631 '';
632 }
633 sub _i2o ($\%%) {
634 my ($s, $C, %O) = @_;
635 my $CS = $CODING_SYSTEM{ $O{cs_F} } || $CODING_SYSTEM{ $O{cs_id} } || return undef;
636 my $r = '';
637 if ($CS ne $C->{coding_system}) {
638 my $e = '';
639 $e .= "\x1B\x25";
640 $e .= $O{cs_F} || $C->{option}->{private_set}->{coding_system}->{ $O{cs_id} }
641 || return undef;
642 if ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x40"}
643 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x43"}
644 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x45"}
645 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4A"}
646 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4B"}
647 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4C"}) {
648 $e =~ s/(.)/\x00$1/go;
649 } elsif ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x41"}
650 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x44"}
651 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x46"}) {
652 $e =~ s/(.)/\x00\x00\x00$1/go;
653 }
654 $r .= $e;
655 $C->{coding_system} = $CS;
656 if ($CS->{reset_state}) {
657 $C->{GL} = undef; $C->{GR} = undef;
658 $C->{C0} = $CHARSET{C0}->{"\x7E"};
659 $C->{C1} = $CHARSET{C1}->{"\x7E"};
660 $C->{G0} = $CHARSET{G94}->{"\x7E"};
661 $C->{G1} = $CHARSET{G94}->{"\x7E"};
662 $C->{G2} = $CHARSET{G94}->{"\x7E"};
663 $C->{G3} = $CHARSET{G94}->{"\x7E"};
664 }
665 }
666 if ($CS eq $CODING_SYSTEM{"\x40"}) {
667 #
668 } elsif ($CS eq $CODING_SYSTEM{G} || $CS eq $CODING_SYSTEM{'/G'}
669 || $CS eq $CODING_SYSTEM{'/H'} || $CS eq $CODING_SYSTEM{'/I'}) {
670 Encode::_utf8_off ($s);
671 } elsif ($CS eq $CODING_SYSTEM{'/@'} || $CS eq $CODING_SYSTEM{'/C'}
672 || $CS eq $CODING_SYSTEM{'/E'}) {
673 $s = Encode::encode ('ucs-2be', $s);
674 } elsif ($CS eq $CODING_SYSTEM{'/A'} || $CS eq $CODING_SYSTEM{'/D'}
675 || $CS eq $CODING_SYSTEM{'/F'}) {
676 $s = Encode::encode ('ucs-4be', $s);
677 } elsif ($CS eq $CODING_SYSTEM{'/J'} || $CS eq $CODING_SYSTEM{'/K'}
678 || $CS eq $CODING_SYSTEM{'/L'}) {
679 $s = Encode::encode ('UTF-16BE', $s);
680 } elsif ($CS eq $CODING_SYSTEM{B}) {
681 $s = Encode::encode ('utf-1', $s);
682 } else {
683 return undef;
684 }
685 $r . $s;
686 }
687
688 1;
689 __END__
690
691 =head1 SEE ALSO
692
693 ISO/IEC 646:1991, "7-bit coded graphic character set for intormation interchange".
694
695 ISO/IEC 2022:1994, "Character Code Structure and Extension Techniques".
696 (IDT with ECMA 35, JIS X 0202:1998)
697
698 ISO/IEC 4873:1991, "8-Bit Coded Character Set Structure and Rules".
699 (IDT with ECMA 43)
700
701 ISO/IEC 6429:1992, "Control Functions for Coded Character Sets".
702 (IDT with ECMA 48:1991, JIS X 0211:1998)
703
704 ISO/IEC 8859, "8-Bit Single-Byte Coded Graphic Character Sets".
705
706 Encode, perlunicode
707
708 =head1 TODO
709
710 =over 4
711
712 =item NCR (coding system other than ISO/IEC 2022) support
713
714 =over 2
715
716 =item ESC 02/05 02/15 03/x of X Compound Text
717
718 =back
719
720 =item Output of control character sets, single control functions
721
722 =item Designation sequence of control character sets (input)
723
724 =item Special graphic character sets such as G3 of EUC-TW
725
726 =item SUPER SHIFT (SS) invoke function of old control character set
727
728 =item Safe transparent of control string (ISO/IEC 6429)
729
730 =item Output of unoutputable characters as alternative notation such as SGML-like entity
731
732 =item C0 set invoked to CR area like ISIRI code
733
734 Really need?
735
736 =item special treatment of 0x20, 0x7E, 0xA0, 0xFF
737
738 For example, GB mongolian sets use MSP (MONGOLIAN SPACE)
739 with these code positions.
740
741 And, no less coding systems does not use (or does ban using) DEL.
742
743 =item A lot of character sets don't have pseudo-UCS mapping.
744
745 Most of 9m^n (n >= 3) sets, 9m^n sets with I byte, 9m^n
746 DRCSes do not have pseudo-UCS mapping area. It is
747 questionable to allocate lots of code positions to these
748 rarely-(or no-)used character sets.
749
750 =item Even character sets that have pseudo-UCS mapping, some of them can't be outputed in ISO/IEC 2022.
751
752 Because output of rarely-used character sets is
753 not implemented yet.
754
755 =back
756
757 =head1 AUTHORS
758
759 Nanashi-san
760
761 Wakaba <w@suika.fam.cx>
762
763 =head1 LICENSE
764
765 Copyright 2002 AUTHORS
766
767 This library is free software; you can redistribute it
768 and/or modify it under the same terms as Perl itself.
769
770 =cut
771
772 # $Date: 2002/10/12 11:03:00 $
773 ### ISO2022.pm ends here

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24