/[suikacvs]/perl/lib/Encode/ISO2022.pm
Suika

Contents of /perl/lib/Encode/ISO2022.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (show annotations) (download)
Sun Sep 22 11:09:38 2002 UTC (22 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.6: +11 -7 lines
2002-09-22  Wakaba <w@suika.fam.cx>

	* ISO2022.pm (_internal_to_iso2022): Allow SP as a
	replacement character.

1
2 =head1 NAME
3
4 Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder
5
6 =head1 ENCODINGS
7
8 =over 4
9
10 =item iso2022
11
12 ISO/IEC 2022:1994. Default status is:
13
14 =over 2
15
16 =item CL = C0 = ISO/IEC 6429:1991 C0 set
17
18 =item CR = C1 = ISO/IEC 6429:1991 C1 set
19
20 =item GL = G0 = ISO/IEC 646:1991 IRV GL(G0) set
21
22 =item GR = G1 = empty set
23
24 =item G2 = empty set
25
26 =item G3 = empty set
27
28 =back
29
30 (Alias: iso/iec2022, iso-2022, 2022, cp2022)
31
32 =back
33
34 Note that ISO/IEC 2022 based encodings are found in
35 Encode::ISO2022::* modules. This module, Encode::ISO2022
36 only provides a general ISO/IEC 2022 encoder/decoder.
37
38 =cut
39
40 require v5.7.3;
41 package Encode::ISO2022;
42 use strict;
43 use vars qw(%CHARSET %CODING_SYSTEM $VERSION);
44 $VERSION=do{my @r=(q$Revision: 1.6 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
45 use base qw(Encode::Encoding);
46 __PACKAGE__->Define (qw!iso-2022 iso/iec2022 iso2022 2022 cp2022!);
47 require Encode::Charset;
48 *CHARSET = \%Encode::Charset::CHARSET;
49 *CODING_SYSTEM = \%Encode::Charset::CODING_SYSTEM;
50
51 ### --- Intialization
52
53 my %_CHARS_to_RANGE = (
54 l94 => q/[\x21-\x7E]/,
55 l96 => q/[\x20-\x7F]/,
56 l128 => q/[\x00-\x7F]/,
57 l256 => q/[\x00-\xFF]/,
58 r94 => q/[\xA1-\xFE]/,
59 r96 => q/[\xA0-\xFF]/,
60 r128 => q/[\x80-\xFF]/,
61 r256 => q/[\x80-\xFF]/,
62 b94 => q/[\x21-\x7E\xA1-\xFE]/,
63 b96 => q/[\x20-\x7F\xA0-\xFF]/,
64 b128 => q/[\x00-\xFF]/,
65 b256 => q/[\x00-\xFF]/,
66 );
67
68 ### --- Perl Encode module common functions
69
70 sub encode ($$;$) {
71 my ($obj, $str, $chk) = @_;
72 $_[1] = '' if $chk;
73 $str = &internal_to_iso2022 ($str);
74 return $str;
75 }
76
77 sub decode ($$;$) {
78 my ($obj, $str, $chk) = @_;
79 $_[1] = '' if $chk;
80 return &iso2022_to_internal ($str);
81 }
82
83 ### --- Encode::ISO2022 unique functions
84 *new_object = \&Encode::Charset::new_object;
85
86 sub iso2022_to_internal ($;\%) {
87 my ($s, $C) = @_;
88 $C ||= &new_object;
89 my $t = '';
90 $s =~ s{
91 ^((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
92 }{
93 my $i2 = $1;
94 $t = _iso2022_to_internal ($i2, $C);
95 '';
96 }gesx;
97 my $pad = '';
98 use re 'eval';
99 $s =~ s{
100 ## ISO/IEC 2022
101 (??{"$pad\x1B$pad\x25$pad\x40"})((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
102 ## UTF-8
103 |(??{"$pad\x1B$pad\x25$pad(?:\x47|\x2F$pad"."[\x47-\x49])"})
104 ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
105 ## UCS-2, UTF-16
106 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x40\x43\x45\x4A-\x4C])
107 ((?:(?!\x00\x1B\x00\x25(?:\x00\x2F)?\x00[\x30-\x7E])..)*)
108 ## UCS-4
109 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})[\x41\x44\x46]
110 ((?:(?!\x00\x00\x00\x1B\x00\x00\x00\x25(?:\x00\x00\x00\x2F)?
111 \x00\x00\x00[\x30-\x7E])....)*)
112 ## with standard return
113 |(??{"$pad\x1B$pad\x25$pad"})([\x30-\x7E])
114 ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
115 ## without standard return
116 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x30-\x7E])(.*)
117 }{
118 my ($i2,$u8,$Fu2,$u2,$u4,$Fsr,$sr,$Fnsr,$nsr) = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
119 my $r = '';
120 if (defined $i2) {
121 $r = _iso2022_to_internal ($i2, $C); $pad = '';
122 } elsif (defined $u8) {
123 $r = Encode::decode ('utf8', $u8); $pad = '';
124 } elsif ($Fu2) {
125 if (ord ($Fu2) > 0x49) {
126 $r = Encode::decode ('utf-16be', $u2);
127 } else {
128 $r = Encode::decode ('ucs-2be', $u2);
129 }
130 $pad = "\x00";
131 } elsif (defined $u4) {
132 $r = Encode::decode ('ucs-4be', $u2); $pad = "\x00\x00\x00";
133 } elsif (defined $Fsr && $CODING_SYSTEM{$Fsr}->{perl_name}) {
134 $r = Encode::decode ($CODING_SYSTEM{$Fsr}->{perl_name}, $sr); $pad = '';
135 } elsif (defined $Fnsr && $CODING_SYSTEM{$Fnsr}->{perl_name}) {
136 $r = Encode::decode ($CODING_SYSTEM{$Fnsr}->{perl_name}, $nsr); $pad = '';
137 } else { ## temporary
138 $r = '?' x length ($sr.$nsr); $pad = '';
139 }
140 $r;
141 }gesx;
142 $t . $s;
143 }
144
145 sub _iso2022_to_internal ($;\%) {
146 my ($s, $C) = @_;
147 my %_GB_to_GN = (
148 "\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3',
149 "\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3',
150 );
151
152 use re 'eval';
153 $s =~ s{
154 ((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}}
155 . qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ }))
156 |((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GR}}->{chars}}
157 . qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ }))
158 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)')
159 . ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ?
160 qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '')
161 . ($C->{$C->{CL}}->{r_SS2} ? qq/|$C->{$C->{CL}}->{r_SS2}/ : '') . q/)/
162 . ( $C->{$C->{CL}}->{r_LS0}
163 ||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
164 qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
165 })
166 ((??{ $_CHARS_to_RANGE{'b'.$C->{G2}->{chars}}
167 . qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ }))
168 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)')
169 . ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ?
170 qq/|$C->{$C->{ESC_Fe}}->{r_SS3_ESC}/ : '')
171 . ($C->{$C->{CL}}->{r_SS3} ? qq/|$C->{$C->{CL}}->{r_SS3}/ : '') . q/)/
172 . ( $C->{$C->{CL}}->{r_LS0}
173 || $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
174 qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
175 })
176 ((??{ $_CHARS_to_RANGE{'b'.$C->{G3}->{chars}}
177 . qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ }))
178
179 ## Locking shift
180 |( (??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })
181 |(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })
182 )
183
184 ## Control sequence
185 |(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)')
186 .($C->{$C->{ESC_Fe}}->{r_CSI_ESC} ?
187 qq/|$C->{$C->{ESC_Fe}}->{r_CSI_ESC}/: '')
188 .')'
189 })
190 ((??{ qq/[\x30-\x3F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xB0-\xBF]*/
191 .qq/[\x20-\x2F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xA0-\xAF]*/
192 }) [\x40-\x7E\xD0-\xFE])
193
194 ## Other escape sequence
195 |(\x1B[\x20-\x2F]*[\x30-\x7E])
196
197 ## Misc. sequence (SP, control, or broken data)
198 |([\x00-\xFF])
199 }{
200 my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc)
201 = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
202 $C->{_irr} = undef unless defined $esc;
203 ## GL graphic character
204 if (defined $gl) {
205 my $c = 0;
206 my $m = $C->{$C->{GL}}->{chars}==94?0x21:$C->{$C->{GL}}->{chars}==96?0x20:0;
207 for (split //, $gl) {
208 $c = $c * $C->{$C->{GL}}->{chars} + unpack ('C', $_) - $m;
209 }
210 chr ($C->{$C->{GL}}->{ucs} + $c);
211 ## Control, SP, or broken data
212 ## TODO: support control sets other than ISO/IEC 6429's
213 } elsif (defined $misc) {
214 $misc;
215 ## GR graphic character
216 } elsif ($gr) {
217 my $c = 0;
218 my $m = $C->{$C->{GR}}->{chars}==94?0xA1:$C->{$C->{GR}}->{chars}==96?0xA0:0x80;
219 for (split //, $gr) {
220 $c = $c * $C->{$C->{GR}}->{chars} + unpack ('C', $_) - $m;
221 }
222 chr ($C->{$C->{GR}}->{ucs} + $c);
223 ## Graphic character with SS2
224 } elsif ($ss2) {
225 $ss2 =~ tr/\x80-\xFF/\x00-\x7F/;
226 my $c = 0; my $m = $C->{G2}->{chars}==94?0x21:$C->{G2}->{chars}==96?0x20:0;
227 for (split //, $ss2) {
228 $c = $c * $C->{G2}->{chars} + unpack ('C', $_) - $m;
229 }
230 chr ($C->{G2}->{ucs} + $c);
231 ## Graphic character with SS3
232 } elsif ($ss3) {
233 $ss3 =~ tr/\x80-\xFF/\x00-\x7F/;
234 my $c = 0; my $m = $C->{G3}->{chars}==94?0x21:$C->{G3}->{chars}==96?0x20:0;
235 for (split //, $ss3) {
236 $c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m;
237 }
238 chr ($C->{G3}->{ucs} + $c);
239 ## Escape sequence
240 } elsif ($esc) {
241 ## IRR (revision number)
242 if ($esc =~ /\x1B\x26([\x40-\x7E])/) {
243 $C->{_irr} = $1; $esc = '';
244 } else {
245 $esc =~ s{
246 \x1B([\x28-\x2B])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^1
247 |\x1B\x24([\x28-\x2B]?)(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^n
248
249 |\x1B([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^1
250 |\x1B\x24([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^n
251
252 |\x1B([\x40-\x5F]) ## ESC Fe
253
254 |\x1B\x21([\x21-\x23]?[\x30-\x7E]) ## CL = C0
255 |\x1B\x22([\x21-\x23]?[\x30-\x7E]) ## CR & ESC Fe = C1
256
257 |\x1B([\x60-\x7E]) ## Single control functions
258 |\x1B\x23([\x21-\x23]?)([\x30-\x7E])
259
260 |\x1B\x20([\x40-\x7E]) ## Announcer
261 }{
262 my ($g94_g,$g94_f,$g94n_g,$g94n_f,$g96_g,$g96_f,$g96n_g,$g96n_f,$Fe,
263 $CZD, $C1D, $Fs, $sI, $sF,$ACS)
264 = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15);
265 my $rev = $C->{_irr} || '';
266 if ($g94_g) {
267 $C->{ $_GB_to_GN{ $g94_g } } = $CHARSET{G94}->{ $g94_f.$rev }
268 || $CHARSET{G94}->{ $g94_f }
269 || $CHARSET{G94}->{ "\x7E" }; '';
270 } elsif (defined $g94n_f) {
271 $C->{ $_GB_to_GN{ $g94n_g } || 'G0' } = $CHARSET{G94n}->{ $g94n_f.$rev }
272 || $CHARSET{G94n}->{ $g94n_f }
273 || $CHARSET{G94n}->{ "\x7E" }; '';
274 } elsif ($g96_g) {
275 $C->{ $_GB_to_GN{ $g96_g } } = $CHARSET{G96}->{ $g96_f.$rev }
276 || $CHARSET{G96}->{ $g96_f }
277 || $CHARSET{G96}->{ "\x7E" }; '';
278 } elsif (defined $g96n_f) {
279 $C->{ $_GB_to_GN{ $g96n_g } } = $CHARSET{G96n}->{ $g96n_f.$rev }
280 || $CHARSET{G96n}->{ $g96n_f }
281 || $CHARSET{G96n}->{ "\x7E" }; '';
282 } elsif ($Fe) { ## ESC Fe => C1
283 chr ($C->{ $C->{ESC_Fe} }->{ucs} + (ord ($Fe) - 0x40));
284 } elsif ($CZD) {
285 $C->{C0} = $CHARSET{C0}->{ $CZD.$rev }
286 || $CHARSET{C0}->{ $CZD } || $CHARSET{C0}->{ "\x7E" }; '';
287 } elsif ($C1D) {
288 $C->{C1} = $CHARSET{C1}->{ $C1D.$rev }
289 || $CHARSET{C1}->{ $C1D } || $CHARSET{C1}->{ "\x7E" }; '';
290 } elsif ($Fs) {
291 if ($Fs eq "\x6E") { ## LS2
292 $C->{GL} = 'G2'; '';
293 } elsif ($Fs eq "\x6F") { ## LS3
294 $C->{GL} = 'G3'; '';
295 } elsif ($Fs eq "\x7E" || $Fs eq "\x6B") { ## LS1R
296 $C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; '';
297 } elsif ($Fs eq "\x7D" || $Fs eq "\x6C") { ## LS2R
298 $C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; '';
299 } elsif ($Fs eq "\x7C" || $Fs eq "\x6D") { ## LS3R
300 $C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; '';
301 } else {
302 chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60));
303 }
304 } elsif ($sI) {
305 chr ($CHARSET{single_control}->{'3F'.$sI}->{ucs} + (ord ($sF) - 0x30));
306 } elsif ($ACS) { ## Announcer
307 if ($ACS eq "\x4A") { $C->{bit} = 7 }
308 elsif ($ACS eq "\x4B") { $C->{bit} = 8 }
309 '';
310 }
311 }gex;
312 $C->{_irr} = undef;
313 }
314 $esc;
315 ## Locking shifts
316 } elsif ($ls) {
317 if ($ls eq $C->{$C->{CL}}->{LS0}) {
318 $C->{GL} = 'G0'; '';
319 } elsif ($ls eq $C->{$C->{CL}}->{LS1}) {
320 $C->{GL} = 'G1'; '';
321 }
322 ## Control sequence
323 } elsif ($csi) {
324 $csi =~ tr/\xA0-\xFF/\x20-\x7F/d;
325 $csi =~ s/$C->{$C->{CL}}->{LS0}//g if $C->{$C->{CL}}->{LS0};
326 $csi =~ s/$C->{$C->{CL}}->{LS1}//g if $C->{$C->{CL}}->{LS1};
327 "\x9B".$csi;
328 }
329 }gex;
330 $s;
331 }
332
333 sub internal_to_iso2022 ($\%) {
334 my ($s, $C) = @_;
335 $C ||= &new_object;
336
337 my $r = '';
338 for my $c (split //, $s) {
339 my $cc = ord $c;
340 my $t;
341 if ($cc <= 0x1F) {
342 $t = _i2c ($c, $C, type => 'C0', charset => '@');
343 } elsif ($cc == 0x20 || $cc == 0x7F) {
344 $t = _back2ascii ($C) . $c;
345 } elsif ($cc < 0x7F) {
346 $t = _i2g ($c, $C, type => 'G94', charset => 'B');
347 } elsif ($cc <= 0x9F) {
348 $t = _i2c ($c, $C, type => 'C1', charset_id => '64291991C1',
349 charset => $C->{private_set}->{XC1}->{'64291991C1'});
350 } elsif ($cc <= 0xFF) {
351 $t = _i2g (chr($cc-0x80), $C, type => 'G96', charset => 'A');
352 } elsif ($cc <= 0x24FF) {
353 my $c = $cc - 0x100;
354 my $final = $C->{private_set}->{U96n}->[0];
355 if (length $final) {
356 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
357 type => 'G96n', charset => $final);
358 }
359 } elsif ($cc <= 0x33FF) {
360 my $c = $cc - 0x2500;
361 my $final = $C->{private_set}->{U96n}->[1];
362 if (length $final) {
363 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
364 type => 'G96n', charset => $final);
365 }
366 } elsif (0xE000 <= $cc && $cc <= 0xFFFF) {
367 my $c = $cc - 0xE000;
368 my $final = $C->{private_set}->{U96n}->[2];
369 if (length $final) {
370 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
371 type => 'G96n', charset => $final);
372 }
373
374 } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
375 my $c = $cc - 0xE9F6C0;
376 $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
377 type => 'G94n', charset => chr(($c / 8836)+0x30));
378 } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
379 my $c = $cc - 0xF49D7C;
380 $t = _i2g (chr(($c / 94)+0x21).chr(($c % 94)+0x21), $C,
381 type => 'G94n', charset => 'B', revision => '@');
382
383 } elsif (0xF0000 <= $cc && $cc <= 0x10F7FF) {
384 my $c = $cc - 0xF0000;
385 $t = _i2g (chr((($c % 9216) / 96)+0x20).chr(($c % 96)+0x20), $C,
386 type => 'G96n', charset => "\x20".chr(($c / 9216)+0x40));
387 } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
388 my $c = $cc - 0xE90940;
389 $t = _i2g (chr(($c % 94)+0x21), $C,
390 type => 'G94', charset => chr(($c / 94)+0x30));
391 } elsif (0xE92642 <= $cc && $cc <= 0xE9269F) {
392 my $c = $cc - 0xE92642;
393 $t = _i2g (chr($c+0x21),$C,type => 'G94', charset => "\x64", revision => '@');
394 } elsif (0xE926A0 <= $cc && $cc <= 0xE9443F) {
395 my $c = $cc - 0xE926A0;
396 $t = _i2g (chr(($c % 96)+0x20), $C,
397 type => 'G96', charset => chr(($c / 96)+0x30));
398 } elsif (0xE944A0 <= $cc && $cc <= 0xE961A1) {
399 my $c = $cc - 0xE944A0;
400 $t = _i2g (chr(($c % 94)+0x21), $C,
401 type => 'G94', charset => '!'.chr(($c / 94)+0x30));
402 } elsif (0xE96200 <= $cc && $cc <= 0xE97F9F) {
403 my $c = $cc - 0xE96200;
404 $t = _i2g (chr(($c % 96)+0x20), $C,
405 type => 'G96', charset => '!'.chr(($c / 96)+0x30));
406 } elsif (0xE98000 <= $cc && $cc <= 0xE99D01) {
407 my $c = $cc - 0xE98000;
408 $t = _i2g (chr(($c % 94)+0x21), $C,
409 type => 'G94', charset => '"'.chr(($c / 94)+0x30));
410 } elsif (0xE99D60 <= $cc && $cc <= 0xE9BAFF) {
411 my $c = $cc - 0xE99D60;
412 $t = _i2g (chr(($c % 96)+0x20), $C,
413 type => 'G96', charset => '"'.chr(($c / 96)+0x30));
414 } elsif (0xE9BB60 <= $cc && $cc <= 0xE9D861) {
415 my $c = $cc - 0xE9BB60;
416 $t = _i2g (chr(($c % 94)+0x21), $C,
417 type => 'G94', charset => '#'.chr(($c / 94)+0x30));
418 } elsif (0xE9D8C0 <= $cc && $cc <= 0xE9F65F) {
419 my $c = $cc - 0xE9D8C0;
420 $t = _i2g (chr(($c % 96)+0x20), $C,
421 type => 'G96', charset => '#'.chr(($c / 96)+0x30));
422 } elsif (0x70090940 <= $cc && $cc <= 0x70092641) {
423 my $c = $cc - 0x70090940;
424 $t = _i2g (chr(($c % 94)+0x21), $C,
425 type => 'G94', charset => "\x20".chr(($c / 94)+0x30));
426 } elsif (0x700926A0 <= $cc && $cc <= 0x7009443F) {
427 my $c = $cc - 0x700926A0;
428 $t = _i2g (chr(($c % 96)+0x20), $C,
429 type => 'G96', charset => "\x20".chr(($c / 96)+0x30));
430 ## TODO: DRCS with I byte: U+700944A0-U+7009F6BF
431 } elsif (0x70400000 <= $cc && $cc <= 0x7040FFED) {
432 my $c = $cc - 0x70400000;
433 $t = _i2g (chr(($c % 94)+0x21), $C, charset_id => 'P'.int ($c / 94),
434 type => 'G94', charset => $C->{private_set}->{G94}->[ $c / 94 ]);
435 } elsif (0x70410000 <= $cc && $cc <= 0x7041FFBF) {
436 my $c = $cc - 0x70410000;
437 $t = _i2g (chr(($c % 96)+0x20), $C, charset_id => 'P'.int ($c / 96),
438 type => 'G96', charset => $C->{private_set}->{G96}->[ $c / 96 ]);
439 } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
440 my $c = $cc % 0x10000;
441 $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
442 type => 'G94n',
443 charset_id => 'P'.int(($cc / 0x10000) - 0x7042).'_'.int($c / 8836),
444 charset => $C->{private_set}->{G94n}->[ ($cc / 0x10000) - 0x7042 ]
445 ->[ $c / 8836 ]);
446 }
447 if (defined $t) {
448 $t = _i2o ($t, $C, cs_F => "\x40")
449 if $C->{coding_system} ne $CODING_SYSTEM{"\x40"};
450 } else {
451 my $F; my @F = qw~G /G /H /I B /A /D /F~;
452 push @F, qw~/J /K /L~ if $cc <= 0x10FFFF;
453 push @F, qw~/@ /C /E~ if $cc <= 0xFFFF;
454 for (@F) {
455 if (defined $C->{option}->{designate_to}->{coding_system}->{$_}
456 && $C->{option}->{designate_to}->{coding_system}->{$_} > -1) {
457 $F = $_; last;
458 } elsif ($C->{option}->{designate_to}->{coding_system}->{default} > -1) {
459 $F = $_; last;
460 }
461 }
462 $t = _i2o ($c, $C, cs_F => $F) if $F;
463 }
464 if (defined $t) {
465 $r .= $t;
466 } else {
467 unless ($C->{option}->{undef_char}->[0] eq "\x20") {
468 $t = _i2g ($C->{option}->{undef_char}->[0], $C,
469 %{ $C->{option}->{undef_char}->[1] });
470 } else { ## SP
471 $t = _back2ascii ($C) . "\x20";
472 }
473 $r .= $C->{coding_system} eq $CODING_SYSTEM{"\x40"} ?
474 $t : _i2o ($t, $C, cs_F => "\x40");
475 }
476 }
477 $r . _back2ascii ($C);
478 }
479
480 ## $O{charset} eq undef means that charset is same as the current designated one.
481 sub _i2c ($%%) {
482 my ($s, $C, %O) = @_;
483 my $r = '';
484 if ($O{type} eq 'C0') {
485 if (defined $O{charset}) {
486 if ( $C->{C0} ne $CHARSET{C0}->{$O{charset}}
487 && $C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
488 for ($C->{option}->{designate_to}->{C0}->{$O{charset}},
489 $C->{option}->{designate_to}->{C0}->{default}) {
490 if (defined $_) { return undef if $_ == -1; last }
491 }
492 $r .= "\x1B\x21".$O{charset};
493 $C->{C0} = $CHARSET{C0}->{$O{charset}};
494 }
495 } elsif (defined $O{charset_id}) {
496 if ($C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
497 return undef; ## Control set is not designated nor has F byte
498 }
499 }
500 $r .= _back2ascii ($C, reset_all => $C->{C0}->{reset_all}->{$s});
501 return $r . $s;
502 } elsif ($O{type} eq 'C1') {
503 if (defined $O{charset}) {
504 if ( $C->{C1} ne $CHARSET{C1}->{$O{charset}}
505 && $C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
506 for ($C->{option}->{designate_to}->{C1}->{$O{charset}},
507 $C->{option}->{designate_to}->{C1}->{default}) {
508 if (defined $_) { return undef if $_ == -1; last }
509 }
510 $r .= "\x1B\x22".$O{charset};
511 $C->{C1} = $CHARSET{C1}->{$O{charset}};
512 }
513 } elsif (defined $O{charset_id}) {
514 if ($C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
515 return undef; ## Control set is not designated nor has F byte
516 }
517 }
518 $r .= _back2ascii ($C, reset_all => $C->{C1}->{reset_all}->{$s});
519 unless ($C->{option}->{C1invoke_to_right}) { ## ESC Fe
520 $s =~ s/([\x80-\x9F])/"\x1B" . chr (ord ($1) - 0x40)/ge;
521 }
522 return $r . $s;
523 }
524 }
525 sub _i2g ($%%) {
526 my ($s, $C, %O) = @_;
527 my $r = '';
528 my $set = $CHARSET{$O{type}}->{$O{charset}.
529 ($O{revision}&&$C->{option}->{use_revision}?$O{revision}:'')};
530 my $set0 = $CHARSET{$O{type}}->{$O{charset_id}};
531 ## -- designate character set
532 my $G = 0;
533 if ($C->{G0} eq $set || $C->{G0} eq $set0) { $G = 0 }
534 elsif ($C->{G1} eq $set || $C->{G1} eq $set0) { $G = 1 }
535 elsif ($C->{G2} eq $set || $C->{G2} eq $set0) { $G = 2 }
536 elsif ($C->{G3} eq $set || $C->{G3} eq $set0) { $G = 3 }
537 else {
538 return undef unless $set; ## charset does not have F byte
539 $G = 1 if $O{type} eq 'G96' || $O{type} eq 'G96n';
540 for ($C->{option}->{designate_to}->{$O{type}}->{$O{charset}},
541 $C->{option}->{designate_to}->{$O{type}}->{default}) {
542 if (defined $_) {
543 $G = $_; last;
544 }
545 }
546 if ($G == -1) {
547 return undef;
548 }
549 if ($O{type} eq 'G94') {
550 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
551 ."\x1B".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
552 } elsif ($O{type} eq 'G94n') {
553 if ($G == 0 && !$C->{option}->{G94n_designate_long}
554 && ($O{charset} eq '@' || $O{charset} eq 'A' || $O{charset} eq 'B')) {
555 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
556 ."\x1B\x24".$O{charset};
557 } else {
558 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
559 ."\x1B\x24".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
560 }
561 } elsif ($O{type} eq 'G96') {
562 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
563 ."\x1B".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
564 } elsif ($O{type} eq 'G96n') {
565 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
566 ."\x1B\x24".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
567 }
568 $C->{'G'.$G} = $CHARSET{$O{type}}->{$O{charset}};
569 }
570 ## -- invoke G buffer
571 my $left = $C->{option}->{Ginvoke_to_left}->[$G];
572 if ($C->{GL} eq 'G'.$G) {
573 $left = 1;
574 } elsif ($C->{GR} eq 'G'.$G) {
575 $left = 0;
576 } else {
577 if ($C->{option}->{Ginvoke_by_single_shift}->[$G]) {
578 if ($C->{C1}->{'C_SS'.$G}) {
579 $r .= _i2c ($C->{C1}->{'C_SS'.$G}, $C, type => 'C1') || return undef;
580 } elsif ($C->{C0}->{'C_SS'.$G}) {
581 $r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef;
582 } else { ## Both C0 and C1 set do not have SS2/3.
583 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
584 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
585 }
586 } else {
587 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
588 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
589 }
590 }
591 $s =~ tr/\x00-\x7F/\x80-\xFF/ unless $left;
592 $r . $s;
593 }
594 sub _back2ascii (%) {
595 my ($C, %O) = @_;
596 my $r = '';
597 if ($C->{option}->{reset}->{Gdesignation}) {
598 my $F = $C->{option}->{reset}->{Gdesignation}; # \x42
599 $r .= "\x1B\x28".$F unless $C->{G0} eq $CHARSET{G94}->{$F};
600 $C->{G0} = $CHARSET{G94}->{$F};
601 if ($O{reset_all}) {
602 $C->{G1} = $CHARSET{G94}->{"\x7E"};
603 $C->{G2} = $CHARSET{G94}->{"\x7E"};
604 $C->{G3} = $CHARSET{G94}->{"\x7E"};
605 }
606 }
607 if ($C->{option}->{reset}->{Ginvoke}) {
608 if ($C->{GL} ne 'G0') {
609 $r .= $C->{C0}->{C_LS0} || ($C->{C0} = $CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F");
610 $C->{GL} = 'G0';
611 }
612 $C->{GR} = undef if $O{reset_all};
613 }
614 $r;
615 }
616 ## __invoke (\%C, $G, $left_or_right)
617 sub __invoke (\%$$) {
618 my ($C, $G) = @_;
619 if ($_[2]) {
620 $C->{GL} = 'G'.$G;
621 return ($C->{C0}->{C_LS0}
622 || scalar ($C->{C0}=$CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F"),
623 $C->{C0}->{C_LS1}, "\x1B\x6E", "\x1B\x6F")[$G];
624 } else {
625 $C->{GR} = 'G'.$G;
626 return ("", "\x1B\x7E", "\x1B\x7D", "\x1B\x7C")[$G];
627 }
628 '';
629 }
630 sub _i2o ($\%%) {
631 my ($s, $C, %O) = @_;
632 my $CS = $CODING_SYSTEM{ $O{cs_F} } || $CODING_SYSTEM{ $O{cs_id} } || return undef;
633 my $r = '';
634 if ($CS ne $C->{coding_system}) {
635 my $e = '';
636 $e .= "\x1B\x25";
637 $e .= $O{cs_F} || $C->{private_set}->{coding_system}->{ $O{cs_id} }
638 || return undef;
639 if ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x40"}
640 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x43"}
641 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x45"}
642 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4A"}
643 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4B"}
644 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4C"}) {
645 $e =~ s/(.)/\x00$1/go;
646 } elsif ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x41"}
647 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x44"}
648 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x46"}) {
649 $e =~ s/(.)/\x00\x00\x00$1/go;
650 }
651 $r .= $e;
652 $C->{coding_system} = $CS;
653 if ($CS->{reset_state}) {
654 $C->{GL} = undef; $C->{GR} = undef;
655 $C->{C0} = $CHARSET{C0}->{"\x7E"};
656 $C->{C1} = $CHARSET{C1}->{"\x7E"};
657 $C->{G0} = $CHARSET{G94}->{"\x7E"};
658 $C->{G1} = $CHARSET{G94}->{"\x7E"};
659 $C->{G2} = $CHARSET{G94}->{"\x7E"};
660 $C->{G3} = $CHARSET{G94}->{"\x7E"};
661 }
662 }
663 if ($CS eq $CODING_SYSTEM{"\x40"}) {
664 #
665 } elsif ($CS eq $CODING_SYSTEM{G} || $CS eq $CODING_SYSTEM{'/G'}
666 || $CS eq $CODING_SYSTEM{'/H'} || $CS eq $CODING_SYSTEM{'/I'}) {
667 Encode::_utf8_off ($s);
668 } elsif ($CS eq $CODING_SYSTEM{'/@'} || $CS eq $CODING_SYSTEM{'/C'}
669 || $CS eq $CODING_SYSTEM{'/E'}) {
670 $s = Encode::encode ('ucs-2be', $s);
671 } elsif ($CS eq $CODING_SYSTEM{'/A'} || $CS eq $CODING_SYSTEM{'/D'}
672 || $CS eq $CODING_SYSTEM{'/F'}) {
673 $s = Encode::encode ('ucs-4be', $s);
674 } elsif ($CS eq $CODING_SYSTEM{'/J'} || $CS eq $CODING_SYSTEM{'/K'}
675 || $CS eq $CODING_SYSTEM{'/L'}) {
676 $s = Encode::encode ('UTF-16BE', $s);
677 } elsif ($CS eq $CODING_SYSTEM{B}) {
678 $s = Encode::encode ('utf-1', $s);
679 } else {
680 return undef;
681 }
682 $r . $s;
683 }
684
685 1;
686 __END__
687
688 =head1 SEE ALSO
689
690 ISO/IEC 646:1991, "7-bit coded graphic character set for intormation interchange".
691
692 ISO/IEC 2022:1994, "Character Code Structure and Extension Techniques".
693 (IDT with ECMA 35, JIS X 0202:1998)
694
695 ISO/IEC 4873:1991, "8-Bit Coded Character Set Structure and Rules".
696 (IDT with ECMA 43)
697
698 ISO/IEC 6429:1992, "Control Functions for Coded Character Sets".
699 (IDT with ECMA 48:1991, JIS X 0211:1998)
700
701 ISO/IEC 8859, "8-Bit Single-Byte Coded Graphic Character Sets".
702
703 Encode, perlunicode
704
705 =head1 TODO
706
707 =over 4
708
709 =item NCR (coding system other than ISO/IEC 2022) support
710
711 =over 2
712
713 =item ESC 02/05 02/15 03/x of X Compound Text
714
715 =back
716
717 =item Output of control character sets, single control functions
718
719 =item Designation sequence of control character sets (input)
720
721 =item Special graphic character sets such as G3 of EUC-TW
722
723 =item SUPER SHIFT (SS) invoke function of old control character set
724
725 =item Safe transparent of control string (ISO/IEC 6429)
726
727 =item Output of unoutputable characters as alternative notation such as SGML-like entity
728
729 =item C0 set invoked to CR area like ISIRI code
730
731 Really need?
732
733 =item special treatment of 0x20, 0x7E, 0xA0, 0xFF
734
735 For example, GB mongolian sets use MSP (MONGOLIAN SPACE)
736 with these code positions.
737
738 And, no less coding systems does not use (or does ban using) DEL.
739
740 =item A lot of character sets don't have pseudo-UCS mapping.
741
742 Most of 9m^n (n >= 3) sets, 9m^n sets with I byte, 9m^n
743 DRCSes do not have pseudo-UCS mapping area. It is
744 questionable to allocate lots of code positions to these
745 rarely-(or no-)used character sets.
746
747 =item Even character sets that have pseudo-UCS mapping, some of them can't be outputed in ISO/IEC 2022.
748
749 Because output of rarely-used character sets is
750 not implemented yet.
751
752 =back
753
754 =head1 AUTHORS
755
756 Nanashi-san
757
758 Wakaba <w@suika.fam.cx>
759
760 =head1 LICENSE
761
762 Copyright 2002 AUTHORS
763
764 This library is free software; you can redistribute it
765 and/or modify it under the same terms as Perl itself.
766
767 =cut
768
769 # $Date: 2002/09/21 01:34:08 $
770 ### ISO2022.pm ends here

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24