/[suikacvs]/perl/lib/Encode/ISO2022.pm
Suika

Contents of /perl/lib/Encode/ISO2022.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (show annotations) (download)
Sat Sep 21 01:34:08 2002 UTC (22 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.5: +63 -92 lines
2002-09-21  Wakaba <w@suika.fam.cx>

	* ISO2022.pm:
	- More DOCS support.
	- (new_object): Redefined as alias of Encode::Charset's.
	- (pod:ENCODINGS): New section.
	- Regists 'iso/iec2022' as encoding name.
	* Charset.pm:
	- (new_object): Moved from Encode::ISO2022.
	- (make_initial_coding_system): Define 'Csjis' coding system.

1
2 =head1 NAME
3
4 Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder
5
6 =head1 ENCODINGS
7
8 =over 4
9
10 =item iso2022
11
12 ISO/IEC 2022:1994. Default status is:
13
14 =over 2
15
16 =item CL = C0 = ISO/IEC 6429:1991 C0 set
17
18 =item CR = C1 = ISO/IEC 6429:1991 C1 set
19
20 =item GL = G0 = ISO/IEC 646:1991 IRV GL(G0) set
21
22 =item GR = G1 = empty set
23
24 =item G2 = empty set
25
26 =item G3 = empty set
27
28 =back
29
30 (Alias: iso/iec2022, iso-2022, 2022, cp2022)
31
32 =back
33
34 Note that ISO/IEC 2022 based encodings are found in
35 Encode::ISO2022::* modules. This module, Encode::ISO2022
36 only provides a general ISO/IEC 2022 encoder/decoder.
37
38 =cut
39
40 require v5.7.3;
41 package Encode::ISO2022;
42 use strict;
43 use vars qw(%CHARSET %CODING_SYSTEM $VERSION);
44 $VERSION=do{my @r=(q$Revision: 1.5 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
45 use base qw(Encode::Encoding);
46 __PACKAGE__->Define (qw!iso-2022 iso/iec2022 iso2022 2022 cp2022!);
47 require Encode::Charset;
48 *CHARSET = \%Encode::Charset::CHARSET;
49 *CODING_SYSTEM = \%Encode::Charset::CODING_SYSTEM;
50
51 ### --- Intialization
52
53 my %_CHARS_to_RANGE = (
54 l94 => q/[\x21-\x7E]/,
55 l96 => q/[\x20-\x7F]/,
56 l128 => q/[\x00-\x7F]/,
57 l256 => q/[\x00-\xFF]/,
58 r94 => q/[\xA1-\xFE]/,
59 r96 => q/[\xA0-\xFF]/,
60 r128 => q/[\x80-\xFF]/,
61 r256 => q/[\x80-\xFF]/,
62 b94 => q/[\x21-\x7E\xA1-\xFE]/,
63 b96 => q/[\x20-\x7F\xA0-\xFF]/,
64 b128 => q/[\x00-\xFF]/,
65 b256 => q/[\x00-\xFF]/,
66 );
67
68 ### --- Perl Encode module common functions
69
70 sub encode ($$;$) {
71 my ($obj, $str, $chk) = @_;
72 $_[1] = '' if $chk;
73 $str = &internal_to_iso2022 ($str);
74 return $str;
75 }
76
77 sub decode ($$;$) {
78 my ($obj, $str, $chk) = @_;
79 $_[1] = '' if $chk;
80 return &iso2022_to_internal ($str);
81 }
82
83 ### --- Encode::ISO2022 unique functions
84 *new_object = \&Encode::Charset::new_object;
85
86 sub iso2022_to_internal ($;\%) {
87 my ($s, $C) = @_;
88 $C ||= &new_object;
89 my $t = '';
90 $s =~ s{
91 ^((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
92 }{
93 my $i2 = $1;
94 $t = _iso2022_to_internal ($i2, $C);
95 '';
96 }gesx;
97 my $pad = '';
98 use re 'eval';
99 $s =~ s{
100 ## ISO/IEC 2022
101 (??{"$pad\x1B$pad\x25$pad\x40"})((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
102 ## UTF-8
103 |(??{"$pad\x1B$pad\x25$pad(?:\x47|\x2F$pad"."[\x47-\x49])"})
104 ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
105 ## UCS-2, UTF-16
106 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x40\x43\x45\x4A-\x4C])
107 ((?:(?!\x00\x1B\x00\x25(?:\x00\x2F)?\x00[\x30-\x7E])..)*)
108 ## UCS-4
109 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})[\x41\x44\x46]
110 ((?:(?!\x00\x00\x00\x1B\x00\x00\x00\x25(?:\x00\x00\x00\x2F)?
111 \x00\x00\x00[\x30-\x7E])....)*)
112 ## with standard return
113 |(??{"$pad\x1B$pad\x25$pad"})([\x30-\x7E])
114 ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
115 ## without standard return
116 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x30-\x7E])(.*)
117 }{
118 my ($i2,$u8,$Fu2,$u2,$u4,$Fsr,$sr,$Fnsr,$nsr) = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
119 my $r = '';
120 if (defined $i2) {
121 $r = _iso2022_to_internal ($i2, $C); $pad = '';
122 } elsif (defined $u8) {
123 $r = Encode::decode ('utf8', $u8); $pad = '';
124 } elsif ($Fu2) {
125 if (ord ($Fu2) > 0x49) {
126 $r = Encode::decode ('utf-16be', $u2);
127 } else {
128 $r = Encode::decode ('ucs-2be', $u2);
129 }
130 $pad = "\x00";
131 } elsif (defined $u4) {
132 $r = Encode::decode ('ucs-4be', $u2); $pad = "\x00\x00\x00";
133 } elsif (defined $Fsr && $CODING_SYSTEM{$Fsr}->{perl_name}) {
134 $r = Encode::decode ($CODING_SYSTEM{$Fsr}->{perl_name}, $sr); $pad = '';
135 } elsif (defined $Fnsr && $CODING_SYSTEM{$Fnsr}->{perl_name}) {
136 $r = Encode::decode ($CODING_SYSTEM{$Fnsr}->{perl_name}, $nsr); $pad = '';
137 } else { ## temporary
138 $r = '?' x length ($sr.$nsr); $pad = '';
139 }
140 $r;
141 }gesx;
142 $t . $s;
143 }
144
145 sub _iso2022_to_internal ($;\%) {
146 my ($s, $C) = @_;
147 my %_GB_to_GN = (
148 "\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3',
149 "\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3',
150 );
151
152 use re 'eval';
153 $s =~ s{
154 ((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}}
155 . qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ }))
156 |((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GR}}->{chars}}
157 . qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ }))
158 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)')
159 . ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ?
160 qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '')
161 . ($C->{$C->{CL}}->{r_SS2} ? qq/|$C->{$C->{CL}}->{r_SS2}/ : '') . q/)/
162 . ( $C->{$C->{CL}}->{r_LS0}
163 ||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
164 qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
165 })
166 ((??{ $_CHARS_to_RANGE{'b'.$C->{G2}->{chars}}
167 . qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ }))
168 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)')
169 . ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ?
170 qq/|$C->{$C->{ESC_Fe}}->{r_SS3_ESC}/ : '')
171 . ($C->{$C->{CL}}->{r_SS3} ? qq/|$C->{$C->{CL}}->{r_SS3}/ : '') . q/)/
172 . ( $C->{$C->{CL}}->{r_LS0}
173 || $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
174 qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
175 })
176 ((??{ $_CHARS_to_RANGE{'b'.$C->{G3}->{chars}}
177 . qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ }))
178
179 ## Locking shift
180 |( (??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })
181 |(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })
182 )
183
184 ## Control sequence
185 |(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)')
186 .($C->{$C->{ESC_Fe}}->{r_CSI_ESC} ?
187 qq/|$C->{$C->{ESC_Fe}}->{r_CSI_ESC}/: '')
188 .')'
189 })
190 ((??{ qq/[\x30-\x3F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xB0-\xBF]*/
191 .qq/[\x20-\x2F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xA0-\xAF]*/
192 }) [\x40-\x7E\xD0-\xFE])
193
194 ## Other escape sequence
195 |(\x1B[\x20-\x2F]*[\x30-\x7E])
196
197 ## Misc. sequence (SP, control, or broken data)
198 |([\x00-\xFF])
199 }{
200 my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc)
201 = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
202 $C->{_irr} = undef unless defined $esc;
203 ## GL graphic character
204 if (defined $gl) {
205 my $c = 0;
206 my $m = $C->{$C->{GL}}->{chars}==94?0x21:$C->{$C->{GL}}->{chars}==96?0x20:0;
207 for (split //, $gl) {
208 $c = $c * $C->{$C->{GL}}->{chars} + unpack ('C', $_) - $m;
209 }
210 chr ($C->{$C->{GL}}->{ucs} + $c);
211 ## Control, SP, or broken data
212 ## TODO: support control sets other than ISO/IEC 6429's
213 } elsif (defined $misc) {
214 $misc;
215 ## GR graphic character
216 } elsif ($gr) {
217 my $c = 0;
218 my $m = $C->{$C->{GR}}->{chars}==94?0xA1:$C->{$C->{GR}}->{chars}==96?0xA0:0x80;
219 for (split //, $gr) {
220 $c = $c * $C->{$C->{GR}}->{chars} + unpack ('C', $_) - $m;
221 }
222 chr ($C->{$C->{GR}}->{ucs} + $c);
223 ## Graphic character with SS2
224 } elsif ($ss2) {
225 $ss2 =~ tr/\x80-\xFF/\x00-\x7F/;
226 my $c = 0; my $m = $C->{G2}->{chars}==94?0x21:$C->{G2}->{chars}==96?0x20:0;
227 for (split //, $ss2) {
228 $c = $c * $C->{G2}->{chars} + unpack ('C', $_) - $m;
229 }
230 chr ($C->{G2}->{ucs} + $c);
231 ## Graphic character with SS3
232 } elsif ($ss3) {
233 $ss3 =~ tr/\x80-\xFF/\x00-\x7F/;
234 my $c = 0; my $m = $C->{G3}->{chars}==94?0x21:$C->{G3}->{chars}==96?0x20:0;
235 for (split //, $ss3) {
236 $c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m;
237 }
238 chr ($C->{G3}->{ucs} + $c);
239 ## Escape sequence
240 } elsif ($esc) {
241 ## IRR (revision number)
242 if ($esc =~ /\x1B\x26([\x40-\x7E])/) {
243 $C->{_irr} = $1; $esc = '';
244 } else {
245 $esc =~ s{
246 \x1B([\x28-\x2B])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^1
247 |\x1B\x24([\x28-\x2B]?)(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^n
248
249 |\x1B([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^1
250 |\x1B\x24([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^n
251
252 |\x1B([\x40-\x5F]) ## ESC Fe
253
254 |\x1B\x21([\x21-\x23]?[\x30-\x7E]) ## CL = C0
255 |\x1B\x22([\x21-\x23]?[\x30-\x7E]) ## CR & ESC Fe = C1
256
257 |\x1B([\x60-\x7E]) ## Single control functions
258 |\x1B\x23([\x21-\x23]?)([\x30-\x7E])
259
260 |\x1B\x20([\x40-\x7E]) ## Announcer
261 }{
262 my ($g94_g,$g94_f,$g94n_g,$g94n_f,$g96_g,$g96_f,$g96n_g,$g96n_f,$Fe,
263 $CZD, $C1D, $Fs, $sI, $sF,$ACS)
264 = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15);
265 my $rev = $C->{_irr} || '';
266 if ($g94_g) {
267 $C->{ $_GB_to_GN{ $g94_g } } = $CHARSET{G94}->{ $g94_f.$rev }
268 || $CHARSET{G94}->{ $g94_f }
269 || $CHARSET{G94}->{ "\x7E" }; '';
270 } elsif (defined $g94n_f) {
271 $C->{ $_GB_to_GN{ $g94n_g } || 'G0' } = $CHARSET{G94n}->{ $g94n_f.$rev }
272 || $CHARSET{G94n}->{ $g94n_f }
273 || $CHARSET{G94n}->{ "\x7E" }; '';
274 } elsif ($g96_g) {
275 $C->{ $_GB_to_GN{ $g96_g } } = $CHARSET{G96}->{ $g96_f.$rev }
276 || $CHARSET{G96}->{ $g96_f }
277 || $CHARSET{G96}->{ "\x7E" }; '';
278 } elsif (defined $g96n_f) {
279 $C->{ $_GB_to_GN{ $g96n_g } } = $CHARSET{G96n}->{ $g96n_f.$rev }
280 || $CHARSET{G96n}->{ $g96n_f }
281 || $CHARSET{G96n}->{ "\x7E" }; '';
282 } elsif ($Fe) { ## ESC Fe => C1
283 chr ($C->{ $C->{ESC_Fe} }->{ucs} + (ord ($Fe) - 0x40));
284 } elsif ($CZD) {
285 $C->{C0} = $CHARSET{C0}->{ $CZD.$rev }
286 || $CHARSET{C0}->{ $CZD } || $CHARSET{C0}->{ "\x7E" }; '';
287 } elsif ($C1D) {
288 $C->{C1} = $CHARSET{C1}->{ $C1D.$rev }
289 || $CHARSET{C1}->{ $C1D } || $CHARSET{C1}->{ "\x7E" }; '';
290 } elsif ($Fs) {
291 if ($Fs eq "\x6E") { ## LS2
292 $C->{GL} = 'G2'; '';
293 } elsif ($Fs eq "\x6F") { ## LS3
294 $C->{GL} = 'G3'; '';
295 } elsif ($Fs eq "\x7E") { ## LS1R
296 $C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; '';
297 } elsif ($Fs eq "\x7D") { ## LS2R
298 $C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; '';
299 } elsif ($Fs eq "\x7C") { ## LS3R
300 $C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; '';
301 } else {
302 chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60));
303 }
304 } elsif ($sI) {
305 chr ($CHARSET{single_control}->{'3F'.$sI}->{ucs} + (ord ($sF) - 0x30));
306 } elsif ($ACS) { ## Announcer
307 if ($ACS eq "\x4A") { $C->{bit} = 7 }
308 elsif ($ACS eq "\x4B") { $C->{bit} = 8 }
309 '';
310 }
311 }gex;
312 $C->{_irr} = undef;
313 }
314 $esc;
315 ## Locking shifts
316 } elsif ($ls) {
317 if ($ls eq $C->{$C->{CL}}->{LS0}) {
318 $C->{GL} = 'G0'; '';
319 } elsif ($ls eq $C->{$C->{CL}}->{LS1}) {
320 $C->{GL} = 'G1'; '';
321 }
322 ## Control sequence
323 } elsif ($csi) {
324 $csi =~ tr/\xA0-\xFF/\x20-\x7F/d;
325 $csi =~ s/$C->{$C->{CL}}->{LS0}//g if $C->{$C->{CL}}->{LS0};
326 $csi =~ s/$C->{$C->{CL}}->{LS1}//g if $C->{$C->{CL}}->{LS1};
327 "\x9B".$csi;
328 }
329 }gex;
330 $s;
331 }
332
333 sub internal_to_iso2022 ($\%) {
334 my ($s, $C) = @_;
335 $C ||= &new_object;
336
337 my $r = '';
338 for my $c (split //, $s) {
339 my $cc = ord $c;
340 my $t;
341 if ($cc <= 0x1F) {
342 $t = _i2c ($c, $C, type => 'C0', charset => '@');
343 } elsif ($cc == 0x20 || $cc == 0x7F) {
344 $t = _back2ascii ($C) . $c;
345 } elsif ($cc < 0x7F) {
346 $t = _i2g ($c, $C, type => 'G94', charset => 'B');
347 } elsif ($cc <= 0x9F) {
348 $t = _i2c ($c, $C, type => 'C1', charset_id => '64291991C1',
349 charset => $C->{private_set}->{XC1}->{'64291991C1'});
350 } elsif ($cc <= 0xFF) {
351 $t = _i2g (chr($cc-0x80), $C, type => 'G96', charset => 'A');
352 } elsif ($cc <= 0x24FF) {
353 my $c = $cc - 0x100;
354 my $final = $C->{private_set}->{U96n}->[0];
355 if (length $final) {
356 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
357 type => 'G96n', charset => $final);
358 }
359 } elsif ($cc <= 0x33FF) {
360 my $c = $cc - 0x2500;
361 my $final = $C->{private_set}->{U96n}->[1];
362 if (length $final) {
363 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
364 type => 'G96n', charset => $final);
365 }
366 } elsif (0xE000 <= $cc && $cc <= 0xFFFF) {
367 my $c = $cc - 0xE000;
368 my $final = $C->{private_set}->{U96n}->[2];
369 if (length $final) {
370 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
371 type => 'G96n', charset => $final);
372 }
373
374 } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
375 my $c = $cc - 0xE9F6C0;
376 $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
377 type => 'G94n', charset => chr(($c / 8836)+0x30));
378 } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
379 my $c = $cc - 0xF49D7C;
380 $t = _i2g (chr(($c / 94)+0x21).chr(($c % 94)+0x21), $C,
381 type => 'G94n', charset => 'B', revision => '@');
382
383 } elsif (0xF0000 <= $cc && $cc <= 0x10F7FF) {
384 my $c = $cc - 0xF0000;
385 $t = _i2g (chr((($c % 9216) / 96)+0x20).chr(($c % 96)+0x20), $C,
386 type => 'G96n', charset => "\x20".chr(($c / 9216)+0x40));
387 } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
388 my $c = $cc - 0xE90940;
389 $t = _i2g (chr(($c % 94)+0x21), $C,
390 type => 'G94', charset => chr(($c / 94)+0x30));
391 } elsif (0xE92642 <= $cc && $cc <= 0xE9269F) {
392 my $c = $cc - 0xE92642;
393 $t = _i2g (chr($c+0x21),$C,type => 'G94', charset => "\x64", revision => '@');
394 } elsif (0xE926A0 <= $cc && $cc <= 0xE9443F) {
395 my $c = $cc - 0xE926A0;
396 $t = _i2g (chr(($c % 96)+0x20), $C,
397 type => 'G96', charset => chr(($c / 96)+0x30));
398 } elsif (0xE944A0 <= $cc && $cc <= 0xE961A1) {
399 my $c = $cc - 0xE944A0;
400 $t = _i2g (chr(($c % 94)+0x21), $C,
401 type => 'G94', charset => '!'.chr(($c / 94)+0x30));
402 } elsif (0xE96200 <= $cc && $cc <= 0xE97F9F) {
403 my $c = $cc - 0xE96200;
404 $t = _i2g (chr(($c % 96)+0x20), $C,
405 type => 'G96', charset => '!'.chr(($c / 96)+0x30));
406 } elsif (0xE98000 <= $cc && $cc <= 0xE99D01) {
407 my $c = $cc - 0xE98000;
408 $t = _i2g (chr(($c % 94)+0x21), $C,
409 type => 'G94', charset => '"'.chr(($c / 94)+0x30));
410 } elsif (0xE99D60 <= $cc && $cc <= 0xE9BAFF) {
411 my $c = $cc - 0xE99D60;
412 $t = _i2g (chr(($c % 96)+0x20), $C,
413 type => 'G96', charset => '"'.chr(($c / 96)+0x30));
414 } elsif (0xE9BB60 <= $cc && $cc <= 0xE9D861) {
415 my $c = $cc - 0xE9BB60;
416 $t = _i2g (chr(($c % 94)+0x21), $C,
417 type => 'G94', charset => '#'.chr(($c / 94)+0x30));
418 } elsif (0xE9D8C0 <= $cc && $cc <= 0xE9F65F) {
419 my $c = $cc - 0xE9D8C0;
420 $t = _i2g (chr(($c % 96)+0x20), $C,
421 type => 'G96', charset => '#'.chr(($c / 96)+0x30));
422 } elsif (0x70090940 <= $cc && $cc <= 0x70092641) {
423 my $c = $cc - 0x70090940;
424 $t = _i2g (chr(($c % 94)+0x21), $C,
425 type => 'G94', charset => "\x20".chr(($c / 94)+0x30));
426 } elsif (0x700926A0 <= $cc && $cc <= 0x7009443F) {
427 my $c = $cc - 0x700926A0;
428 $t = _i2g (chr(($c % 96)+0x20), $C,
429 type => 'G96', charset => "\x20".chr(($c / 96)+0x30));
430 ## TODO: DRCS with I byte: U+700944A0-U+7009F6BF
431 } elsif (0x70400000 <= $cc && $cc <= 0x7040FFED) {
432 my $c = $cc - 0x70400000;
433 $t = _i2g (chr(($c % 94)+0x21), $C, charset_id => 'P'.int ($c / 94),
434 type => 'G94', charset => $C->{private_set}->{G94}->[ $c / 94 ]);
435 } elsif (0x70410000 <= $cc && $cc <= 0x7041FFBF) {
436 my $c = $cc - 0x70410000;
437 $t = _i2g (chr(($c % 96)+0x20), $C, charset_id => 'P'.int ($c / 96),
438 type => 'G96', charset => $C->{private_set}->{G96}->[ $c / 96 ]);
439 } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
440 my $c = $cc % 0x10000;
441 $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
442 type => 'G94n',
443 charset_id => 'P'.int(($cc / 0x10000) - 0x7042).'_'.int($c / 8836),
444 charset => $C->{private_set}->{G94n}->[ ($cc / 0x10000) - 0x7042 ]
445 ->[ $c / 8836 ]);
446 }
447 if (defined $t) {
448 $t = _i2o ($t, $C, cs_F => "\x40")
449 if $C->{coding_system} ne $CODING_SYSTEM{"\x40"};
450 } else {
451 my $F; my @F = qw~G /G /H /I B /A /D /F~;
452 push @F, qw~/J /K /L~ if $cc <= 0x10FFFF;
453 push @F, qw~/@ /C /E~ if $cc <= 0xFFFF;
454 for (@F) {
455 if (defined $C->{option}->{designate_to}->{coding_system}->{$_}
456 && $C->{option}->{designate_to}->{coding_system}->{$_} > -1) {
457 $F = $_; last;
458 } elsif ($C->{option}->{designate_to}->{coding_system}->{default} > -1) {
459 $F = $_; last;
460 }
461 }
462 $t = _i2o ($c, $C, cs_F => $F) if $F;
463 }
464 if (defined $t) {
465 $r .= $t;
466 } else {
467 $t = _i2g ($C->{option}->{undef_char}->[0], $C,
468 %{ $C->{option}->{undef_char}->[1] });
469 $r .= $C->{coding_system} eq $CODING_SYSTEM{"\x40"} ?
470 $t : _i2o ($t, $C, cs_F => "\x40");
471 }
472 }
473 $r . _back2ascii ($C);
474 }
475
476 ## $O{charset} eq undef means that charset is same as the current designated one.
477 sub _i2c ($%%) {
478 my ($s, $C, %O) = @_;
479 my $r = '';
480 if ($O{type} eq 'C0') {
481 if (defined $O{charset}) {
482 if ( $C->{C0} ne $CHARSET{C0}->{$O{charset}}
483 && $C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
484 for ($C->{option}->{designate_to}->{C0}->{$O{charset}},
485 $C->{option}->{designate_to}->{C0}->{default}) {
486 if (defined $_) { return undef if $_ == -1; last }
487 }
488 $r .= "\x1B\x21".$O{charset};
489 $C->{C0} = $CHARSET{C0}->{$O{charset}};
490 }
491 } elsif (defined $O{charset_id}) {
492 if ($C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
493 return undef; ## Control set is not designated nor has F byte
494 }
495 }
496 $r .= _back2ascii ($C, reset_all => $C->{C0}->{reset_all}->{$s});
497 return $r . $s;
498 } elsif ($O{type} eq 'C1') {
499 if (defined $O{charset}) {
500 if ( $C->{C1} ne $CHARSET{C1}->{$O{charset}}
501 && $C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
502 for ($C->{option}->{designate_to}->{C1}->{$O{charset}},
503 $C->{option}->{designate_to}->{C1}->{default}) {
504 if (defined $_) { return undef if $_ == -1; last }
505 }
506 $r .= "\x1B\x22".$O{charset};
507 $C->{C1} = $CHARSET{C1}->{$O{charset}};
508 }
509 } elsif (defined $O{charset_id}) {
510 if ($C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
511 return undef; ## Control set is not designated nor has F byte
512 }
513 }
514 $r .= _back2ascii ($C, reset_all => $C->{C1}->{reset_all}->{$s});
515 unless ($C->{option}->{C1invoke_to_right}) { ## ESC Fe
516 $s =~ s/([\x80-\x9F])/"\x1B" . chr (ord ($1) - 0x40)/ge;
517 }
518 return $r . $s;
519 }
520 }
521 sub _i2g ($%%) {
522 my ($s, $C, %O) = @_;
523 my $r = '';
524 my $set = $CHARSET{$O{type}}->{$O{charset}.
525 ($O{revision}&&$C->{option}->{use_revision}?$O{revision}:'')};
526 my $set0 = $CHARSET{$O{type}}->{$O{charset_id}};
527 ## -- designate character set
528 my $G = 0;
529 if ($C->{G0} eq $set || $C->{G0} eq $set0) { $G = 0 }
530 elsif ($C->{G1} eq $set || $C->{G1} eq $set0) { $G = 1 }
531 elsif ($C->{G2} eq $set || $C->{G2} eq $set0) { $G = 2 }
532 elsif ($C->{G3} eq $set || $C->{G3} eq $set0) { $G = 3 }
533 else {
534 return undef unless $set; ## charset does not have F byte
535 $G = 1 if $O{type} eq 'G96' || $O{type} eq 'G96n';
536 for ($C->{option}->{designate_to}->{$O{type}}->{$O{charset}},
537 $C->{option}->{designate_to}->{$O{type}}->{default}) {
538 if (defined $_) {
539 $G = $_; last;
540 }
541 }
542 if ($G == -1) {
543 return undef;
544 }
545 if ($O{type} eq 'G94') {
546 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
547 ."\x1B".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
548 } elsif ($O{type} eq 'G94n') {
549 if ($G == 0 && !$C->{option}->{G94n_designate_long}
550 && ($O{charset} eq '@' || $O{charset} eq 'A' || $O{charset} eq 'B')) {
551 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
552 ."\x1B\x24".$O{charset};
553 } else {
554 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
555 ."\x1B\x24".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
556 }
557 } elsif ($O{type} eq 'G96') {
558 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
559 ."\x1B".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
560 } elsif ($O{type} eq 'G96n') {
561 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
562 ."\x1B\x24".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
563 }
564 $C->{'G'.$G} = $CHARSET{$O{type}}->{$O{charset}};
565 }
566 ## -- invoke G buffer
567 my $left = $C->{option}->{Ginvoke_to_left}->[$G];
568 if ($C->{GL} eq 'G'.$G) {
569 $left = 1;
570 } elsif ($C->{GR} eq 'G'.$G) {
571 $left = 0;
572 } else {
573 if ($C->{option}->{Ginvoke_by_single_shift}->[$G]) {
574 if ($C->{C1}->{'C_SS'.$G}) {
575 $r .= _i2c ($C->{C1}->{'C_SS'.$G}, $C, type => 'C1') || return undef;
576 } elsif ($C->{C0}->{'C_SS'.$G}) {
577 $r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef;
578 } else { ## Both C0 and C1 set do not have SS2/3.
579 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
580 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
581 }
582 } else {
583 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
584 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
585 }
586 }
587 $s =~ tr/\x00-\x7F/\x80-\xFF/ unless $left;
588 $r . $s;
589 }
590 sub _back2ascii (%) {
591 my ($C, %O) = @_;
592 my $r = '';
593 if ($C->{option}->{reset}->{Gdesignation}) {
594 my $F = $C->{option}->{reset}->{Gdesignation}; # \x42
595 $r .= "\x1B\x28".$F unless $C->{G0} eq $CHARSET{G94}->{$F};
596 $C->{G0} = $CHARSET{G94}->{$F};
597 if ($O{reset_all}) {
598 $C->{G1} = $CHARSET{G94}->{"\x7E"};
599 $C->{G2} = $CHARSET{G94}->{"\x7E"};
600 $C->{G3} = $CHARSET{G94}->{"\x7E"};
601 }
602 }
603 if ($C->{option}->{reset}->{Ginvoke}) {
604 if ($C->{GL} ne 'G0') {
605 $r .= $C->{C0}->{C_LS0} || ($C->{C0} = $CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F");
606 $C->{GL} = 'G0';
607 }
608 $C->{GR} = undef if $O{reset_all};
609 }
610 $r;
611 }
612 ## __invoke (\%C, $G, $left_or_right)
613 sub __invoke (\%$$) {
614 my ($C, $G) = @_;
615 if ($_[2]) {
616 $C->{GL} = 'G'.$G;
617 return ($C->{C0}->{C_LS0}
618 || scalar ($C->{C0}=$CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F"),
619 $C->{C0}->{C_LS1}, "\x1B\x6E", "\x1B\x6F")[$G];
620 } else {
621 $C->{GR} = 'G'.$G;
622 return ("", "\x1B\x7E", "\x1B\x7D", "\x1B\x7C")[$G];
623 }
624 '';
625 }
626 sub _i2o ($\%%) {
627 my ($s, $C, %O) = @_;
628 my $CS = $CODING_SYSTEM{ $O{cs_F} } || $CODING_SYSTEM{ $O{cs_id} } || return undef;
629 my $r = '';
630 if ($CS ne $C->{coding_system}) {
631 my $e = '';
632 $e .= "\x1B\x25";
633 $e .= $O{cs_F} || $C->{private_set}->{coding_system}->{ $O{cs_id} }
634 || return undef;
635 if ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x40"}
636 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x43"}
637 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x45"}
638 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4A"}
639 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4B"}
640 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4C"}) {
641 $e =~ s/(.)/\x00$1/go;
642 } elsif ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x41"}
643 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x44"}
644 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x46"}) {
645 $e =~ s/(.)/\x00\x00\x00$1/go;
646 }
647 $r .= $e;
648 $C->{coding_system} = $CS;
649 if ($CS->{reset_state}) {
650 $C->{GL} = undef; $C->{GR} = undef;
651 $C->{C0} = $CHARSET{C0}->{"\x7E"};
652 $C->{C1} = $CHARSET{C1}->{"\x7E"};
653 $C->{G0} = $CHARSET{G94}->{"\x7E"};
654 $C->{G1} = $CHARSET{G94}->{"\x7E"};
655 $C->{G2} = $CHARSET{G94}->{"\x7E"};
656 $C->{G3} = $CHARSET{G94}->{"\x7E"};
657 }
658 }
659 if ($CS eq $CODING_SYSTEM{"\x40"}) {
660 #
661 } elsif ($CS eq $CODING_SYSTEM{G} || $CS eq $CODING_SYSTEM{'/G'}
662 || $CS eq $CODING_SYSTEM{'/H'} || $CS eq $CODING_SYSTEM{'/I'}) {
663 Encode::_utf8_off ($s);
664 } elsif ($CS eq $CODING_SYSTEM{'/@'} || $CS eq $CODING_SYSTEM{'/C'}
665 || $CS eq $CODING_SYSTEM{'/E'}) {
666 $s = Encode::encode ('ucs-2be', $s);
667 } elsif ($CS eq $CODING_SYSTEM{'/A'} || $CS eq $CODING_SYSTEM{'/D'}
668 || $CS eq $CODING_SYSTEM{'/F'}) {
669 $s = Encode::encode ('ucs-4be', $s);
670 } elsif ($CS eq $CODING_SYSTEM{'/J'} || $CS eq $CODING_SYSTEM{'/K'}
671 || $CS eq $CODING_SYSTEM{'/L'}) {
672 $s = Encode::encode ('UTF-16BE', $s);
673 } elsif ($CS eq $CODING_SYSTEM{B}) {
674 $s = Encode::encode ('utf-1', $s);
675 } else {
676 return undef;
677 }
678 $r . $s;
679 }
680
681 1;
682 __END__
683
684 =head1 SEE ALSO
685
686 ISO/IEC 646:1991, "7-bit coded graphic character set for intormation interchange".
687
688 ISO/IEC 2022:1994, "Character Code Structure and Extension Techniques".
689 (IDT with ECMA 35, JIS X 0202:1998)
690
691 ISO/IEC 4873:1991, "8-Bit Coded Character Set Structure and Rules".
692 (IDT with ECMA 43)
693
694 ISO/IEC 6429:1992, "Control Functions for Coded Character Sets".
695 (IDT with ECMA 48:1991, JIS X 0211:1998)
696
697 ISO/IEC 8859, "8-Bit Single-Byte Coded Graphic Character Sets".
698
699 Encode, perlunicode
700
701 =head1 TODO
702
703 =over 4
704
705 =item NCR (coding system other than ISO/IEC 2022) support
706
707 =over 2
708
709 =item ESC 02/05 02/15 03/x of X Compound Text
710
711 =back
712
713 =item Output of control character sets, single control functions
714
715 =item Designation sequence of control character sets (input)
716
717 =item Special graphic character sets such as G3 of EUC-TW
718
719 =item SUPER SHIFT (SS) invoke function of old control character set
720
721 =item Safe transparent of control string (ISO/IEC 6429)
722
723 =item Output of unoutputable characters as alternative notation such as SGML-like entity
724
725 =item C0 set invoked to CR area like ISIRI code
726
727 Really need?
728
729 =item special treatment of 0x20, 0x7E, 0xA0, 0xFF
730
731 For example, GB mongolian sets use MSP (MONGOLIAN SPACE)
732 with these code positions.
733
734 And, no less coding systems does not use (or does ban using) DEL.
735
736 =item A lot of character sets don't have pseudo-UCS mapping.
737
738 Most of 9m^n (n >= 3) sets, 9m^n sets with I byte, 9m^n
739 DRCSes do not have pseudo-UCS mapping area. It is
740 questionable to allocate lots of code positions to these
741 rarely-(or no-)used character sets.
742
743 =item Even character sets that have pseudo-UCS mapping, some of them can't be outputed in ISO/IEC 2022.
744
745 Because output of rarely-used character sets is
746 not implemented yet.
747
748 =back
749
750 =head1 AUTHORS
751
752 Nanashi-san
753
754 Wakaba <w@suika.fam.cx>
755
756 =head1 LICENSE
757
758 Copyright 2002 AUTHORS
759
760 This library is free software; you can redistribute it
761 and/or modify it under the same terms as Perl itself.
762
763 =cut
764
765 # $Date: 2002/09/20 14:01:45 $
766 ### ISO2022.pm ends here

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24