/[suikacvs]/perl/lib/Encode/ISO2022.pm
Suika

Contents of /perl/lib/Encode/ISO2022.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (show annotations) (download)
Fri Sep 20 14:01:45 2002 UTC (22 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.4: +139 -165 lines
2002-09-20  Wakaba <w@suika.fam.cx>

	* ISO2022.pm:
	- (iso2022_to_internal): New function.
	- (_iso2022_to_internal): Renamed from iso2022_to_internal.
	- (iso2022_to_internal): Experimental support of DOCS.
	- (internal_to_iso2022): Output in UCS coding systems
	if the character is unable to be encoded in ISO/IEC 2022
	coded character sets.
	- (_i2o): New procedure.
	- ($C->{option}->{designate_to}->{coding_system}): New option
	property object.
	- ($C->{coding_system}): New property.
	- (%CODING_SYSTEM): New hash.  (Alias to Encode::Charset's one.)
	* Charset.pm (make_initial_coding_system): Set 'reset_state'
	property with 1 value to coding systems of DOCS with 02/14 I byte.

1
2 =head1 NAME
3
4 Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder
5
6 =cut
7
8 require v5.7.3;
9 package Encode::ISO2022;
10 use strict;
11 use vars qw(%CHARSET %CODING_SYSTEM $VERSION);
12 $VERSION=do{my @r=(q$Revision: 1.4 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
13 use base qw(Encode::Encoding);
14 __PACKAGE__->Define (qw/iso-2022 iso2022 2022 cp2022/);
15 require Encode::Charset;
16 *CHARSET = \%Encode::Charset::CHARSET;
17 *CODING_SYSTEM = \%Encode::Charset::CODING_SYSTEM;
18
19 ### --- Intialization
20
21 my %_CHARS_to_RANGE = (
22 l94 => q/[\x21-\x7E]/,
23 l96 => q/[\x20-\x7F]/,
24 l128 => q/[\x00-\x7F]/,
25 l256 => q/[\x00-\xFF]/,
26 r94 => q/[\xA1-\xFE]/,
27 r96 => q/[\xA0-\xFF]/,
28 r128 => q/[\x80-\xFF]/,
29 r256 => q/[\x80-\xFF]/,
30 b94 => q/[\x21-\x7E\xA1-\xFE]/,
31 b96 => q/[\x20-\x7F\xA0-\xFF]/,
32 b128 => q/[\x00-\xFF]/,
33 b256 => q/[\x00-\xFF]/,
34 );
35
36 ### --- Perl Encode module common functions
37
38 sub encode ($$;$) {
39 my ($obj, $str, $chk) = @_;
40 $_[1] = '' if $chk;
41 $str = &internal_to_iso2022 ($str);
42 return $str;
43 }
44
45 sub decode ($$;$) {
46 my ($obj, $str, $chk) = @_;
47 $_[1] = '' if $chk;
48 return &iso2022_to_internal ($str);
49 }
50
51 ### --- Encode::ISO2022 unique functions
52
53 ## Make a new ISO/IEC 2022-buffers object with default status
54 sub new_object {
55 my %C;
56 $C{bit} = 8;
57 $C{CL} = 'C0'; $C{CR} = 'C1'; $C{ESC_Fe} = 'C1';
58 $C{C0} = $CHARSET{C0}->{"\x40"}; ## ISO/IEC 6429:1991 C0
59 $C{C1} = $CHARSET{C1}->{'64291991C1'}; ## ISO/IEC 6429:1991 C1
60 $C{GL} = 'G0'; $C{GR} = 'G1';
61 $C{G0} = $CHARSET{G94}->{"\x42"}; ## ISO/IEC 646:1991 IRV
62 #$C{G1} = $CHARSET{G96}->{"\x41"}; ## ISO/IEC 8859-1 GR
63 $C{G1} = $CHARSET{G94}->{"\x7E"}; ## empty set
64 $C{G2} = $CHARSET{G94}->{"\x7E"}; ## empty set
65 $C{G3} = $CHARSET{G94}->{"\x7E"}; ## empty set
66 $C{coding_system} = $CODING_SYSTEM{"\x40"}; ## ISO/IEC 2022
67 $C{option} = {
68 C1invoke_to_right => 0, ## C1 invoked to: (0: ESC Fe, 1: CR)
69 G94n_designate_long => 0, ## (1: ESC 02/04 02/08 04/00..02)
70 designate_to => { ## Designated G buffer (-1: not be outputed)
71 C0 => {
72 default => 0,
73 },
74 C1 => {
75 default => 1,
76 },
77 G94 => {
78 "\x42" => 0,
79 default => 0,
80 },
81 G96 => {
82 default => 1,
83 },
84 G94n => {
85 default => 0,
86 },
87 G96n => {
88 default => 1,
89 },
90 coding_system => {
91 default => -1,
92 },
93 },
94 Ginvoke_by_single_shift => [0,0,0,0], ## Invoked by SS
95 Ginvoke_to_left => [1,1,1,1], ## Which invoked to? (1: L, 0: R)
96 private_set => { ## Private set vs Final byte
97 C0 => [],
98 C1 => [],
99 G94 => [],
100 G94n => [[],[],[],[],[]],
101 G96 => [],
102 #G96n => [], ## (not implemented)
103 U96n => [], ## mule-unicode sets
104 XC1 => {
105 '64291991C1' => undef, ## ISO/IEC 6429:1991 C1
106 },
107 },
108 reset => { ## Reset status at top of line
109 Gdesignation => "\x42", ## F of designation or 0
110 Ginvoke => 1,
111 },
112 undef_char => ["\x3F", {type => 'G94', charset => 'B'}],
113 use_revision => 1, ## Output IRR
114 };
115 \%C;
116 }
117
118 sub iso2022_to_internal ($;\%) {
119 my ($s, $C) = @_;
120 my $t = '';
121 $s =~ s{
122 ^((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
123 }{
124 my $i2 = $1;
125 $t = _iso2022_to_internal ($i2, $C);
126 '';
127 }gesx;
128 $s =~ s{
129 ## ISO/IEC 2022
130 \x1B\x25\x40((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
131 ## UTF-8
132 |\x1B\x25(?:\x47|\x2F[\x47-\x49])((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
133 ## UCS-2, UTF-16
134 |\x1B\x25\x2F[\x40\x43\x45\x4A-\x4C]
135 ((?!\x00\x1B\x00\x25\x00\x2F?\x00[\x30-\x7E].)*)
136 ## UCS-4
137 |\x1B\x25\x2F[\x41\x44\x46]
138 ((?!\x00\x00\x00\x1B\x00\x00\x00\x25\x00\x00\x00\x2F?
139 \x00\x00\x00[\x30-\x7E].)*)
140 ## with standard return
141 |\x1B\x25([\x30-\x7E])((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
142 ## without standard return
143 |\x1B\x25\x2F([\x30-\x7E])(.*)
144 }{
145 my ($i2,$u8,$Fu2,$u2,$u4,$Fsr,$sr,$Fnsr,$nsr) = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
146 my $r = '';
147 if (defined $i2) {
148 $r = _iso2022_to_internal ($i2, $C);
149 } elsif (defined $u8) {
150 $r = Encode::decode ('utf8', $u8);
151 } elsif ($Fu2) {
152 if (ord ($Fu2) > 0x49) {
153 $r = Encode::decode ('utf-16be', $u2);
154 } else {
155 $r = Encode::decode ('ucs-2be', $u2);
156 }
157 } elsif (defined $u4) {
158 $r = Encode::decode ('ucs-4be', $u2);
159 } else { ## temporary
160 $r = '?+';
161 }
162 $r;
163 }gesx;
164 $t . $s;
165 }
166
167 sub _iso2022_to_internal ($;\%) {
168 my ($s, $C) = @_;
169 my %_GB_to_GN = (
170 "\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3',
171 "\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3',
172 );
173 $C ||= &new_object;
174
175 use re 'eval';
176 $s =~ s{
177 ((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}}
178 . qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ }))
179 |((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GL}}->{chars}}
180 . qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ }))
181
182 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)')
183 . ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ?
184 qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '')
185 . ($C->{$C->{CL}}->{r_SS2} ? qq/|$C->{$C->{CL}}->{r_SS2}/ : '') . q/)/
186 . ( $C->{$C->{CL}}->{r_LS0}
187 ||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
188 qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
189 })
190 ((??{ $_CHARS_to_RANGE{'b'.$C->{$C->{GL}}->{chars}}
191 . qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ }))
192 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)')
193 . ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ?
194 qq/|$C->{$C->{ESC_Fe}}->{r_SS3_ESC}/ : '')
195 . ($C->{$C->{CL}}->{r_SS3} ? qq/|$C->{$C->{CL}}->{r_SS3}/ : '') . q/)/
196 . ( $C->{$C->{CL}}->{r_LS0}
197 || $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
198 qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
199 })
200 ((??{ $_CHARS_to_RANGE{'b'.$C->{$C->{GL}}->{chars}}
201 . qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ }))
202
203 ## Locking shift
204 |( \x1B[\x6E\x6F\x7C-\x7E]
205 |(??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })
206 |(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })
207 )
208
209 ## Control sequence
210 |(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)')
211 .($C->{$C->{ESC_Fe}}->{r_CSI_ESC} ?
212 qq/|$C->{$C->{ESC_Fe}}->{r_CSI_ESC}/: '')
213 .')'
214 })
215 ((??{ qq/[\x30-\x3F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xB0-\xBF]*/
216 .qq/[\x20-\x2F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xA0-\xAF]*/
217 }) [\x40-\x7E\xD0-\xFE])
218
219 ## Other escape sequence
220 |(\x1B[\x20-\x2F]*[\x30-\x7E])
221
222 ## Misc. sequence (SP, control, or broken data)
223 |([\x00-\xFF])
224 }{
225 my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc)
226 = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
227 $C->{_irr} = undef unless defined $esc;
228 ## GL graphic character
229 if (defined $gl) {
230 my $c = 0;
231 my $m = $C->{$C->{GL}}->{chars}==94?0x21:$C->{$C->{GL}}->{chars}==96?0x20:0;
232 for (split //, $gl) {
233 $c = $c * $C->{$C->{GL}}->{chars} + unpack ('C', $_) - $m;
234 }
235 chr ($C->{$C->{GL}}->{ucs} + $c);
236 ## Control, SP, or broken data
237 ## TODO: support control sets other than ISO/IEC 6429's
238 } elsif (defined $misc) {
239 $misc;
240 ## GR graphic character
241 } elsif ($gr) {
242 my $c = 0;
243 my $m = $C->{$C->{GR}}->{chars}==94?0xA1:$C->{$C->{GR}}->{chars}==96?0xA0:0x80;
244 for (split //, $gr) {
245 $c = $c * $C->{$C->{GR}}->{chars} + unpack ('C', $_) - $m;
246 }
247 chr ($C->{$C->{GR}}->{ucs} + $c);
248 ## Graphic character with SS2
249 } elsif ($ss2) {
250 $ss2 =~ tr/\x80-\xFF/\x00-\x7F/;
251 my $c = 0; my $m = $C->{G2}->{chars}==94?0x21:$C->{G2}->{chars}==96?0x20:0;
252 for (split //, $ss2) {
253 $c = $c * $C->{G2}->{chars} + unpack ('C', $_) - $m;
254 }
255 chr ($C->{G2}->{ucs} + $c);
256 ## Graphic character with SS3
257 } elsif ($ss3) {
258 $ss3 =~ tr/\x80-\xFF/\x00-\x7F/;
259 my $c = 0; my $m = $C->{G3}->{chars}==94?0x21:$C->{G3}->{chars}==96?0x20:0;
260 for (split //, $ss3) {
261 $c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m;
262 }
263 chr ($C->{G3}->{ucs} + $c);
264 ## Escape sequence
265 } elsif ($esc) {
266 ## IRR (revision number)
267 if ($esc =~ /\x1B\x26([\x40-\x7E])/) {
268 $C->{_irr} = $1; $esc = '';
269 } else {
270 $esc =~ s{
271 \x1B([\x28-\x2B])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^1
272 |\x1B\x24([\x28-\x2B]?)(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^n
273
274 |\x1B([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^1
275 |\x1B\x24([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^n
276
277 |\x1B([\x40-\x5F]) ## ESC Fe
278
279 |\x1B\x21([\x21-\x23]?[\x30-\x7E]) ## CL = C0
280 |\x1B\x22([\x21-\x23]?[\x30-\x7E]) ## CR & ESC Fe = C1
281
282 |\x1B([\x60-\x7E]) ## Single control functions
283 |\x1B\x23([\x21-\x23]?)([\x30-\x7E])
284
285 |\x1B\x20([\x40-\x7E]) ## Announcer
286 }{
287 my ($g94_g,$g94_f,$g94n_g,$g94n_f,$g96_g,$g96_f,$g96n_g,$g96n_f,$Fe,
288 $CZD, $C1D, $Fs, $sI, $sF,$ACS)
289 = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15);
290 my $rev = $C->{_irr} || '';
291 if ($g94_g) {
292 $C->{ $_GB_to_GN{ $g94_g } } = $CHARSET{G94}->{ $g94_f.$rev }
293 || $CHARSET{G94}->{ $g94_f }
294 || $CHARSET{G94}->{ "\x7E" }; '';
295 } elsif (defined $g94n_f) {
296 $C->{ $_GB_to_GN{ $g94n_g } || 'G0' } = $CHARSET{G94n}->{ $g94n_f.$rev }
297 || $CHARSET{G94n}->{ $g94n_f }
298 || $CHARSET{G94n}->{ "\x7E" }; '';
299 } elsif ($g96_g) {
300 $C->{ $_GB_to_GN{ $g96_g } } = $CHARSET{G96}->{ $g96_f.$rev }
301 || $CHARSET{G96}->{ $g96_f }
302 || $CHARSET{G96}->{ "\x7E" }; '';
303 } elsif (defined $g96n_f) {
304 $C->{ $_GB_to_GN{ $g96n_g } } = $CHARSET{G96n}->{ $g96n_f.$rev }
305 || $CHARSET{G96n}->{ $g96n_f }
306 || $CHARSET{G96n}->{ "\x7E" }; '';
307 } elsif ($Fe) { ## ESC Fe => C1
308 chr ($C->{ $C->{ESC_Fe} }->{ucs} + (ord ($Fe) - 0x40));
309 } elsif ($CZD) {
310 $C->{C0} = $CHARSET{C0}->{ $CZD.$rev }
311 || $CHARSET{C0}->{ $CZD } || $CHARSET{C0}->{ "\x7E" }; '';
312 } elsif ($C1D) {
313 $C->{C1} = $CHARSET{C1}->{ $C1D.$rev }
314 || $CHARSET{C1}->{ $C1D } || $CHARSET{C1}->{ "\x7E" }; '';
315 } elsif ($Fs) {
316 if ($Fs eq "\x6E") { ## LS2
317 $C->{GL} = 'G2'; '';
318 } elsif ($Fs eq "\x6F") { ## LS3
319 $C->{GL} = 'G3'; '';
320 } elsif ($Fs eq "\x7E") { ## LS1R
321 $C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; '';
322 } elsif ($Fs eq "\x7D") { ## LS2R
323 $C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; '';
324 } elsif ($Fs eq "\x7C") { ## LS3R
325 $C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; '';
326 } else {
327 chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60));
328 }
329 } elsif ($sI) {
330 chr ($CHARSET{single_control}->{'3F'.$sI}->{ucs} + (ord ($sF) - 0x30));
331 } elsif ($ACS) { ## Announcer
332 if ($ACS eq "\x4A") { $C->{bit} = 7 }
333 elsif ($ACS eq "\x4B") { $C->{bit} = 8 }
334 '';
335 }
336 }gex;
337 $C->{_irr} = undef;
338 }
339 $esc;
340 ## Locking shifts
341 } elsif ($ls) {
342 if ($ls eq $C->{$C->{CL}}->{LS0}) {
343 $C->{GL} = 'G0'; '';
344 } elsif ($ls eq $C->{$C->{CL}}->{LS1}) {
345 $C->{GL} = 'G1'; '';
346 } elsif ($ls =~ /\x1B([\x6E\x6F])/) {
347 $C->{GL} = {"\x6E"=>2, "\x6F"=>3}->{$1}; '';
348 } elsif ($ls =~ /\x1B([\x7C-\x7E])/) {
349 $C->{GR} = {"\x7E"=>1, "\x7D"=>2, "\x7C"=>3}->{$1}; '';
350 }
351 ## Control sequence
352 } elsif ($csi) {
353 $csi =~ tr/\xA0-\xFF/\x20-\x7F/d;
354 $csi =~ s/$C->{$C->{CL}}->{LS0}//g if $C->{$C->{CL}}->{LS0};
355 $csi =~ s/$C->{$C->{CL}}->{LS1}//g if $C->{$C->{CL}}->{LS1};
356 "\x9B".$csi;
357 }
358 }gex;
359 $s;
360 }
361
362 sub internal_to_iso2022 ($\%) {
363 my ($s, $C) = @_;
364 $C ||= &new_object;
365
366 my $r = '';
367 for my $c (split //, $s) {
368 my $cc = ord $c;
369 my $t;
370 if ($cc <= 0x1F) {
371 $t = _i2c ($c, $C, type => 'C0', charset => '@');
372 } elsif ($cc == 0x20 || $cc == 0x7F) {
373 $t = _back2ascii ($C) . $c;
374 } elsif ($cc < 0x7F) {
375 $t = _i2g ($c, $C, type => 'G94', charset => 'B');
376 } elsif ($cc <= 0x9F) {
377 $t = _i2c ($c, $C, type => 'C1', charset_id => '64291991C1',
378 charset => $C->{private_set}->{XC1}->{'64291991C1'});
379 } elsif ($cc <= 0xFF) {
380 $t = _i2g (chr($cc-0x80), $C, type => 'G96', charset => 'A');
381 } elsif ($cc <= 0x24FF) {
382 my $c = $cc - 0x100;
383 my $final = $C->{private_set}->{U96n}->[0];
384 if (length $final) {
385 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
386 type => 'G96n', charset => $final);
387 }
388 } elsif ($cc <= 0x33FF) {
389 my $c = $cc - 0x2500;
390 my $final = $C->{private_set}->{U96n}->[1];
391 if (length $final) {
392 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
393 type => 'G96n', charset => $final);
394 }
395 } elsif (0xE000 <= $cc && $cc <= 0xFFFF) {
396 my $c = $cc - 0xE000;
397 my $final = $C->{private_set}->{U96n}->[2];
398 if (length $final) {
399 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
400 type => 'G96n', charset => $final);
401 }
402
403 } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
404 my $c = $cc - 0xE9F6C0;
405 $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
406 type => 'G94n', charset => chr(($c / 8836)+0x30));
407 } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
408 my $c = $cc - 0xF49D7C;
409 $t = _i2g (chr(($c / 94)+0x21).chr(($c % 94)+0x21), $C,
410 type => 'G94n', charset => 'B', revision => '@');
411
412 } elsif (0xF0000 <= $cc && $cc <= 0x10F7FF) {
413 my $c = $cc - 0xF0000;
414 $t = _i2g (chr((($c % 9216) / 96)+0x20).chr(($c % 96)+0x20), $C,
415 type => 'G96n', charset => "\x20".chr(($c / 9216)+0x40));
416 } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
417 my $c = $cc - 0xE90940;
418 $t = _i2g (chr(($c % 94)+0x21), $C,
419 type => 'G94', charset => chr(($c / 94)+0x30));
420 } elsif (0xE92642 <= $cc && $cc <= 0xE9269F) {
421 my $c = $cc - 0xE92642;
422 $t = _i2g (chr($c+0x21),$C,type => 'G94', charset => "\x64", revision => '@');
423 } elsif (0xE926A0 <= $cc && $cc <= 0xE9443F) {
424 my $c = $cc - 0xE926A0;
425 $t = _i2g (chr(($c % 96)+0x20), $C,
426 type => 'G96', charset => chr(($c / 96)+0x30));
427 } elsif (0xE944A0 <= $cc && $cc <= 0xE961A1) {
428 my $c = $cc - 0xE944A0;
429 $t = _i2g (chr(($c % 94)+0x21), $C,
430 type => 'G94', charset => '!'.chr(($c / 94)+0x30));
431 } elsif (0xE96200 <= $cc && $cc <= 0xE97F9F) {
432 my $c = $cc - 0xE96200;
433 $t = _i2g (chr(($c % 96)+0x20), $C,
434 type => 'G96', charset => '!'.chr(($c / 96)+0x30));
435 } elsif (0xE98000 <= $cc && $cc <= 0xE99D01) {
436 my $c = $cc - 0xE98000;
437 $t = _i2g (chr(($c % 94)+0x21), $C,
438 type => 'G94', charset => '"'.chr(($c / 94)+0x30));
439 } elsif (0xE99D60 <= $cc && $cc <= 0xE9BAFF) {
440 my $c = $cc - 0xE99D60;
441 $t = _i2g (chr(($c % 96)+0x20), $C,
442 type => 'G96', charset => '"'.chr(($c / 96)+0x30));
443 } elsif (0xE9BB60 <= $cc && $cc <= 0xE9D861) {
444 my $c = $cc - 0xE9BB60;
445 $t = _i2g (chr(($c % 94)+0x21), $C,
446 type => 'G94', charset => '#'.chr(($c / 94)+0x30));
447 } elsif (0xE9D8C0 <= $cc && $cc <= 0xE9F65F) {
448 my $c = $cc - 0xE9D8C0;
449 $t = _i2g (chr(($c % 96)+0x20), $C,
450 type => 'G96', charset => '#'.chr(($c / 96)+0x30));
451 } elsif (0x70090940 <= $cc && $cc <= 0x70092641) {
452 my $c = $cc - 0x70090940;
453 $t = _i2g (chr(($c % 94)+0x21), $C,
454 type => 'G94', charset => "\x20".chr(($c / 94)+0x30));
455 } elsif (0x700926A0 <= $cc && $cc <= 0x7009443F) {
456 my $c = $cc - 0x700926A0;
457 $t = _i2g (chr(($c % 96)+0x20), $C,
458 type => 'G96', charset => "\x20".chr(($c / 96)+0x30));
459 ## TODO: DRCS with I byte: U+700944A0-U+7009F6BF
460 } elsif (0x70400000 <= $cc && $cc <= 0x7040FFED) {
461 my $c = $cc - 0x70400000;
462 $t = _i2g (chr(($c % 94)+0x21), $C, charset_id => 'P'.int ($c / 94),
463 type => 'G94', charset => $C->{private_set}->{G94}->[ $c / 94 ]);
464 } elsif (0x70410000 <= $cc && $cc <= 0x7041FFBF) {
465 my $c = $cc - 0x70410000;
466 $t = _i2g (chr(($c % 96)+0x20), $C, charset_id => 'P'.int ($c / 96),
467 type => 'G96', charset => $C->{private_set}->{G96}->[ $c / 96 ]);
468 } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
469 my $c = $cc % 0x10000;
470 $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
471 type => 'G94n',
472 charset_id => 'P'.int(($cc / 0x10000) - 0x7042).'_'.int($c / 8836),
473 charset => $C->{private_set}->{G94n}->[ ($cc / 0x10000) - 0x7042 ]
474 ->[ $c / 8836 ]);
475 }
476 if (defined $t) {
477 $t = _i2o ($t, $C, cs_F => "\x40")
478 if $C->{coding_system} ne $CODING_SYSTEM{"\x40"};
479 } else {
480 my $F; my @F = qw~G /G /H /I B /A /D /F~;
481 push @F, qw~/J /K /L~ if $cc <= 0x10FFFF;
482 push @F, qw~/@ /C /E~ if $cc <= 0xFFFF;
483 for (@F) {
484 if (defined $C->{option}->{designate_to}->{coding_system}->{$_}
485 && $C->{option}->{designate_to}->{coding_system}->{$_} > -1) {
486 $F = $_; last;
487 } elsif ($C->{option}->{designate_to}->{coding_system}->{default} > -1) {
488 $F = $_; last;
489 }
490 }
491 $t = _i2o ($c, $C, cs_F => $F) if $F;
492 }
493 if (defined $t) {
494 $r .= $t;
495 } else {
496 $t = _i2g ($C->{option}->{undef_char}->[0], $C,
497 %{ $C->{option}->{undef_char}->[1] });
498 $r .= $C->{coding_system} eq $CODING_SYSTEM{"\x40"} ?
499 $t : _i2o ($t, $C, cs_F => "\x40");
500 }
501 }
502 $r . _back2ascii ($C);
503 }
504
505 ## $O{charset} eq undef means that charset is same as the current designated one.
506 sub _i2c ($%%) {
507 my ($s, $C, %O) = @_;
508 my $r = '';
509 if ($O{type} eq 'C0') {
510 if (defined $O{charset}) {
511 if ( $C->{C0} ne $CHARSET{C0}->{$O{charset}}
512 && $C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
513 for ($C->{option}->{designate_to}->{C0}->{$O{charset}},
514 $C->{option}->{designate_to}->{C0}->{default}) {
515 if (defined $_) { return undef if $_ == -1; last }
516 }
517 $r .= "\x1B\x21".$O{charset};
518 $C->{C0} = $CHARSET{C0}->{$O{charset}};
519 }
520 } elsif (defined $O{charset_id}) {
521 if ($C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
522 return undef; ## Control set is not designated nor has F byte
523 }
524 }
525 $r .= _back2ascii ($C, reset_all => $C->{C0}->{reset_all}->{$s});
526 return $r . $s;
527 } elsif ($O{type} eq 'C1') {
528 if (defined $O{charset}) {
529 if ( $C->{C1} ne $CHARSET{C1}->{$O{charset}}
530 && $C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
531 for ($C->{option}->{designate_to}->{C1}->{$O{charset}},
532 $C->{option}->{designate_to}->{C1}->{default}) {
533 if (defined $_) { return undef if $_ == -1; last }
534 }
535 $r .= "\x1B\x22".$O{charset};
536 $C->{C1} = $CHARSET{C1}->{$O{charset}};
537 }
538 } elsif (defined $O{charset_id}) {
539 if ($C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
540 return undef; ## Control set is not designated nor has F byte
541 }
542 }
543 $r .= _back2ascii ($C, reset_all => $C->{C1}->{reset_all}->{$s});
544 unless ($C->{option}->{C1invoke_to_right}) { ## ESC Fe
545 $s =~ s/([\x80-\x9F])/"\x1B" . chr (ord ($1) - 0x40)/ge;
546 }
547 return $r . $s;
548 }
549 }
550 sub _i2g ($%%) {
551 my ($s, $C, %O) = @_;
552 my $r = '';
553 my $set = $CHARSET{$O{type}}->{$O{charset}.
554 ($O{revision}&&$C->{option}->{use_revision}?$O{revision}:'')};
555 my $set0 = $CHARSET{$O{type}}->{$O{charset_id}};
556 ## -- designate character set
557 my $G = 0;
558 if ($C->{G0} eq $set || $C->{G0} eq $set0) { $G = 0 }
559 elsif ($C->{G1} eq $set || $C->{G1} eq $set0) { $G = 1 }
560 elsif ($C->{G2} eq $set || $C->{G2} eq $set0) { $G = 2 }
561 elsif ($C->{G3} eq $set || $C->{G3} eq $set0) { $G = 3 }
562 else {
563 return undef unless $set; ## charset does not have F byte
564 $G = 1 if $O{type} eq 'G96' || $O{type} eq 'G96n';
565 for ($C->{option}->{designate_to}->{$O{type}}->{$O{charset}},
566 $C->{option}->{designate_to}->{$O{type}}->{default}) {
567 if (defined $_) {
568 $G = $_; last;
569 }
570 }
571 if ($G == -1) {
572 return undef;
573 }
574 if ($O{type} eq 'G94') {
575 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
576 ."\x1B".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
577 } elsif ($O{type} eq 'G94n') {
578 if ($G == 0 && !$C->{option}->{G94n_designate_long}
579 && ($O{charset} eq '@' || $O{charset} eq 'A' || $O{charset} eq 'B')) {
580 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
581 ."\x1B\x24".$O{charset};
582 } else {
583 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
584 ."\x1B\x24".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
585 }
586 } elsif ($O{type} eq 'G96') {
587 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
588 ."\x1B".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
589 } elsif ($O{type} eq 'G96n') {
590 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
591 ."\x1B\x24".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
592 }
593 $C->{'G'.$G} = $CHARSET{$O{type}}->{$O{charset}};
594 }
595 ## -- invoke G buffer
596 my $left = $C->{option}->{Ginvoke_to_left}->[$G];
597 if ($C->{GL} eq 'G'.$G) {
598 $left = 1;
599 } elsif ($C->{GR} eq 'G'.$G) {
600 $left = 0;
601 } else {
602 if ($C->{option}->{Ginvoke_by_single_shift}->[$G]) {
603 if ($C->{C1}->{'C_SS'.$G}) {
604 $r .= _i2c ($C->{C1}->{'C_SS'.$G}, $C, type => 'C1') || return undef;
605 } elsif ($C->{C0}->{'C_SS'.$G}) {
606 $r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef;
607 } else { ## Both C0 and C1 set do not have SS2/3.
608 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
609 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
610 }
611 } else {
612 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
613 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
614 }
615 }
616 $s =~ tr/\x00-\x7F/\x80-\xFF/ unless $left;
617 $r . $s;
618 }
619 sub _back2ascii (%) {
620 my ($C, %O) = @_;
621 my $r = '';
622 if ($C->{option}->{reset}->{Gdesignation}) {
623 my $F = $C->{option}->{reset}->{Gdesignation}; # \x42
624 $r .= "\x1B\x28".$F unless $C->{G0} eq $CHARSET{G94}->{$F};
625 $C->{G0} = $CHARSET{G94}->{$F};
626 if ($O{reset_all}) {
627 $C->{G1} = $CHARSET{G94}->{"\x7E"};
628 $C->{G2} = $CHARSET{G94}->{"\x7E"};
629 $C->{G3} = $CHARSET{G94}->{"\x7E"};
630 }
631 }
632 if ($C->{option}->{reset}->{Ginvoke}) {
633 if ($C->{GL} ne 'G0') {
634 $r .= $C->{C0}->{C_LS0} || ($C->{C0} = $CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F");
635 $C->{GL} = 'G0';
636 }
637 $C->{GR} = undef if $O{reset_all};
638 }
639 $r;
640 }
641 ## __invoke (\%C, $G, $left_or_right)
642 sub __invoke (\%$$) {
643 my ($C, $G) = @_;
644 if ($_[2]) {
645 $C->{GL} = 'G'.$G;
646 return ($C->{C0}->{C_LS0}
647 || scalar ($C->{C0}=$CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F"),
648 $C->{C0}->{C_LS1}, "\x1B\x6E", "\x1B\x6F")[$G];
649 } else {
650 $C->{GR} = 'G'.$G;
651 return ("", "\x1B\x7E", "\x1B\x7D", "\x1B\x7C")[$G];
652 }
653 '';
654 }
655 sub _i2o ($\%%) {
656 my ($s, $C, %O) = @_;
657 my $CS = $CODING_SYSTEM{ $O{cs_F} } || $CODING_SYSTEM{ $O{cs_id} } || return undef;
658 my $r = '';
659 if ($CS ne $C->{coding_system}) {
660 my $e = '';
661 $e .= "\x1B\x25";
662 $e .= $O{cs_F} || $C->{private_set}->{coding_system}->{ $O{cs_id} }
663 || return undef;
664 if ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x40"}
665 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x43"}
666 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x45"}
667 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4A"}
668 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4B"}
669 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4C"}) {
670 $e =~ s/(.)/\x00$1/go;
671 } elsif ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x41"}
672 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x44"}
673 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x46"}) {
674 $e =~ s/(.)/\x00\x00\x00$1/go;
675 }
676 $r .= $e;
677 $C->{coding_system} = $CS;
678 if ($CS->{reset_state}) {
679 $C->{GL} = undef; $C->{GR} = undef;
680 $C->{C0} = $CHARSET{C0}->{"\x7E"};
681 $C->{C1} = $CHARSET{C1}->{"\x7E"};
682 $C->{G0} = $CHARSET{G94}->{"\x7E"};
683 $C->{G1} = $CHARSET{G94}->{"\x7E"};
684 $C->{G2} = $CHARSET{G94}->{"\x7E"};
685 $C->{G3} = $CHARSET{G94}->{"\x7E"};
686 }
687 }
688 if ($CS eq $CODING_SYSTEM{"\x40"}) {
689 #
690 } elsif ($CS eq $CODING_SYSTEM{G} || $CS eq $CODING_SYSTEM{'/G'}
691 || $CS eq $CODING_SYSTEM{'/H'} || $CS eq $CODING_SYSTEM{'/I'}) {
692 Encode::_utf8_off ($s);
693 } elsif ($CS eq $CODING_SYSTEM{'/@'} || $CS eq $CODING_SYSTEM{'/C'}
694 || $CS eq $CODING_SYSTEM{'/E'}) {
695 $s = Encode::encode ('ucs-2be', $s);
696 } elsif ($CS eq $CODING_SYSTEM{'/A'} || $CS eq $CODING_SYSTEM{'/D'}
697 || $CS eq $CODING_SYSTEM{'/F'}) {
698 $s = Encode::encode ('ucs-4be', $s);
699 } elsif ($CS eq $CODING_SYSTEM{'/J'} || $CS eq $CODING_SYSTEM{'/K'}
700 || $CS eq $CODING_SYSTEM{'/L'}) {
701 $s = Encode::encode ('UTF-16BE', $s);
702 } elsif ($CS eq $CODING_SYSTEM{B}) {
703 $s = Encode::encode ('utf-1', $s);
704 } else {
705 return undef;
706 }
707 $r . $s;
708 }
709
710 1;
711 __END__
712
713 =head1 SEE ALSO
714
715 ISO/IEC 646:1991, "7-bit coded graphic character set for intormation interchange".
716
717 ISO/IEC 2022:1994, "Character Code Structure and Extension Techniques".
718 (IDT with ECMA 35, JIS X 0202:1998)
719
720 ISO/IEC 4873:1991, "8-Bit Coded Character Set Structure and Rules".
721 (IDT with ECMA 43)
722
723 ISO/IEC 6429:1992, "Control Functions for Coded Character Sets".
724 (IDT with ECMA 48:1991, JIS X 0211:1998)
725
726 ISO/IEC 8859, "8-Bit Single-Byte Coded Graphic Character Sets".
727
728 Encode, perlunicode
729
730 =head1 TODO
731
732 =over 4
733
734 =item NCR (coding system other than ISO/IEC 2022) support
735
736 =over 2
737
738 =item ESC 02/05 02/15 03/x of X Compound Text
739
740 =back
741
742 =item Output of control character sets, single control functions
743
744 =item Designation sequence of control character sets (input)
745
746 =item Special graphic character sets such as G3 of EUC-TW
747
748 =item SUPER SHIFT (SS) invoke function of old control character set
749
750 =item Safe transparent of control string (ISO/IEC 6429)
751
752 =item Output of unoutputable characters as alternative notation such as SGML-like entity
753
754 =item C0 set invoked to CR area like ISIRI code
755
756 Really need?
757
758 =item special treatment of 0x20, 0x7E, 0xA0, 0xFF
759
760 For example, GB mongolian sets use MSP (MONGOLIAN SPACE)
761 with these code positions.
762
763 And, no less coding systems does not use (or does ban using) DEL.
764
765 =item A lot of character sets don't have pseudo-UCS mapping.
766
767 Most of 9m^n (n >= 3) sets, 9m^n sets with I byte, 9m^n
768 DRCSes do not have pseudo-UCS mapping area. It is
769 questionable to allocate lots of code positions to these
770 rarely-(or no-)used character sets.
771
772 =item Even character sets that have pseudo-UCS mapping, some of them can't be outputed in ISO/IEC 2022.
773
774 Because output of rarely-used character sets is
775 not implemented yet.
776
777 =back
778
779 =head1 AUTHORS
780
781 Nanashi-san
782
783 Wakaba <w@suika.fam.cx>
784
785 =head1 LICENSE
786
787 Copyright 2002 AUTHORS
788
789 This library is free software; you can redistribute it
790 and/or modify it under the same terms as Perl itself.
791
792 =cut
793
794 # $Date: 2002/09/16 06:35:16 $
795 ### ISO2022.pm ends here

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24