/[suikacvs]/perl/lib/Encode/ISO2022.pm
Suika

Contents of /perl/lib/Encode/ISO2022.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (show annotations) (download)
Thu Dec 12 08:17:16 2002 UTC (21 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.10: +2 -3 lines
]

1
2 =head1 NAME
3
4 Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder
5
6 =head1 ENCODINGS
7
8 =over 4
9
10 =item iso2022
11
12 ISO/IEC 2022:1994. Default status is:
13
14 =over 2
15
16 =item CL = C0 = ISO/IEC 6429:1991 C0 set
17
18 =item CR = C1 = ISO/IEC 6429:1991 C1 set
19
20 =item GL = G0 = ISO/IEC 646:1991 IRV GL(G0) set
21
22 =item GR = G1 = empty set
23
24 =item G2 = empty set
25
26 =item G3 = empty set
27
28 =back
29
30 (Alias: iso/iec2022, iso-2022, 2022, cp2022)
31
32 =back
33
34 Note that ISO/IEC 2022 based encodings are found in
35 Encode::ISO2022::* modules. This module, Encode::ISO2022
36 only provides a general ISO/IEC 2022 encoder/decoder.
37
38 =cut
39
40 require v5.7.3;
41 package Encode::ISO2022;
42 use strict;
43 use vars qw(%CHARSET %CODING_SYSTEM $VERSION);
44 $VERSION=do{my @r=(q$Revision: 1.10 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
45 use base qw(Encode::Encoding);
46 __PACKAGE__->Define (qw!iso-2022 iso/iec2022 iso2022 2022 cp2022!);
47 require Encode::Charset;
48 *CHARSET = \%Encode::Charset::CHARSET;
49 *CODING_SYSTEM = \%Encode::Charset::CODING_SYSTEM;
50
51 ### --- Perl Encode module common functions
52
53 sub encode ($$;$) {
54 my ($obj, $str, $chk) = @_;
55 $_[1] = '' if $chk;
56 $str = &internal_to_iso2022 ($str);
57 return $str;
58 }
59
60 sub decode ($$;$) {
61 my ($obj, $str, $chk) = @_;
62 $_[1] = '' if $chk;
63 return &iso2022_to_internal ($str);
64 }
65
66 ### --- Encode::ISO2022 unique functions
67 *new_object = \&Encode::Charset::new_object;
68
69 sub iso2022_to_internal ($;%) {
70 my ($s, $C) = @_;
71 $C ||= &new_object;
72 my $t = '';
73 $s =~ s{^((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)}{
74 my $i2 = $1;
75 $t = _iso2022_to_internal ($i2, $C);
76 '';
77 }es;
78 my $pad = '';
79 use re 'eval';
80 $s =~ s{
81 ## ISO/IEC 2022
82 (??{"$pad\x1B$pad\x25$pad\x40"})((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
83 ## UTF-8
84 |(??{"$pad\x1B$pad\x25$pad(?:\x47|\x2F$pad"."[\x47-\x49])"})
85 ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
86 ## UCS-2, UTF-16
87 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x40\x43\x45\x4A-\x4C])
88 ((?:(?!\x00\x1B\x00\x25(?:\x00\x2F)?\x00[\x30-\x7E])..)*)
89 ## UCS-4
90 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})[\x41\x44\x46]
91 ((?:(?!\x00\x00\x00\x1B\x00\x00\x00\x25(?:\x00\x00\x00\x2F)?
92 \x00\x00\x00[\x30-\x7E])....)*)
93 ## with standard return
94 |(??{"$pad\x1B$pad\x25$pad"})([\x30-\x7E])
95 ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
96 ## without standard return
97 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x30-\x7E])(.*)
98 }{
99 my ($i2,$u8,$Fu2,$u2,$u4,$Fsr,$sr,$Fnsr,$nsr) = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
100 my $r = '';
101 if (defined $i2) {
102 $r = _iso2022_to_internal ($i2, $C); $pad = '';
103 } elsif (defined $u8) {
104 $r = Encode::decode ('utf8', $u8); $pad = '';
105 } elsif ($Fu2) {
106 if (ord ($Fu2) > 0x49) {
107 $r = Encode::decode ('utf-16be', $u2);
108 } else {
109 $r = Encode::decode ('ucs-2be', $u2);
110 }
111 $pad = "\x00";
112 } elsif (defined $u4) {
113 $r = Encode::decode ('ucs-4be', $u2); $pad = "\x00\x00\x00";
114 } elsif (defined $Fsr && $CODING_SYSTEM{$Fsr}->{perl_name}) {
115 $r = Encode::decode ($CODING_SYSTEM{$Fsr}->{perl_name}, $sr); $pad = '';
116 } elsif (defined $Fnsr && $CODING_SYSTEM{$Fnsr}->{perl_name}) {
117 $r = Encode::decode ($CODING_SYSTEM{$Fnsr}->{perl_name}, $nsr); $pad = '';
118 } else { ## temporary
119 $r = '?' x length ($sr.$nsr); $pad = '';
120 }
121 $r;
122 }gesx;
123 $t . $s;
124 }
125
126 # this is very very trickey. my perl 5.8.0 does not process
127 # regex with eval except the first time (i think it's a bug
128 # of perl), so we redefine this function whenever being called!
129 # when this unexpected behavior is fixed or someone finds
130 # better way to avoid it, we will rewrite this code.
131 &_iso2022_to_internal (undef);
132 sub _iso2022_to_internal ($;%) {
133 eval q{ sub __iso2022_to_internal ($;%) { 0 } };
134 eval q{
135 sub __iso2022_to_internal ($;%) {
136 use re 'eval';
137 my ($s, $C) = @_;
138 my %_GB_to_GN = (
139 "\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3',
140 "\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3',
141 );
142 my %_CHARS_to_RANGE = (
143 l94 => q/[\x21-\x7E]/, l96 => q/[\x20-\x7F]/,
144 l128 => q/[\x00-\x7F]/, l256 => q/[\x00-\xFF]/,
145 r94 => q/[\xA1-\xFE]/, r96 => q/[\xA0-\xFF]/,
146 r128 => q/[\x80-\xFF]/, r256 => q/[\x80-\xFF]/,
147 b94 => q/[\x21-\x7E\xA1-\xFE]/, b96 => q/[\x20-\x7F\xA0-\xFF]/,
148 b128 => q/[\x00-\xFF]/, b256 => q/[\x00-\xFF]/,
149 );
150
151 $s =~ s{
152 ((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}}
153 . qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ }))
154 |((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GR}}->{chars}}
155 . qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ }))
156 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)')
157 . ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ?
158 qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '')
159 . ($C->{$C->{CL}}->{r_SS2} ? qq/|$C->{$C->{CL}}->{r_SS2}/ : '') . q/)/
160 . ( $C->{$C->{CL}}->{r_LS0}
161 ||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
162 qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
163 })
164 ((??{ $_CHARS_to_RANGE{'b'.$C->{G2}->{chars}}
165 . qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ }))
166 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)')
167 . ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ?
168 qq/|$C->{$C->{ESC_Fe}}->{r_SS3_ESC}/ : '')
169 . ($C->{$C->{CL}}->{r_SS3} ? qq/|$C->{$C->{CL}}->{r_SS3}/ : '') . q/)/
170 . ( $C->{$C->{CL}}->{r_LS0}
171 || $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
172 qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
173 })
174 ((??{ $_CHARS_to_RANGE{'b'.$C->{G3}->{chars}}
175 . qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ }))
176
177 ## Locking shift
178 |( (??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })
179 |(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })
180 )
181
182 ## Control sequence
183 |(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)')
184 .($C->{$C->{ESC_Fe}}->{r_CSI_ESC} ?
185 qq/|$C->{$C->{ESC_Fe}}->{r_CSI_ESC}/: '')
186 .')'
187 })
188 ((??{ qq/[\x30-\x3F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xB0-\xBF]*/
189 .qq/[\x20-\x2F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xA0-\xAF]*/
190 }) [\x40-\x7E\xD0-\xFE])
191
192 ## Other escape sequence
193 |(\x1B[\x20-\x2F]*[\x30-\x7E])
194
195 ## Misc. sequence (SP, control, or broken data)
196 |([\x00-\xFF])
197 }{
198 my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc)
199 = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
200 $C->{_irr} = undef unless defined $esc;
201 ## GL graphic character
202 if (defined $gl) {
203 my $c = 0;
204 my $m = $C->{$C->{GL}}->{chars}==94?0x21:$C->{$C->{GL}}->{chars}==96?0x20:0;
205 for (split //, $gl) {
206 $c = $c * $C->{$C->{GL}}->{chars} + unpack ('C', $_) - $m;
207 }
208 chr ($C->{$C->{GL}}->{ucs} + $c);
209 ## GR graphic character
210 } elsif ($gr) {
211 my $c = 0;
212 my $m = $C->{$C->{GR}}->{chars}==94?0xA1:$C->{$C->{GR}}->{chars}==96?0xA0:0x80;
213 for (split //, $gr) {
214 $c = $c * $C->{$C->{GR}}->{chars} + unpack ('C', $_) - $m;
215 }
216 chr ($C->{$C->{GR}}->{ucs} + $c);
217 ## Control, SP, or broken data
218 ## TODO: support control sets other than ISO/IEC 6429's
219 } elsif (defined $misc) {
220 $misc;
221 ## Graphic character with SS2
222 } elsif ($ss2) {
223 $ss2 =~ tr/\x80-\xFF/\x00-\x7F/;
224 my $c = 0; my $m = $C->{G2}->{chars}==94?0x21:$C->{G2}->{chars}==96?0x20:0;
225 for (split //, $ss2) {
226 $c = $c * $C->{G2}->{chars} + unpack ('C', $_) - $m;
227 }
228 chr ($C->{G2}->{ucs} + $c);
229 ## Graphic character with SS3
230 } elsif ($ss3) {
231 $ss3 =~ tr/\x80-\xFF/\x00-\x7F/;
232 my $c = 0; my $m = $C->{G3}->{chars}==94?0x21:$C->{G3}->{chars}==96?0x20:0;
233 for (split //, $ss3) {
234 $c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m;
235 }
236 chr ($C->{G3}->{ucs} + $c);
237 ## Escape sequence
238 } elsif ($esc) {
239 if ($esc =~ /\x1B\x26([\x40-\x7E])/) { ## 6F (IRR) = ESC 02/06 Ft
240 $C->{_irr} = $1; $esc = '';
241 } else {
242 $esc =~ s{
243 \x1B([\x28-\x2B])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^1
244 |\x1B\x24([\x28-\x2B]?)(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^n
245
246 |\x1B([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^1
247 |\x1B\x24([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^n
248
249 |\x1B([\x40-\x5F]) ## ESC Fe
250
251 |\x1B\x21([\x21-\x23]?[\x30-\x7E]) ## CL = C0
252 |\x1B\x22([\x21-\x23]?[\x30-\x7E]) ## CR & ESC Fe = C1
253
254 |\x1B([\x60-\x7E]) ## Single control functions
255 |\x1B\x23([\x21-\x23]?)([\x30-\x7E])
256
257 |\x1B\x20([\x40-\x7E]) ## Announcer
258 }{
259 my ($g94_g,$g94_f,$g94n_g,$g94n_f,$g96_g,$g96_f,$g96n_g,$g96n_f,$Fe,
260 $CZD, $C1D, $Fs, $sI, $sF,$ACS)
261 = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15);
262 my $rev = $C->{_irr} || '';
263 my $f2s = $C->{option}->{final_to_set};
264 if ($g94_g) { ## ESC 02/08 [I] F
265 $C->{ $_GB_to_GN{ $g94_g } }
266 = $CHARSET{G94}->{ $f2s->{G94}->{$g94_f.$rev} || $g94_f.$rev }
267 || $CHARSET{G94}->{ $f2s->{G94}->{$g94_f} || $g94_f }
268 || $CHARSET{G94}->{ "\x7E" }; '';
269 } elsif (defined $g94n_f) { ## ESC 02/04 [02/08..11] [I] F
270 $C->{ $_GB_to_GN{ $g94n_g } || 'G0' }
271 = $CHARSET{G94n}->{ $f2s->{G94n}->{$g94n_f.$rev} || $g94n_f.$rev }
272 || $CHARSET{G94n}->{ $f2s->{G94n}->{$g94n_f} || $g94n_f }
273 || $CHARSET{G94n}->{ "\x7E" }; '';
274 } elsif ($g96_g) { ## ESC 02/12..15 [I] F
275 $C->{ $_GB_to_GN{ $g96_g } }
276 = $CHARSET{G96}->{ $f2s->{G96}->{$g96_f.$rev} || $g96_f.$rev }
277 || $CHARSET{G96}->{ $f2s->{G96}->{$g96_f} || $g96_f }
278 || $CHARSET{G96}->{ "\x7E" }; '';
279 } elsif ($g96n_g) { ## ESC 02/04 02/12..15 [I] F
280 $C->{ $_GB_to_GN{ $g96n_g } }
281 = $CHARSET{G96n}->{ $f2s->{G96n}->{$g96n_f.$rev} || $g96n_f.$rev }
282 || $CHARSET{G96n}->{ $f2s->{G96n}->{$g96n_f} || $g96n_f }
283 || $CHARSET{G96n}->{ "\x7E" }; '';
284 } elsif ($Fe) { ## ESC Fe = C1
285 chr ($C->{ $C->{ESC_Fe} }->{ucs} + (ord ($Fe) - 0x40));
286 } elsif (defined $Fs) { ## ESC Fs
287 if ($Fs eq "\x6E") { ## LS2
288 $C->{GL} = 'G2'; '';
289 } elsif ($Fs eq "\x6F") { ## LS3
290 $C->{GL} = 'G3'; '';
291 } elsif ($Fs eq "\x7E" || $Fs eq "\x6B") { ## LS1R
292 $C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; '';
293 } elsif ($Fs eq "\x7D" || $Fs eq "\x6C") { ## LS2R
294 $C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; '';
295 } elsif ($Fs eq "\x7C" || $Fs eq "\x6D") { ## LS3R
296 $C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; '';
297 } else {
298 chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60));
299 }
300 } elsif (defined $CZD) { ## 1F (CZD) = ESC 02/01 [I] F
301 $C->{C0} = $CHARSET{C0}->{ $f2s->{C0}->{$CZD.$rev} || $CZD.$rev }
302 || $CHARSET{C0}->{ $f2s->{C0}->{$CZD} || $CZD }
303 || $CHARSET{C0}->{ "\x7E" }; '';
304 } elsif (defined $C1D) { ## 2F (C1D) = ESC 02/02 [I] F
305 $C->{C1} = $CHARSET{C1}->{ $f2s->{C1}->{$C1D.$rev} || $C1D.$rev }
306 || $CHARSET{C1}->{ $f2s->{C1}->{$C1D} || $C1D }
307 || $CHARSET{C1}->{ "\x7E" }; '';
308 } elsif ($sI) { ## 3F = ESC 02/03 [I] F
309 chr ($CHARSET{single_control}->{'3F'.$sI}->{ucs} + (ord ($sF) - 0x30));
310 } elsif ($ACS) { ## 0F (Announcer) = ESC 02/00 F
311 if ($ACS eq "\x4A") { $C->{bit} = 7 }
312 elsif ($ACS eq "\x4B") { $C->{bit} = 8 }
313 '';
314 }
315 }gex;
316 $C->{_irr} = undef;
317 }
318 $esc;
319 } elsif ($ls) { ## Locking shifts = LS0 / LS1
320 if ($ls eq $C->{$C->{CL}}->{LS0}) {
321 $C->{GL} = 'G0'; '';
322 } elsif ($ls eq $C->{$C->{CL}}->{LS1}) {
323 $C->{GL} = 'G1'; '';
324 }
325 } elsif ($csi) { ## Control sequence = CSI [P..] [I] F
326 $csi =~ tr/\xA0-\xFF/\x20-\x7F/d;
327 $csi =~ s/$C->{$C->{CL}}->{LS0}//g if $C->{$C->{CL}}->{LS0};
328 $csi =~ s/$C->{$C->{CL}}->{LS1}//g if $C->{$C->{CL}}->{LS1};
329 "\x9B".$csi;
330 }
331 }gesx;
332 $s;
333 } # __iso2022_to_internal
334
335 };
336 &__iso2022_to_internal (@_) if defined $_[0];
337
338 } # _iso2022_to_internal
339
340 sub internal_to_iso2022 ($;%) {
341 my ($s, $C) = @_;
342 $C ||= &new_object;
343
344 my $r = '';
345 for my $c (split //, $s) {
346 my $cc = ord $c; Encode::_utf8_off ($c);
347 my $t;
348 if ($cc <= 0x1F) {
349 $t = _i2c ($c, $C, type => 'C0', charset => '@');
350 } elsif ($cc == 0x20 || $cc == 0x7F) {
351 $t = _back2ascii ($C) . $c;
352 } elsif ($cc < 0x7F) {
353 $t = _i2g ($c, $C, type => 'G94', charset => 'B');
354 } elsif ($cc <= 0x9F) {
355 $t = _i2c (pack ('C', $cc), $C, type => 'C1', charset_id => '64291991C1',
356 charset => $C->{option}->{private_set}->{XC1}->{'64291991C1'});
357 } elsif ($cc <= 0xFF) {
358 $t = _i2g (pack ('C', $cc-0x80), $C, type => 'G96', charset => 'A');
359 } elsif ($cc <= 0x24FF) {
360 my $c = $cc - 0x100;
361 my $final = $C->{option}->{private_set}->{U96n}->[0];
362 if (length $final) {
363 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
364 type => 'G96n', charset => $final);
365 }
366 } elsif ($cc <= 0x33FF) {
367 my $c = $cc - 0x2500;
368 my $final = $C->{option}->{private_set}->{U96n}->[1];
369 if (length $final) {
370 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
371 type => 'G96n', charset => $final);
372 }
373 } elsif (0xE000 <= $cc && $cc <= 0xFFFF) {
374 my $c = $cc - 0xE000;
375 my $final = $C->{option}->{private_set}->{U96n}->[2];
376 if (length $final) {
377 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
378 type => 'G96n', charset => $final);
379 }
380
381 } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
382 my $c = $cc - 0xE9F6C0;
383 $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
384 type => 'G94n', charset => chr(($c / 8836)+0x30));
385 } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
386 my $c = $cc - 0xF49D7C;
387 $t = _i2g (chr(($c / 94)+0x21).chr(($c % 94)+0x21), $C,
388 type => 'G94n', charset => 'B', revision => '@');
389
390 } elsif (0xF0000 <= $cc && $cc <= 0x10F7FF) {
391 my $c = $cc - 0xF0000;
392 $t = _i2g (chr((($c % 9216) / 96)+0x20).chr(($c % 96)+0x20), $C,
393 type => 'G96n', charset => "\x20".chr(($c / 9216)+0x40));
394 } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
395 my $c = $cc - 0xE90940;
396 $t = _i2g (chr(($c % 94)+0x21), $C,
397 type => 'G94', charset => chr(($c / 94)+0x30));
398 } elsif (0xE92642 <= $cc && $cc <= 0xE9269F) {
399 my $c = $cc - 0xE92642;
400 $t = _i2g (chr($c+0x21),$C,type => 'G94', charset => "\x64", revision => '@');
401 } elsif (0xE926A0 <= $cc && $cc <= 0xE9443F) {
402 my $c = $cc - 0xE926A0;
403 $t = _i2g (chr(($c % 96)+0x20), $C,
404 type => 'G96', charset => chr(($c / 96)+0x30));
405 } elsif (0xE944A0 <= $cc && $cc <= 0xE961A1) {
406 my $c = $cc - 0xE944A0;
407 $t = _i2g (chr(($c % 94)+0x21), $C,
408 type => 'G94', charset => '!'.chr(($c / 94)+0x30));
409 } elsif (0xE96200 <= $cc && $cc <= 0xE97F9F) {
410 my $c = $cc - 0xE96200;
411 $t = _i2g (chr(($c % 96)+0x20), $C,
412 type => 'G96', charset => '!'.chr(($c / 96)+0x30));
413 } elsif (0xE98000 <= $cc && $cc <= 0xE99D01) {
414 my $c = $cc - 0xE98000;
415 $t = _i2g (chr(($c % 94)+0x21), $C,
416 type => 'G94', charset => '"'.chr(($c / 94)+0x30));
417 } elsif (0xE99D60 <= $cc && $cc <= 0xE9BAFF) {
418 my $c = $cc - 0xE99D60;
419 $t = _i2g (chr(($c % 96)+0x20), $C,
420 type => 'G96', charset => '"'.chr(($c / 96)+0x30));
421 } elsif (0xE9BB60 <= $cc && $cc <= 0xE9D861) {
422 my $c = $cc - 0xE9BB60;
423 $t = _i2g (chr(($c % 94)+0x21), $C,
424 type => 'G94', charset => '#'.chr(($c / 94)+0x30));
425 } elsif (0xE9D8C0 <= $cc && $cc <= 0xE9F65F) {
426 my $c = $cc - 0xE9D8C0;
427 $t = _i2g (chr(($c % 96)+0x20), $C,
428 type => 'G96', charset => '#'.chr(($c / 96)+0x30));
429 } elsif (0x70090940 <= $cc && $cc <= 0x70092641) {
430 my $c = $cc - 0x70090940;
431 $t = _i2g (chr(($c % 94)+0x21), $C,
432 type => 'G94', charset => "\x20".chr(($c / 94)+0x30));
433 } elsif (0x700926A0 <= $cc && $cc <= 0x7009443F) {
434 my $c = $cc - 0x700926A0;
435 $t = _i2g (chr(($c % 96)+0x20), $C,
436 type => 'G96', charset => "\x20".chr(($c / 96)+0x30));
437 ## TODO: DRCS with I byte: U+700944A0-U+7009F6BF
438 } elsif (0x70400000 <= $cc && $cc <= 0x7040FFED) {
439 my $c = $cc - 0x70400000;
440 $t = _i2g (chr(($c % 94)+0x21), $C, charset_id => 'P'.int ($c / 94),
441 type => 'G94', charset => $C->{option}->{private_set}->{G94}->[ $c / 94 ]);
442 } elsif (0x70410000 <= $cc && $cc <= 0x7041FFBF) {
443 my $c = $cc - 0x70410000;
444 $t = _i2g (chr(($c % 96)+0x20), $C, charset_id => 'P'.int ($c / 96),
445 type => 'G96', charset => $C->{option}->{private_set}->{G96}->[ $c / 96 ]);
446 } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
447 my $c = $cc % 0x10000;
448 $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
449 type => 'G94n',
450 charset_id => 'P'.int(($cc / 0x10000) - 0x7042).'_'.int($c / 8836),
451 charset => $C->{option}->{private_set}->{G94n}
452 ->[ ($cc / 0x10000) - 0x7042 ]->[ $c / 8836 ]);
453 }
454 if (defined $t) {
455 $t = _i2o ($t, $C, cs_F => "\x40")
456 if $C->{coding_system} ne $CODING_SYSTEM{"\x40"};
457 } else {
458 my $F; my @F = qw~G /G /H /I B /A /D /F~;
459 push @F, qw~/J /K /L~ if $cc <= 0x10FFFF;
460 push @F, qw~/@ /C /E~ if $cc <= 0xFFFF;
461 for (@F) {
462 if (defined $C->{option}->{designate_to}->{coding_system}->{$_}
463 && $C->{option}->{designate_to}->{coding_system}->{$_} > -1) {
464 $F = $_; last;
465 } elsif ($C->{option}->{designate_to}->{coding_system}->{default} > -1) {
466 $F = $_; last;
467 }
468 }
469 $t = _i2o ($c, $C, cs_F => $F) if $F;
470 }
471 if (defined $t) {
472 $r .= $t;
473 } else {
474 unless ($C->{option}->{undef_char}->[0] eq "\x20") {
475 $t = _i2g ($C->{option}->{undef_char}->[0], $C,
476 %{ $C->{option}->{undef_char}->[1] });
477 } else { ## SP
478 $t = _back2ascii ($C) . "\x20";
479 }
480 $r .= $C->{coding_system} eq $CODING_SYSTEM{"\x40"} ?
481 $t : _i2o ($t, $C, cs_F => "\x40");
482 }
483 }
484 $r . _back2ascii ($C);
485 }
486
487 ## $O{charset} eq undef means that charset is same as the current designated one.
488 sub _i2c ($%%) {
489 my ($s, $C, %O) = @_;
490 my $r = '';
491 if ($O{type} eq 'C0') {
492 if (defined $O{charset}) {
493 if ( $C->{C0} ne $CHARSET{C0}->{$O{charset}}
494 && $C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
495 for ($C->{option}->{designate_to}->{C0}->{$O{charset}},
496 $C->{option}->{designate_to}->{C0}->{default}) {
497 if (defined $_) { return undef if $_ == -1; last }
498 }
499 $r .= "\x1B\x21".$O{charset};
500 $C->{C0} = $CHARSET{C0}->{$O{charset}};
501 }
502 } elsif (defined $O{charset_id}) {
503 if ($C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
504 return undef; ## Control set is not designated nor has F byte
505 }
506 }
507 $r .= _back2ascii ($C, reset_all => $C->{C0}->{reset_all}->{$s});
508 return $r . $s;
509 } elsif ($O{type} eq 'C1') {
510 if (defined $O{charset}) {
511 if ( $C->{C1} ne $CHARSET{C1}->{$O{charset}}
512 && $C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
513 for ($C->{option}->{designate_to}->{C1}->{$O{charset}},
514 $C->{option}->{designate_to}->{C1}->{default}) {
515 if (defined $_) { return undef if $_ == -1; last }
516 }
517 $r .= "\x1B\x22".$O{charset};
518 $C->{C1} = $CHARSET{C1}->{$O{charset}};
519 }
520 } elsif (defined $O{charset_id}) {
521 if ($C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
522 return undef; ## Control set is not designated nor has F byte
523 }
524 }
525 $r .= _back2ascii ($C, reset_all => $C->{C1}->{reset_all}->{$s});
526 unless ($C->{option}->{C1invoke_to_right}) { ## ESC Fe
527 $s =~ s/([\x80-\x9F])/"\x1B" . pack ('C', ord ($1) - 0x40)/ge;
528 }
529 return $r . $s;
530 }
531 }
532 sub _i2g ($%%) {
533 my ($s, $C, %O) = @_;
534 my $r = '';
535 my $set = $CHARSET{$O{type}}->{$O{charset}.
536 ($O{revision}&&$C->{option}->{use_revision}?$O{revision}:'')};
537 my $set0 = $CHARSET{$O{type}}->{$O{charset_id}};
538 ## -- designate character set
539 my $G = 0;
540 if ($C->{G0} eq $set || $C->{G0} eq $set0) { $G = 0 }
541 elsif ($C->{G1} eq $set || $C->{G1} eq $set0) { $G = 1 }
542 elsif ($C->{G2} eq $set || $C->{G2} eq $set0) { $G = 2 }
543 elsif ($C->{G3} eq $set || $C->{G3} eq $set0) { $G = 3 }
544 else {
545 return undef unless $set; ## charset does not have F byte
546 $G = 1 if $O{type} eq 'G96' || $O{type} eq 'G96n';
547 for ($C->{option}->{designate_to}->{$O{type}}->{$O{charset}},
548 $C->{option}->{designate_to}->{$O{type}}->{default}) {
549 if (defined $_) {
550 $G = $_; last;
551 }
552 }
553 if ($G == -1) {
554 return undef;
555 }
556 if ($O{type} eq 'G94') {
557 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
558 ."\x1B".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
559 } elsif ($O{type} eq 'G94n') {
560 if ($G == 0 && !$C->{option}->{G94n_designate_long}
561 && ($O{charset} eq '@' || $O{charset} eq 'A' || $O{charset} eq 'B')) {
562 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
563 ."\x1B\x24".$O{charset};
564 } else {
565 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
566 ."\x1B\x24".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
567 }
568 } elsif ($O{type} eq 'G96') {
569 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
570 ."\x1B".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
571 } elsif ($O{type} eq 'G96n') {
572 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
573 ."\x1B\x24".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
574 }
575 $C->{'G'.$G} = $CHARSET{$O{type}}->{$O{charset}};
576 }
577 ## -- invoke G buffer
578 my $left = $C->{option}->{Ginvoke_to_left}->[$G];
579 if ($C->{GL} eq 'G'.$G) {
580 $left = 1;
581 } elsif ($C->{GR} eq 'G'.$G) {
582 $left = 0;
583 } else {
584 if ($C->{option}->{Ginvoke_by_single_shift}->[$G]) {
585 if ($C->{C1}->{'C_SS'.$G}) {
586 $r .= _i2c ($C->{C1}->{'C_SS'.$G}, $C, type => 'C1') || return undef;
587 } elsif ($C->{C0}->{'C_SS'.$G}) {
588 $r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef;
589 } else { ## Both C0 and C1 set do not have SS2/3.
590 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
591 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
592 }
593 } else {
594 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
595 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
596 }
597 }
598 $s =~ tr/\x00-\x7F/\x80-\xFF/ unless $left;
599 $r . $s;
600 }
601 sub _back2ascii (%) {
602 my ($C, %O) = @_;
603 my $r = '';
604 if ($C->{option}->{reset}->{Gdesignation}) {
605 my $F = $C->{option}->{reset}->{Gdesignation}; # \x42
606 $r .= "\x1B\x28".$F unless $C->{G0} eq $CHARSET{G94}->{$F};
607 $C->{G0} = $CHARSET{G94}->{$F};
608 if ($O{reset_all}) {
609 $C->{G1} = $CHARSET{G94}->{"\x7E"};
610 $C->{G2} = $CHARSET{G94}->{"\x7E"};
611 $C->{G3} = $CHARSET{G94}->{"\x7E"};
612 }
613 }
614 if ($C->{option}->{reset}->{Ginvoke}) {
615 if ($C->{GL} ne 'G0') {
616 $r .= $C->{C0}->{C_LS0} || ($C->{C0} = $CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F");
617 $C->{GL} = 'G0';
618 }
619 $C->{GR} = undef if $O{reset_all};
620 }
621 $r;
622 }
623 ## __invoke (\%C, $G, $left_or_right)
624 sub __invoke (\%$$) {
625 my ($C, $G) = @_;
626 if ($_[2]) {
627 $C->{GL} = 'G'.$G;
628 return ($C->{C0}->{C_LS0}
629 || scalar ($C->{C0}=$CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F"),
630 $C->{C0}->{C_LS1}, "\x1B\x6E", "\x1B\x6F")[$G];
631 } else {
632 $C->{GR} = 'G'.$G;
633 return ("", "\x1B\x7E", "\x1B\x7D", "\x1B\x7C")[$G];
634 }
635 '';
636 }
637 sub _i2o ($\%%) {
638 my ($s, $C, %O) = @_;
639 my $CS = $CODING_SYSTEM{ $O{cs_F} } || $CODING_SYSTEM{ $O{cs_id} } || return undef;
640 my $r = '';
641 if ($CS ne $C->{coding_system}) {
642 my $e = '';
643 $e .= "\x1B\x25";
644 $e .= $O{cs_F} || $C->{option}->{private_set}->{coding_system}->{ $O{cs_id} }
645 || return undef;
646 if ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x40"}
647 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x43"}
648 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x45"}
649 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4A"}
650 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4B"}
651 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4C"}) {
652 $e =~ s/(.)/\x00$1/go;
653 } elsif ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x41"}
654 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x44"}
655 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x46"}) {
656 $e =~ s/(.)/\x00\x00\x00$1/go;
657 }
658 $r .= $e;
659 $C->{coding_system} = $CS;
660 if ($CS->{reset_state}) {
661 $C->{GL} = undef; $C->{GR} = undef;
662 $C->{C0} = $CHARSET{C0}->{"\x7E"};
663 $C->{C1} = $CHARSET{C1}->{"\x7E"};
664 $C->{G0} = $CHARSET{G94}->{"\x7E"};
665 $C->{G1} = $CHARSET{G94}->{"\x7E"};
666 $C->{G2} = $CHARSET{G94}->{"\x7E"};
667 $C->{G3} = $CHARSET{G94}->{"\x7E"};
668 }
669 }
670 if ($CS eq $CODING_SYSTEM{"\x40"}) {
671 #
672 } elsif ($CS eq $CODING_SYSTEM{G} || $CS eq $CODING_SYSTEM{'/G'}
673 || $CS eq $CODING_SYSTEM{'/H'} || $CS eq $CODING_SYSTEM{'/I'}) {
674 Encode::_utf8_off ($s);
675 } elsif ($CS eq $CODING_SYSTEM{'/@'} || $CS eq $CODING_SYSTEM{'/C'}
676 || $CS eq $CODING_SYSTEM{'/E'}) {
677 $s = Encode::encode ('ucs-2be', $s);
678 } elsif ($CS eq $CODING_SYSTEM{'/A'} || $CS eq $CODING_SYSTEM{'/D'}
679 || $CS eq $CODING_SYSTEM{'/F'}) {
680 $s = Encode::encode ('ucs-4be', $s);
681 } elsif ($CS eq $CODING_SYSTEM{'/J'} || $CS eq $CODING_SYSTEM{'/K'}
682 || $CS eq $CODING_SYSTEM{'/L'}) {
683 $s = Encode::encode ('UTF-16BE', $s);
684 } elsif ($CS eq $CODING_SYSTEM{B}) {
685 $s = Encode::encode ('utf-1', $s);
686 } else {
687 return undef;
688 }
689 $r . $s;
690 }
691
692 1;
693 __END__
694
695 =head1 SEE ALSO
696
697 ISO/IEC 646:1991, "7-bit coded graphic character set for intormation interchange".
698
699 ISO/IEC 2022:1994, "Character Code Structure and Extension Techniques".
700 (IDT with ECMA 35, JIS X 0202:1998)
701
702 ISO/IEC 4873:1991, "8-Bit Coded Character Set Structure and Rules".
703 (IDT with ECMA 43)
704
705 ISO/IEC 6429:1992, "Control Functions for Coded Character Sets".
706 (IDT with ECMA 48:1991, JIS X 0211:1998)
707
708 ISO/IEC 8859, "8-Bit Single-Byte Coded Graphic Character Sets".
709
710 Encode, perlunicode
711
712 =head1 TODO
713
714 =over 4
715
716 =item NCR (coding system other than ISO/IEC 2022) support
717
718 =over 2
719
720 =item ESC 02/05 02/15 03/x of X Compound Text
721
722 =back
723
724 =item Output of control character sets, single control functions
725
726 =item Designation sequence of control character sets (input)
727
728 =item Special graphic character sets such as G3 of EUC-TW
729
730 =item SUPER SHIFT (SS) invoke function of old control character set
731
732 =item Safe transparent of control string (ISO/IEC 6429)
733
734 =item Output of unoutputable characters as alternative notation such as SGML-like entity
735
736 =item C0 set invoked to CR area like ISIRI code
737
738 Really need?
739
740 =item special treatment of 0x20, 0x7E, 0xA0, 0xFF
741
742 For example, GB mongolian sets use MSP (MONGOLIAN SPACE)
743 with these code positions.
744
745 And, no less coding systems does not use (or does ban using) DEL.
746
747 =item A lot of character sets don't have pseudo-UCS mapping.
748
749 Most of 9m^n (n >= 3) sets, 9m^n sets with I byte, 9m^n
750 DRCSes do not have pseudo-UCS mapping area. It is
751 questionable to allocate lots of code positions to these
752 rarely-(or no-)used character sets.
753
754 =item Even character sets that have pseudo-UCS mapping, some of them can't be outputed in ISO/IEC 2022.
755
756 Because output of rarely-used character sets is
757 not implemented yet.
758
759 =back
760
761 =head1 AUTHORS
762
763 Nanashi-san
764
765 Wakaba <w@suika.fam.cx>
766
767 =head1 LICENSE
768
769 Copyright 2002 AUTHORS
770
771 This library is free software; you can redistribute it
772 and/or modify it under the same terms as Perl itself.
773
774 =cut
775
776 # $Date: 2002/10/16 10:39:35 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24