/[suikacvs]/perl/lib/Encode/ISO2022.pm
Suika

Contents of /perl/lib/Encode/ISO2022.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.15 - (show annotations) (download)
Wed Dec 18 12:57:40 2002 UTC (21 years, 5 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.14: +11 -9 lines
Pictogram charsets support

1
2 =head1 NAME
3
4 Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder
5
6 =head1 ENCODINGS
7
8 =over 4
9
10 =item iso2022
11
12 ISO/IEC 2022:1994. Default status is:
13
14 =over 2
15
16 =item CL = C0 = ISO/IEC 6429:1991 C0 set
17
18 =item CR = C1 = ISO/IEC 6429:1991 C1 set
19
20 =item GL = G0 = ISO/IEC 646:1991 IRV GL(G0) set
21
22 =item GR = G1 = empty set
23
24 =item G2 = empty set
25
26 =item G3 = empty set
27
28 =back
29
30 (Alias: iso/iec2022, iso-2022, 2022, cp2022)
31
32 =back
33
34 Note that ISO/IEC 2022 based encodings are found in
35 Encode::ISO2022::* modules. This module, Encode::ISO2022
36 only provides a general ISO/IEC 2022 encoder/decoder.
37
38 =cut
39
40 require v5.7.3;
41 package Encode::ISO2022;
42 use strict;
43 use vars qw(%CHARSET %CODING_SYSTEM $VERSION);
44 $VERSION=do{my @r=(q$Revision: 1.14 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
45 use base qw(Encode::Encoding);
46 __PACKAGE__->Define (qw!iso-2022 iso/iec2022 iso2022 2022 cp2022!);
47 require Encode::Charset;
48 *CHARSET = \%Encode::Charset::CHARSET;
49 *CODING_SYSTEM = \%Encode::Charset::CODING_SYSTEM;
50
51 ### --- Perl Encode module common functions
52
53 sub encode ($$;$) {
54 my ($obj, $str, $chk) = @_;
55 $_[1] = '' if $chk;
56 $str = &internal_to_iso2022 ($str);
57 return $str;
58 }
59
60 sub decode ($$;$) {
61 my ($obj, $str, $chk) = @_;
62 $_[1] = '' if $chk;
63 return &iso2022_to_internal ($str);
64 }
65
66 ### --- Encode::ISO2022 unique functions
67 *new_object = \&Encode::Charset::new_object;
68
69 sub iso2022_to_internal ($;%) {
70 my ($s, $C) = @_;
71 $C ||= &new_object;
72 my $t = '';
73 $s =~ s{^((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)}{
74 my $i2 = $1;
75 $t = _iso2022_to_internal ($i2, $C);
76 '';
77 }es;
78 my $pad = '';
79 use re 'eval';
80 $s =~ s{
81 ## ISO/IEC 2022
82 (??{"$pad\x1B$pad\x25$pad\x40"})((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
83 ## UTF-8
84 |(??{"$pad\x1B$pad\x25$pad(?:\x47|\x2F$pad"."[\x47-\x49])"})
85 ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
86 ## UCS-2, UTF-16
87 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x40\x43\x45\x4A-\x4C])
88 ((?:(?!\x00\x1B\x00\x25(?:\x00\x2F)?\x00[\x30-\x7E])..)*)
89 ## UCS-4
90 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})[\x41\x44\x46]
91 ((?:(?!\x00\x00\x00\x1B\x00\x00\x00\x25(?:\x00\x00\x00\x2F)?
92 \x00\x00\x00[\x30-\x7E])....)*)
93 ## with standard return
94 |(??{"$pad\x1B$pad\x25$pad"})([\x30-\x7E])
95 ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
96 ## without standard return
97 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x30-\x7E])(.*)
98 }{
99 my ($i2,$u8,$Fu2,$u2,$u4,$Fsr,$sr,$Fnsr,$nsr) = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
100 my $r = '';
101 if (defined $i2) {
102 $r = _iso2022_to_internal ($i2, $C); $pad = '';
103 } elsif (defined $u8) {
104 $r = Encode::decode ('utf8', $u8); $pad = '';
105 } elsif ($Fu2) {
106 if (ord ($Fu2) > 0x49) {
107 $r = Encode::decode ('utf-16be', $u2);
108 } else {
109 $r = Encode::decode ('ucs-2be', $u2);
110 }
111 $pad = "\x00";
112 } elsif (defined $u4) {
113 $r = Encode::decode ('ucs-4be', $u2); $pad = "\x00\x00\x00";
114 } elsif (defined $Fsr && $CODING_SYSTEM{$Fsr}->{perl_name}) {
115 $r = Encode::decode ($CODING_SYSTEM{$Fsr}->{perl_name}, $sr); $pad = '';
116 } elsif (defined $Fnsr && $CODING_SYSTEM{$Fnsr}->{perl_name}) {
117 $r = Encode::decode ($CODING_SYSTEM{$Fnsr}->{perl_name}, $nsr); $pad = '';
118 } else { ## temporary
119 $r = '?' x length ($sr.$nsr); $pad = '';
120 }
121 $r;
122 }gesx;
123 $t . $s;
124 }
125
126 # this is very very trickey. my perl 5.8.0 does not process
127 # regex with eval except the first time (i think it's a bug
128 # of perl), so we redefine this function whenever being called!
129 # when this unexpected behavior is fixed or someone finds
130 # better way to avoid it, we will rewrite this code.
131 &_iso2022_to_internal (undef);
132 sub _iso2022_to_internal ($;%) {
133 eval q{ sub __iso2022_to_internal ($;%) { 0 } };
134 eval q{
135 sub __iso2022_to_internal ($;%) {
136 use re 'eval';
137 my ($s, $C) = @_;
138 my %_GB_to_GN = (
139 "\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3',
140 "\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3',
141 );
142 my %_CHARS_to_RANGE = (
143 l94 => q/[\x21-\x7E]/, l96 => q/[\x20-\x7F]/,
144 l128 => q/[\x00-\x7F]/, l256 => q/[\x00-\xFF]/,
145 r94 => q/[\xA1-\xFE]/, r96 => q/[\xA0-\xFF]/,
146 r128 => q/[\x80-\xFF]/, r256 => q/[\x80-\xFF]/,
147 b94 => q/[\x21-\x7E\xA1-\xFE]/, b96 => q/[\x20-\x7F\xA0-\xFF]/,
148 b128 => q/[\x00-\xFF]/, b256 => q/[\x00-\xFF]/,
149 );
150
151 $s =~ s{
152 ((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}}
153 . qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ }))
154 |((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GR}}->{chars}}
155 . qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ }))
156 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)')
157 . ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ?
158 qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '')
159 . ($C->{$C->{CL}}->{r_SS2} ? qq/|$C->{$C->{CL}}->{r_SS2}/ : '') . q/)/
160 . ( $C->{$C->{CL}}->{r_LS0}
161 ||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
162 qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
163 })
164 ((??{ $_CHARS_to_RANGE{'b'.$C->{G2}->{chars}}
165 . qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ }))
166 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)')
167 . ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ?
168 qq/|$C->{$C->{ESC_Fe}}->{r_SS3_ESC}/ : '')
169 . ($C->{$C->{CL}}->{r_SS3} ? qq/|$C->{$C->{CL}}->{r_SS3}/ : '') . q/)/
170 . ( $C->{$C->{CL}}->{r_LS0}
171 || $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
172 qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
173 })
174 ((??{ $_CHARS_to_RANGE{'b'.$C->{G3}->{chars}}
175 . qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ }))
176
177 ## Locking shift
178 |( (??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })
179 |(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })
180 )
181
182 ## Control sequence
183 |(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)')
184 .($C->{$C->{ESC_Fe}}->{r_CSI_ESC} ?
185 qq/|$C->{$C->{ESC_Fe}}->{r_CSI_ESC}/: '')
186 .')'
187 })
188 ((??{ qq/[\x30-\x3F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xB0-\xBF]*/
189 .qq/[\x20-\x2F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xA0-\xAF]*/
190 }) [\x40-\x7E\xD0-\xFE])
191
192 ## Other escape sequence
193 |(\x1B[\x20-\x2F]*[\x30-\x7E])
194
195 ## Misc. sequence (SP, control, or broken data)
196 |([\x00-\xFF])
197 }{
198 my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc)
199 = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
200 $C->{_irr} = undef unless defined $esc;
201 ## GL graphic character
202 if (defined $gl) {
203 my $c = 0;
204 my $m = $C->{$C->{GL}}->{chars}==94?0x21:$C->{$C->{GL}}->{chars}==96?0x20:0;
205 for (split //, $gl) {
206 $c = $c * $C->{$C->{GL}}->{chars} + unpack ('C', $_) - $m;
207 }
208 chr ($C->{$C->{GL}}->{ucs} + $c);
209 ## GR graphic character
210 } elsif ($gr) {
211 my $c = 0;
212 my $m = $C->{$C->{GR}}->{chars}==94?0xA1:$C->{$C->{GR}}->{chars}==96?0xA0:0x80;
213 for (split //, $gr) {
214 $c = $c * $C->{$C->{GR}}->{chars} + unpack ('C', $_) - $m;
215 }
216 chr ($C->{$C->{GR}}->{ucs} + $c);
217 ## Control, SP, or broken data
218 ## TODO: support control sets other than ISO/IEC 6429's
219 } elsif (defined $misc) {
220 $misc;
221 ## Graphic character with SS2
222 } elsif ($ss2) {
223 $ss2 =~ tr/\x80-\xFF/\x00-\x7F/;
224 my $c = 0; my $m = $C->{G2}->{chars}==94?0x21:$C->{G2}->{chars}==96?0x20:0;
225 for (split //, $ss2) {
226 $c = $c * $C->{G2}->{chars} + unpack ('C', $_) - $m;
227 }
228 chr ($C->{G2}->{ucs} + $c);
229 ## Graphic character with SS3
230 } elsif ($ss3) {
231 $ss3 =~ tr/\x80-\xFF/\x00-\x7F/;
232 my $c = 0; my $m = $C->{G3}->{chars}==94?0x21:$C->{G3}->{chars}==96?0x20:0;
233 for (split //, $ss3) {
234 $c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m;
235 }
236 chr ($C->{G3}->{ucs} + $c);
237 ## Escape sequence
238 } elsif ($esc) {
239 if ($esc =~ /\x1B\x26([\x40-\x7E])/) { ## 6F (IRR) = ESC 02/06 Ft
240 $C->{_irr} = $1; $esc = '';
241 } else {
242 $esc =~ s{
243 \x1B([\x28-\x2B])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^1
244 |\x1B\x24([\x28-\x2B]?)(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^n
245
246 |\x1B([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^1
247 |\x1B\x24([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^n
248
249 |\x1B([\x40-\x5F]) ## ESC Fe
250
251 |\x1B\x21([\x21-\x23]?[\x30-\x7E]) ## CL = C0
252 |\x1B\x22([\x21-\x23]?[\x30-\x7E]) ## CR & ESC Fe = C1
253
254 |\x1B([\x60-\x7E]) ## Single control functions
255 |\x1B\x23([\x21-\x23]?)([\x30-\x7E])
256
257 |\x1B\x20([\x40-\x7E]) ## Announcer
258 }{
259 my ($g94_g,$g94_f,$g94n_g,$g94n_f,$g96_g,$g96_f,$g96n_g,$g96n_f,$Fe,
260 $CZD, $C1D, $Fs, $sI, $sF,$ACS)
261 = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15);
262 my $rev = $C->{_irr} || '';
263 my $f2s = $C->{option}->{final_to_set};
264 if ($g94_g) { ## ESC 02/08 [I] F
265 $C->{ $_GB_to_GN{ $g94_g } }
266 = $CHARSET{G94}->{ $f2s->{G94}->{$g94_f.$rev} || $g94_f.$rev }
267 || $CHARSET{G94}->{ $f2s->{G94}->{$g94_f} || $g94_f }
268 || $CHARSET{G94}->{ "\x7E" }; '';
269 } elsif (defined $g94n_f) { ## ESC 02/04 [02/08..11] [I] F
270 $C->{ $_GB_to_GN{ $g94n_g } || 'G0' }
271 = $CHARSET{G94n}->{ $f2s->{G94n}->{$g94n_f.$rev} || $g94n_f.$rev }
272 || $CHARSET{G94n}->{ $f2s->{G94n}->{$g94n_f} || $g94n_f }
273 || $CHARSET{G94n}->{ "\x7E" }; '';
274 } elsif ($g96_g) { ## ESC 02/12..15 [I] F
275 $C->{ $_GB_to_GN{ $g96_g } }
276 = $CHARSET{G96}->{ $f2s->{G96}->{$g96_f.$rev} || $g96_f.$rev }
277 || $CHARSET{G96}->{ $f2s->{G96}->{$g96_f} || $g96_f }
278 || $CHARSET{G96}->{ "\x7E" }; '';
279 } elsif ($g96n_g) { ## ESC 02/04 02/12..15 [I] F
280 $C->{ $_GB_to_GN{ $g96n_g } }
281 = $CHARSET{G96n}->{ $f2s->{G96n}->{$g96n_f.$rev} || $g96n_f.$rev }
282 || $CHARSET{G96n}->{ $f2s->{G96n}->{$g96n_f} || $g96n_f }
283 || $CHARSET{G96n}->{ "\x7E" }; '';
284 } elsif ($Fe) { ## ESC Fe = C1
285 chr ($C->{ $C->{ESC_Fe} }->{ucs} + (ord ($Fe) - 0x40));
286 } elsif (defined $Fs) { ## ESC Fs
287 if ($Fs eq "\x6E") { ## LS2
288 $C->{GL} = 'G2'; '';
289 } elsif ($Fs eq "\x6F") { ## LS3
290 $C->{GL} = 'G3'; '';
291 } elsif ($Fs eq "\x7E" || $Fs eq "\x6B") { ## LS1R
292 $C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; '';
293 } elsif ($Fs eq "\x7D" || $Fs eq "\x6C") { ## LS2R
294 $C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; '';
295 } elsif ($Fs eq "\x7C" || $Fs eq "\x6D") { ## LS3R
296 $C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; '';
297 } else {
298 chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60));
299 }
300 } elsif (defined $CZD) { ## 1F (CZD) = ESC 02/01 [I] F
301 $C->{C0} = $CHARSET{C0}->{ $f2s->{C0}->{$CZD.$rev} || $CZD.$rev }
302 || $CHARSET{C0}->{ $f2s->{C0}->{$CZD} || $CZD }
303 || $CHARSET{C0}->{ "\x7E" }; '';
304 } elsif (defined $C1D) { ## 2F (C1D) = ESC 02/02 [I] F
305 $C->{C1} = $CHARSET{C1}->{ $f2s->{C1}->{$C1D.$rev} || $C1D.$rev }
306 || $CHARSET{C1}->{ $f2s->{C1}->{$C1D} || $C1D }
307 || $CHARSET{C1}->{ "\x7E" }; '';
308 } elsif ($sI) { ## 3F = ESC 02/03 [I] F
309 chr ($CHARSET{single_control}->{'3F'.$sI}->{ucs} + (ord ($sF) - 0x30));
310 } elsif ($ACS) { ## 0F (Announcer) = ESC 02/00 F
311 if ($ACS eq "\x4A") { $C->{bit} = 7 }
312 elsif ($ACS eq "\x4B") { $C->{bit} = 8 }
313 '';
314 }
315 }gex;
316 $C->{_irr} = undef;
317 }
318 $esc;
319 } elsif ($ls) { ## Locking shifts = LS0 / LS1
320 if ($ls eq $C->{$C->{CL}}->{LS0}) {
321 $C->{GL} = 'G0'; '';
322 } elsif ($ls eq $C->{$C->{CL}}->{LS1}) {
323 $C->{GL} = 'G1'; '';
324 }
325 } elsif ($csi) { ## Control sequence = CSI [P..] [I] F
326 $csi =~ tr/\xA0-\xFF/\x20-\x7F/d;
327 $csi =~ s/$C->{$C->{CL}}->{LS0}//g if $C->{$C->{CL}}->{LS0};
328 $csi =~ s/$C->{$C->{CL}}->{LS1}//g if $C->{$C->{CL}}->{LS1};
329 "\x9B".$csi;
330 }
331 }gesx;
332 $s;
333 } # __iso2022_to_internal
334
335 };
336 &__iso2022_to_internal (@_) if defined $_[0];
337
338 } # _iso2022_to_internal
339
340 sub internal_to_iso2022 ($;%) {
341 my ($s, $C) = @_;
342 $C ||= &new_object;
343
344 my $r = '';
345 my @c = split //, $s;
346 for my $i (0..$#c) {
347 my $c = $c[$i]; my $cc = ord $c; Encode::_utf8_off ($c);
348 my $t;
349 if ($cc <= 0x1F) {
350 $t = _i2c ($c, $C, type => 'C0', charset => '@');
351 } elsif ($cc == 0x20 || $cc == 0x7F) {
352 $t = _back2ascii ($C) . $c;
353 } elsif ($cc < 0x7F) {
354 $t = _i2g ($c, $C, type => 'G94', charset => 'B');
355 } elsif ($cc <= 0x9F) {
356 $t = _i2c (pack ('C', $cc), $C, type => 'C1', charset_id => '64291991C1',
357 charset => $C->{option}->{private_set}->{XC1}->{'64291991C1'});
358 } elsif ($cc <= 0xFF) {
359 $t = _i2g (pack ('C', $cc-0x80), $C, type => 'G96', charset => 'A');
360 } elsif ($cc <= 0x24FF) {
361 my $c = $cc - 0x100;
362 my $final = $C->{option}->{private_set}->{U96n}->[0];
363 if (length $final) {
364 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
365 type => 'G96n', charset => $final);
366 }
367 } elsif ($cc <= 0x33FF) {
368 my $c = $cc - 0x2500;
369 my $final = $C->{option}->{private_set}->{U96n}->[1];
370 if (length $final) {
371 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
372 type => 'G96n', charset => $final);
373 }
374 } elsif (0xE000 <= $cc && $cc <= 0xFFFF) {
375 my $c = $cc - 0xE000;
376 my $final = $C->{option}->{private_set}->{U96n}->[2];
377 if (length $final) {
378 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
379 type => 'G96n', charset => $final);
380 }
381
382 } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
383 my $c = $cc - 0xE9F6C0;
384 $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
385 type => 'G94n', charset => chr(($c / 8836)+0x30));
386 } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
387 my $c = $cc - 0xF49D7C;
388 $t = _i2g (chr(($c / 94)+0x21).chr(($c % 94)+0x21), $C,
389 type => 'G94n', charset => 'B', revision => '@');
390
391 } elsif (0xF0000 <= $cc && $cc <= 0x10F7FF) {
392 my $c = $cc - 0xF0000;
393 $t = _i2g (chr((($c % 9216) / 96)+0x20).chr(($c % 96)+0x20), $C,
394 type => 'G96n', charset => "\x20".chr(($c / 9216)+0x40));
395 } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
396 my $c = $cc - 0xE90940;
397 $t = _i2g (chr(($c % 94)+0x21), $C,
398 type => 'G94', charset => chr(($c / 94)+0x30));
399 } elsif (0xE92642 <= $cc && $cc <= 0xE9269F) {
400 my $c = $cc - 0xE92642;
401 $t = _i2g (chr($c+0x21),$C,type => 'G94', charset => "\x64", revision => '@');
402 } elsif (0xE926A0 <= $cc && $cc <= 0xE9443F) {
403 my $c = $cc - 0xE926A0;
404 $t = _i2g (chr(($c % 96)+0x20), $C,
405 type => 'G96', charset => chr(($c / 96)+0x30));
406 } elsif (0xE944A0 <= $cc && $cc <= 0xE961A1) {
407 my $c = $cc - 0xE944A0;
408 $t = _i2g (chr(($c % 94)+0x21), $C,
409 type => 'G94', charset => '!'.chr(($c / 94)+0x30));
410 } elsif (0xE96200 <= $cc && $cc <= 0xE97F9F) {
411 my $c = $cc - 0xE96200;
412 $t = _i2g (chr(($c % 96)+0x20), $C,
413 type => 'G96', charset => '!'.chr(($c / 96)+0x30));
414 } elsif (0xE98000 <= $cc && $cc <= 0xE99D01) {
415 my $c = $cc - 0xE98000;
416 $t = _i2g (chr(($c % 94)+0x21), $C,
417 type => 'G94', charset => '"'.chr(($c / 94)+0x30));
418 } elsif (0xE99D60 <= $cc && $cc <= 0xE9BAFF) {
419 my $c = $cc - 0xE99D60;
420 $t = _i2g (chr(($c % 96)+0x20), $C,
421 type => 'G96', charset => '"'.chr(($c / 96)+0x30));
422 } elsif (0xE9BB60 <= $cc && $cc <= 0xE9D861) {
423 my $c = $cc - 0xE9BB60;
424 $t = _i2g (chr(($c % 94)+0x21), $C,
425 type => 'G94', charset => '#'.chr(($c / 94)+0x30));
426 } elsif (0xE9D8C0 <= $cc && $cc <= 0xE9F65F) {
427 my $c = $cc - 0xE9D8C0;
428 $t = _i2g (chr(($c % 96)+0x20), $C,
429 type => 'G96', charset => '#'.chr(($c / 96)+0x30));
430 } elsif (0x70090940 <= $cc && $cc <= 0x70092641) {
431 my $c = $cc - 0x70090940;
432 $t = _i2g (chr(($c % 94)+0x21), $C,
433 type => 'G94', charset => "\x20".chr(($c / 94)+0x30));
434 } elsif (0x700926A0 <= $cc && $cc <= 0x7009443F) {
435 my $c = $cc - 0x700926A0;
436 $t = _i2g (chr(($c % 96)+0x20), $C,
437 type => 'G96', charset => "\x20".chr(($c / 96)+0x30));
438 ## TODO: DRCS with I byte: U+700944A0-U+7009F6BF
439 } elsif (0x70400000 <= $cc && $cc <= 0x7040FFED) {
440 my $c = $cc - 0x70400000;
441 $t = _i2g (chr(($c % 94)+0x21), $C, charset_id => 'P'.int ($c / 94),
442 type => 'G94', charset => $C->{option}->{private_set}->{G94}->[ $c / 94 ]);
443 } elsif (0x70410000 <= $cc && $cc <= 0x7041FFBF) {
444 my $c = $cc - 0x70410000;
445 $t = _i2g (chr(($c % 96)+0x20), $C, charset_id => 'P'.int ($c / 96),
446 type => 'G96', charset => $C->{option}->{private_set}->{G96}->[ $c / 96 ]);
447 } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
448 my $c = $cc % 0x10000;
449 $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
450 type => 'G94n',
451 charset_id => 'P'.int(($cc / 0x10000) - 0x7042).'_'.int($c / 8836),
452 charset => $C->{option}->{private_set}->{G94n}
453 ->[ ($cc / 0x10000) - 0x7042 ]->[ $c / 8836 ]);
454 }
455 if (defined $t) {
456 ## Back to ISO/IEC 2022 if necessary
457 $t = _i2o ($t, $C, cs_F => "\x40")
458 if $C->{coding_system} ne $CODING_SYSTEM{"\x40"};
459 } else {
460 ## Output in UCS-n or UTF-n if character can't be represented in ISO/IEC 2022
461 my $F; my @F = qw~G /G /H /I B /A /D /F~;
462 push @F, qw~/J /K /L~ if $cc <= 0x10FFFF;
463 push @F, qw~/@ /C /E~ if $cc <= 0xFFFF;
464 for (@F) {
465 if (defined $C->{option}->{designate_to}->{coding_system}->{$_}
466 && $C->{option}->{designate_to}->{coding_system}->{$_} > -1) {
467 $F = $_; last;
468 } elsif ($C->{option}->{designate_to}->{coding_system}->{default} > -1) {
469 $F = $_; last;
470 }
471 }
472 $t = _i2o ($c, $C, cs_F => $F) if $F;
473 }
474 if (defined $t) { ## Output the character itself
475 $r .= $t;
476 } elsif ($C->{option}->{fallback_from_ucs} =~ /quiet/) {
477 $r .= _back2ascii ($C) if $C->{option}->{fallback_from_ucs} =~ /back/;
478 return ($r, halfway => 1, converted_length => $i,
479 warn => $C->{option}->{fallback_from_ucs} =~ /warn/ ? 1 : 0,
480 reason => sprintf (q(U+%04X: There is no character mapped to), $cc));
481 } elsif ($C->{option}->{fallback_from_ucs} eq 'croak') {
482 return ($r, halfway => 1, die => 1,
483 reason => sprintf (q(U+%04X: There is no character mapped to), $cc));
484 } else {
485 ## Try to output with fallback escape sequence (if specified)
486 my $t = Encode::Charset->fallback_escape ($C, $c);
487 if (defined $t) {
488 my %D = (fallback => $C->{option}->{fallback_from_ucs}, reset => $C->{option}->{reset});
489 $C->{option}->{fallback_from_ucs} = 'croak';
490 $C->{option}->{reset_at_end} = {Gdesignation => 0, Ginvoke => 0};
491 eval q{$t = $C->{_encoder}->_encode_internal ($t, $C)} or undef $t;
492 $C->{option}->{fallback_from_ucs} = $D{fallback};
493 $C->{option}->{reset_at_end} = $D{reset_at_end};
494 }
495 if (defined $t) {
496 $r .= $t;
497 } else { ## Replacement character specified in charset definition
498 unless ($C->{option}->{undef_char}->[0] eq "\x20") { ## A graphic character
499 $t = _i2g ($C->{option}->{undef_char}->[0], $C,
500 %{ $C->{option}->{undef_char}->[1] });
501 } else { ## SPACE
502 $t = _back2ascii ($C) . "\x20";
503 }
504 $r .= $C->{coding_system} eq $CODING_SYSTEM{"\x40"} ?
505 $t : _i2o ($t, $C, cs_F => "\x40");
506 }
507 }
508 }
509 ($r . _back2ascii ($C, at => 'reset_at_end'));
510 }
511
512 ## $O{charset} eq undef means that charset is same as the current designated one.
513 sub _i2c ($%%) {
514 my ($s, $C, %O) = @_;
515 my $r = '';
516 if ($O{type} eq 'C0') {
517 if (defined $O{charset}) {
518 if ( $C->{C0} ne $CHARSET{C0}->{$O{charset}}
519 && $C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
520 for ($C->{option}->{designate_to}->{C0}->{$O{charset}},
521 $C->{option}->{designate_to}->{C0}->{default}) {
522 if (defined $_) { return undef if $_ == -1; last }
523 }
524 $r .= "\x1B\x21".$O{charset};
525 $C->{C0} = $CHARSET{C0}->{$O{charset}};
526 }
527 } elsif (defined $O{charset_id}) {
528 if ($C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
529 return undef; ## Control set is not designated nor has F byte
530 }
531 }
532 $r .= _back2ascii ($C, reset_all => $C->{C0}->{reset_all}->{$s});
533 return $r . $s;
534 } elsif ($O{type} eq 'C1') {
535 if (defined $O{charset}) {
536 if ( $C->{C1} ne $CHARSET{C1}->{$O{charset}}
537 && $C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
538 for ($C->{option}->{designate_to}->{C1}->{$O{charset}},
539 $C->{option}->{designate_to}->{C1}->{default}) {
540 if (defined $_) { return undef if $_ == -1; last }
541 }
542 $r .= "\x1B\x22".$O{charset};
543 $C->{C1} = $CHARSET{C1}->{$O{charset}};
544 }
545 } elsif (defined $O{charset_id}) {
546 if ($C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
547 return undef; ## Control set is not designated nor has F byte
548 }
549 }
550 $r .= _back2ascii ($C, reset_all => $C->{C1}->{reset_all}->{$s});
551 unless ($C->{option}->{C1invoke_to_right}) { ## ESC Fe
552 $s =~ s/([\x80-\x9F])/"\x1B" . pack ('C', ord ($1) - 0x40)/ge;
553 }
554 return $r . $s;
555 }
556 }
557 sub _i2g ($%%) {
558 my ($s, $C, %O) = @_;
559 my $r = '';
560 my $set = $CHARSET{$O{type}}->{$O{charset}.
561 ($O{revision}&&$C->{option}->{use_revision}?$O{revision}:'')};
562 my $set0 = $CHARSET{$O{type}}->{$O{charset_id}};
563 ## -- designate character set
564 my $G = 0;
565 if ($C->{G0} eq $set || $C->{G0} eq $set0) { $G = 0 }
566 elsif ($C->{G1} eq $set || $C->{G1} eq $set0) { $G = 1 }
567 elsif ($C->{G2} eq $set || $C->{G2} eq $set0) { $G = 2 }
568 elsif ($C->{G3} eq $set || $C->{G3} eq $set0) { $G = 3 }
569 else {
570 return undef unless $set; ## charset does not have F byte
571 $G = 1 if $O{type} eq 'G96' || $O{type} eq 'G96n';
572 for ($C->{option}->{designate_to}->{$O{type}}->{$O{charset}},
573 $C->{option}->{designate_to}->{$O{type}}->{default}) {
574 if (defined $_) {
575 $G = $_; last;
576 }
577 }
578 if ($G == -1) {
579 return undef;
580 }
581 if ($O{type} eq 'G94') {
582 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
583 ."\x1B".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
584 } elsif ($O{type} eq 'G94n') {
585 if ($G == 0 && !$C->{option}->{G94n_designate_long}
586 && ($O{charset} eq '@' || $O{charset} eq 'A' || $O{charset} eq 'B')) {
587 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
588 ."\x1B\x24".$O{charset};
589 } else {
590 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
591 ."\x1B\x24".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
592 }
593 } elsif ($O{type} eq 'G96') {
594 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
595 ."\x1B".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
596 } elsif ($O{type} eq 'G96n') {
597 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
598 ."\x1B\x24".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
599 }
600 $C->{'G'.$G} = $CHARSET{$O{type}}->{$O{charset}};
601 }
602 ## -- invoke G buffer
603 my $left = $C->{option}->{Ginvoke_to_left}->[$G];
604 if ($C->{GL} eq 'G'.$G) {
605 $left = 1;
606 } elsif ($C->{GR} eq 'G'.$G) {
607 $left = 0;
608 } else {
609 if ($C->{option}->{Ginvoke_by_single_shift}->[$G]) {
610 if ($C->{C1}->{'C_SS'.$G}) {
611 $r .= _i2c ($C->{C1}->{'C_SS'.$G}, $C, type => 'C1') || return undef;
612 } elsif ($C->{C0}->{'C_SS'.$G}) {
613 $r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef;
614 } else { ## Both C0 and C1 set do not have SS2/3.
615 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
616 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
617 }
618 } else {
619 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
620 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
621 }
622 }
623 $s =~ tr/\x00-\x7F/\x80-\xFF/ unless $left;
624 $r . $s;
625 }
626 sub _back2ascii ($;%) {
627 my ($C, %O) = @_;
628 my $r = '';
629 $O{at} ||= 'reset';
630 if ($C->{option}->{$O{at}}->{Gdesignation}||$C->{option}->{reset}->{Gdesignation}) {
631 my $F = $C->{option}->{$O{at}}->{Gdesignation}
632 || $C->{option}->{reset}->{Gdesignation}; # \x42
633 $r .= "\x1B\x28".$F unless $C->{G0} eq $CHARSET{G94}->{$F};
634 $C->{G0} = $CHARSET{G94}->{$F};
635 if ($O{reset_all}) {
636 $C->{G1} = $CHARSET{G94}->{"\x7E"};
637 $C->{G2} = $CHARSET{G94}->{"\x7E"};
638 $C->{G3} = $CHARSET{G94}->{"\x7E"};
639 }
640 }
641 if ($C->{option}->{$O{at}}->{Ginvoke}||$C->{option}->{reset}->{Ginvoke}) {
642 if ($C->{GL} ne 'G0') {
643 $r .= $C->{C0}->{C_LS0} || ($C->{C0} = $CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F");
644 $C->{GL} = 'G0';
645 }
646 $C->{GR} = undef if $O{reset_all};
647 }
648 $r;
649 }
650 ## __invoke (\%C, $G, $left_or_right)
651 sub __invoke (\%$$) {
652 my ($C, $G) = @_;
653 if ($_[2]) {
654 $C->{GL} = 'G'.$G;
655 return ($C->{C0}->{C_LS0}
656 || scalar ($C->{C0}=$CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F"),
657 $C->{C0}->{C_LS1}, "\x1B\x6E", "\x1B\x6F")[$G];
658 } else {
659 $C->{GR} = 'G'.$G;
660 return ("", "\x1B\x7E", "\x1B\x7D", "\x1B\x7C")[$G];
661 }
662 '';
663 }
664 sub _i2o ($\%%) {
665 my ($s, $C, %O) = @_;
666 my $CS = $CODING_SYSTEM{ $O{cs_F} } || $CODING_SYSTEM{ $O{cs_id} } || return undef;
667 my $r = '';
668 if ($CS ne $C->{coding_system}) {
669 my $e = '';
670 $e .= "\x1B\x25";
671 $e .= $O{cs_F} || $C->{option}->{private_set}->{coding_system}->{ $O{cs_id} }
672 || return undef;
673 if ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x40"}
674 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x43"}
675 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x45"}
676 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4A"}
677 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4B"}
678 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4C"}) {
679 $e =~ s/(.)/\x00$1/go;
680 } elsif ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x41"}
681 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x44"}
682 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x46"}) {
683 $e =~ s/(.)/\x00\x00\x00$1/go;
684 }
685 $r .= $e;
686 $C->{coding_system} = $CS;
687 if ($CS->{reset_state}) {
688 $C->{GL} = undef; $C->{GR} = undef;
689 $C->{C0} = $CHARSET{C0}->{"\x7E"};
690 $C->{C1} = $CHARSET{C1}->{"\x7E"};
691 $C->{G0} = $CHARSET{G94}->{"\x7E"};
692 $C->{G1} = $CHARSET{G94}->{"\x7E"};
693 $C->{G2} = $CHARSET{G94}->{"\x7E"};
694 $C->{G3} = $CHARSET{G94}->{"\x7E"};
695 }
696 }
697 if ($CS eq $CODING_SYSTEM{"\x40"}) {
698 #
699 } elsif ($CS eq $CODING_SYSTEM{G} || $CS eq $CODING_SYSTEM{'/G'}
700 || $CS eq $CODING_SYSTEM{'/H'} || $CS eq $CODING_SYSTEM{'/I'}) {
701 Encode::_utf8_off ($s);
702 } elsif ($CS eq $CODING_SYSTEM{'/@'} || $CS eq $CODING_SYSTEM{'/C'}
703 || $CS eq $CODING_SYSTEM{'/E'}) {
704 $s = Encode::encode ('ucs-2be', $s);
705 } elsif ($CS eq $CODING_SYSTEM{'/A'} || $CS eq $CODING_SYSTEM{'/D'}
706 || $CS eq $CODING_SYSTEM{'/F'}) {
707 $s = Encode::encode ('ucs-4be', $s);
708 } elsif ($CS eq $CODING_SYSTEM{'/J'} || $CS eq $CODING_SYSTEM{'/K'}
709 || $CS eq $CODING_SYSTEM{'/L'}) {
710 $s = Encode::encode ('UTF-16BE', $s);
711 } elsif ($CS eq $CODING_SYSTEM{B}) {
712 $s = Encode::encode ('utf-1', $s);
713 } else {
714 return undef;
715 }
716 $r . $s;
717 }
718
719 =head1 SEE ALSO
720
721 ISO/IEC 646:1991, "7-bit coded graphic character set for intormation interchange".
722
723 ISO/IEC 2022:1994, "Character Code Structure and Extension Techniques".
724 (IDT with ECMA 35, JIS X 0202:1998)
725
726 ISO/IEC 4873:1991, "8-Bit Coded Character Set Structure and Rules".
727 (IDT with ECMA 43)
728
729 ISO/IEC 6429:1992, "Control Functions for Coded Character Sets".
730 (IDT with ECMA 48:1991, JIS X 0211:1998)
731
732 ISO/IEC 8859, "8-Bit Single-Byte Coded Graphic Character Sets".
733
734 L<Encode>, perlunicode
735
736 =head1 TODO
737
738 =over 4
739
740 =item NCR (coding system other than ISO/IEC 2022) support
741
742 =over 2
743
744 =item ESC 02/05 02/15 03/x of X Compound Text
745
746 =back
747
748 =item Output of control character sets, single control functions
749
750 =item Designation sequence of control character sets (input)
751
752 =item Special graphic character sets such as G3 of EUC-TW
753
754 =item SUPER SHIFT (SS) invoke function of old control character set
755
756 =item Safe transparent of control string (ISO/IEC 6429)
757
758 =item Output of unoutputable characters as alternative notation such as SGML-like entity
759
760 =item C0 set invoked to CR area like ISIRI code
761
762 Really need?
763
764 =item special treatment of 0x20, 0x7E, 0xA0, 0xFF
765
766 For example, GB mongolian sets use MSP (MONGOLIAN SPACE)
767 with these code positions.
768
769 And, no less coding systems does not use (or does ban using) DEL.
770
771 =item A lot of character sets don't have pseudo-UCS mapping.
772
773 Most of 9m^n (n >= 3) sets, 9m^n sets with I byte, 9m^n
774 DRCSes do not have pseudo-UCS mapping area. It is
775 questionable to allocate lots of code positions to these
776 rarely-(or no-)used character sets.
777
778 =item Even character sets that have pseudo-UCS mapping, some of them can't be outputed in ISO/IEC 2022.
779
780 Because output of rarely-used character sets is
781 not implemented yet.
782
783 =back
784
785 =head1 AUTHORS
786
787 Nanashi-san <nanashi.san@nanashi.invalid>
788
789 Wakaba <w@suika.fam.cx>
790
791 =head1 LICENSE
792
793 Copyright 2002 AUTHORS, all rights reserved.
794
795 This library is free software; you can redistribute it
796 and/or modify it under the same terms as Perl itself.
797
798 =cut
799
800 1; # $Date: 2002/12/18 10:21:09 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24