/[suikacvs]/perl/lib/Encode/ISO2022.pm
Suika

Contents of /perl/lib/Encode/ISO2022.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.13 - (show annotations) (download)
Mon Dec 16 10:25:01 2002 UTC (21 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.12: +3 -3 lines
*** empty log message ***

1
2 =head1 NAME
3
4 Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder
5
6 =head1 ENCODINGS
7
8 =over 4
9
10 =item iso2022
11
12 ISO/IEC 2022:1994. Default status is:
13
14 =over 2
15
16 =item CL = C0 = ISO/IEC 6429:1991 C0 set
17
18 =item CR = C1 = ISO/IEC 6429:1991 C1 set
19
20 =item GL = G0 = ISO/IEC 646:1991 IRV GL(G0) set
21
22 =item GR = G1 = empty set
23
24 =item G2 = empty set
25
26 =item G3 = empty set
27
28 =back
29
30 (Alias: iso/iec2022, iso-2022, 2022, cp2022)
31
32 =back
33
34 Note that ISO/IEC 2022 based encodings are found in
35 Encode::ISO2022::* modules. This module, Encode::ISO2022
36 only provides a general ISO/IEC 2022 encoder/decoder.
37
38 =cut
39
40 require v5.7.3;
41 package Encode::ISO2022;
42 use strict;
43 use vars qw(%CHARSET %CODING_SYSTEM $VERSION);
44 $VERSION=do{my @r=(q$Revision: 1.12 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
45 use base qw(Encode::Encoding);
46 __PACKAGE__->Define (qw!iso-2022 iso/iec2022 iso2022 2022 cp2022!);
47 require Encode::Charset;
48 *CHARSET = \%Encode::Charset::CHARSET;
49 *CODING_SYSTEM = \%Encode::Charset::CODING_SYSTEM;
50
51 ### --- Perl Encode module common functions
52
53 sub encode ($$;$) {
54 my ($obj, $str, $chk) = @_;
55 $_[1] = '' if $chk;
56 $str = &internal_to_iso2022 ($str);
57 return $str;
58 }
59
60 sub decode ($$;$) {
61 my ($obj, $str, $chk) = @_;
62 $_[1] = '' if $chk;
63 return &iso2022_to_internal ($str);
64 }
65
66 ### --- Encode::ISO2022 unique functions
67 *new_object = \&Encode::Charset::new_object;
68
69 sub iso2022_to_internal ($;%) {
70 my ($s, $C) = @_;
71 $C ||= &new_object;
72 my $t = '';
73 $s =~ s{^((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)}{
74 my $i2 = $1;
75 $t = _iso2022_to_internal ($i2, $C);
76 '';
77 }es;
78 my $pad = '';
79 use re 'eval';
80 $s =~ s{
81 ## ISO/IEC 2022
82 (??{"$pad\x1B$pad\x25$pad\x40"})((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
83 ## UTF-8
84 |(??{"$pad\x1B$pad\x25$pad(?:\x47|\x2F$pad"."[\x47-\x49])"})
85 ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
86 ## UCS-2, UTF-16
87 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x40\x43\x45\x4A-\x4C])
88 ((?:(?!\x00\x1B\x00\x25(?:\x00\x2F)?\x00[\x30-\x7E])..)*)
89 ## UCS-4
90 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})[\x41\x44\x46]
91 ((?:(?!\x00\x00\x00\x1B\x00\x00\x00\x25(?:\x00\x00\x00\x2F)?
92 \x00\x00\x00[\x30-\x7E])....)*)
93 ## with standard return
94 |(??{"$pad\x1B$pad\x25$pad"})([\x30-\x7E])
95 ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
96 ## without standard return
97 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x30-\x7E])(.*)
98 }{
99 my ($i2,$u8,$Fu2,$u2,$u4,$Fsr,$sr,$Fnsr,$nsr) = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
100 my $r = '';
101 if (defined $i2) {
102 $r = _iso2022_to_internal ($i2, $C); $pad = '';
103 } elsif (defined $u8) {
104 $r = Encode::decode ('utf8', $u8); $pad = '';
105 } elsif ($Fu2) {
106 if (ord ($Fu2) > 0x49) {
107 $r = Encode::decode ('utf-16be', $u2);
108 } else {
109 $r = Encode::decode ('ucs-2be', $u2);
110 }
111 $pad = "\x00";
112 } elsif (defined $u4) {
113 $r = Encode::decode ('ucs-4be', $u2); $pad = "\x00\x00\x00";
114 } elsif (defined $Fsr && $CODING_SYSTEM{$Fsr}->{perl_name}) {
115 $r = Encode::decode ($CODING_SYSTEM{$Fsr}->{perl_name}, $sr); $pad = '';
116 } elsif (defined $Fnsr && $CODING_SYSTEM{$Fnsr}->{perl_name}) {
117 $r = Encode::decode ($CODING_SYSTEM{$Fnsr}->{perl_name}, $nsr); $pad = '';
118 } else { ## temporary
119 $r = '?' x length ($sr.$nsr); $pad = '';
120 }
121 $r;
122 }gesx;
123 $t . $s;
124 }
125
126 # this is very very trickey. my perl 5.8.0 does not process
127 # regex with eval except the first time (i think it's a bug
128 # of perl), so we redefine this function whenever being called!
129 # when this unexpected behavior is fixed or someone finds
130 # better way to avoid it, we will rewrite this code.
131 &_iso2022_to_internal (undef);
132 sub _iso2022_to_internal ($;%) {
133 eval q{ sub __iso2022_to_internal ($;%) { 0 } };
134 eval q{
135 sub __iso2022_to_internal ($;%) {
136 use re 'eval';
137 my ($s, $C) = @_;
138 my %_GB_to_GN = (
139 "\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3',
140 "\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3',
141 );
142 my %_CHARS_to_RANGE = (
143 l94 => q/[\x21-\x7E]/, l96 => q/[\x20-\x7F]/,
144 l128 => q/[\x00-\x7F]/, l256 => q/[\x00-\xFF]/,
145 r94 => q/[\xA1-\xFE]/, r96 => q/[\xA0-\xFF]/,
146 r128 => q/[\x80-\xFF]/, r256 => q/[\x80-\xFF]/,
147 b94 => q/[\x21-\x7E\xA1-\xFE]/, b96 => q/[\x20-\x7F\xA0-\xFF]/,
148 b128 => q/[\x00-\xFF]/, b256 => q/[\x00-\xFF]/,
149 );
150
151 $s =~ s{
152 ((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}}
153 . qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ }))
154 |((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GR}}->{chars}}
155 . qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ }))
156 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)')
157 . ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ?
158 qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '')
159 . ($C->{$C->{CL}}->{r_SS2} ? qq/|$C->{$C->{CL}}->{r_SS2}/ : '') . q/)/
160 . ( $C->{$C->{CL}}->{r_LS0}
161 ||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
162 qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
163 })
164 ((??{ $_CHARS_to_RANGE{'b'.$C->{G2}->{chars}}
165 . qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ }))
166 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)')
167 . ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ?
168 qq/|$C->{$C->{ESC_Fe}}->{r_SS3_ESC}/ : '')
169 . ($C->{$C->{CL}}->{r_SS3} ? qq/|$C->{$C->{CL}}->{r_SS3}/ : '') . q/)/
170 . ( $C->{$C->{CL}}->{r_LS0}
171 || $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
172 qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
173 })
174 ((??{ $_CHARS_to_RANGE{'b'.$C->{G3}->{chars}}
175 . qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ }))
176
177 ## Locking shift
178 |( (??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })
179 |(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })
180 )
181
182 ## Control sequence
183 |(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)')
184 .($C->{$C->{ESC_Fe}}->{r_CSI_ESC} ?
185 qq/|$C->{$C->{ESC_Fe}}->{r_CSI_ESC}/: '')
186 .')'
187 })
188 ((??{ qq/[\x30-\x3F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xB0-\xBF]*/
189 .qq/[\x20-\x2F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xA0-\xAF]*/
190 }) [\x40-\x7E\xD0-\xFE])
191
192 ## Other escape sequence
193 |(\x1B[\x20-\x2F]*[\x30-\x7E])
194
195 ## Misc. sequence (SP, control, or broken data)
196 |([\x00-\xFF])
197 }{
198 my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc)
199 = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
200 $C->{_irr} = undef unless defined $esc;
201 ## GL graphic character
202 if (defined $gl) {
203 my $c = 0;
204 my $m = $C->{$C->{GL}}->{chars}==94?0x21:$C->{$C->{GL}}->{chars}==96?0x20:0;
205 for (split //, $gl) {
206 $c = $c * $C->{$C->{GL}}->{chars} + unpack ('C', $_) - $m;
207 }
208 chr ($C->{$C->{GL}}->{ucs} + $c);
209 ## GR graphic character
210 } elsif ($gr) {
211 my $c = 0;
212 my $m = $C->{$C->{GR}}->{chars}==94?0xA1:$C->{$C->{GR}}->{chars}==96?0xA0:0x80;
213 for (split //, $gr) {
214 $c = $c * $C->{$C->{GR}}->{chars} + unpack ('C', $_) - $m;
215 }
216 chr ($C->{$C->{GR}}->{ucs} + $c);
217 ## Control, SP, or broken data
218 ## TODO: support control sets other than ISO/IEC 6429's
219 } elsif (defined $misc) {
220 $misc;
221 ## Graphic character with SS2
222 } elsif ($ss2) {
223 $ss2 =~ tr/\x80-\xFF/\x00-\x7F/;
224 my $c = 0; my $m = $C->{G2}->{chars}==94?0x21:$C->{G2}->{chars}==96?0x20:0;
225 for (split //, $ss2) {
226 $c = $c * $C->{G2}->{chars} + unpack ('C', $_) - $m;
227 }
228 chr ($C->{G2}->{ucs} + $c);
229 ## Graphic character with SS3
230 } elsif ($ss3) {
231 $ss3 =~ tr/\x80-\xFF/\x00-\x7F/;
232 my $c = 0; my $m = $C->{G3}->{chars}==94?0x21:$C->{G3}->{chars}==96?0x20:0;
233 for (split //, $ss3) {
234 $c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m;
235 }
236 chr ($C->{G3}->{ucs} + $c);
237 ## Escape sequence
238 } elsif ($esc) {
239 if ($esc =~ /\x1B\x26([\x40-\x7E])/) { ## 6F (IRR) = ESC 02/06 Ft
240 $C->{_irr} = $1; $esc = '';
241 } else {
242 $esc =~ s{
243 \x1B([\x28-\x2B])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^1
244 |\x1B\x24([\x28-\x2B]?)(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^n
245
246 |\x1B([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^1
247 |\x1B\x24([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^n
248
249 |\x1B([\x40-\x5F]) ## ESC Fe
250
251 |\x1B\x21([\x21-\x23]?[\x30-\x7E]) ## CL = C0
252 |\x1B\x22([\x21-\x23]?[\x30-\x7E]) ## CR & ESC Fe = C1
253
254 |\x1B([\x60-\x7E]) ## Single control functions
255 |\x1B\x23([\x21-\x23]?)([\x30-\x7E])
256
257 |\x1B\x20([\x40-\x7E]) ## Announcer
258 }{
259 my ($g94_g,$g94_f,$g94n_g,$g94n_f,$g96_g,$g96_f,$g96n_g,$g96n_f,$Fe,
260 $CZD, $C1D, $Fs, $sI, $sF,$ACS)
261 = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15);
262 my $rev = $C->{_irr} || '';
263 my $f2s = $C->{option}->{final_to_set};
264 if ($g94_g) { ## ESC 02/08 [I] F
265 $C->{ $_GB_to_GN{ $g94_g } }
266 = $CHARSET{G94}->{ $f2s->{G94}->{$g94_f.$rev} || $g94_f.$rev }
267 || $CHARSET{G94}->{ $f2s->{G94}->{$g94_f} || $g94_f }
268 || $CHARSET{G94}->{ "\x7E" }; '';
269 } elsif (defined $g94n_f) { ## ESC 02/04 [02/08..11] [I] F
270 $C->{ $_GB_to_GN{ $g94n_g } || 'G0' }
271 = $CHARSET{G94n}->{ $f2s->{G94n}->{$g94n_f.$rev} || $g94n_f.$rev }
272 || $CHARSET{G94n}->{ $f2s->{G94n}->{$g94n_f} || $g94n_f }
273 || $CHARSET{G94n}->{ "\x7E" }; '';
274 } elsif ($g96_g) { ## ESC 02/12..15 [I] F
275 $C->{ $_GB_to_GN{ $g96_g } }
276 = $CHARSET{G96}->{ $f2s->{G96}->{$g96_f.$rev} || $g96_f.$rev }
277 || $CHARSET{G96}->{ $f2s->{G96}->{$g96_f} || $g96_f }
278 || $CHARSET{G96}->{ "\x7E" }; '';
279 } elsif ($g96n_g) { ## ESC 02/04 02/12..15 [I] F
280 $C->{ $_GB_to_GN{ $g96n_g } }
281 = $CHARSET{G96n}->{ $f2s->{G96n}->{$g96n_f.$rev} || $g96n_f.$rev }
282 || $CHARSET{G96n}->{ $f2s->{G96n}->{$g96n_f} || $g96n_f }
283 || $CHARSET{G96n}->{ "\x7E" }; '';
284 } elsif ($Fe) { ## ESC Fe = C1
285 chr ($C->{ $C->{ESC_Fe} }->{ucs} + (ord ($Fe) - 0x40));
286 } elsif (defined $Fs) { ## ESC Fs
287 if ($Fs eq "\x6E") { ## LS2
288 $C->{GL} = 'G2'; '';
289 } elsif ($Fs eq "\x6F") { ## LS3
290 $C->{GL} = 'G3'; '';
291 } elsif ($Fs eq "\x7E" || $Fs eq "\x6B") { ## LS1R
292 $C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; '';
293 } elsif ($Fs eq "\x7D" || $Fs eq "\x6C") { ## LS2R
294 $C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; '';
295 } elsif ($Fs eq "\x7C" || $Fs eq "\x6D") { ## LS3R
296 $C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; '';
297 } else {
298 chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60));
299 }
300 } elsif (defined $CZD) { ## 1F (CZD) = ESC 02/01 [I] F
301 $C->{C0} = $CHARSET{C0}->{ $f2s->{C0}->{$CZD.$rev} || $CZD.$rev }
302 || $CHARSET{C0}->{ $f2s->{C0}->{$CZD} || $CZD }
303 || $CHARSET{C0}->{ "\x7E" }; '';
304 } elsif (defined $C1D) { ## 2F (C1D) = ESC 02/02 [I] F
305 $C->{C1} = $CHARSET{C1}->{ $f2s->{C1}->{$C1D.$rev} || $C1D.$rev }
306 || $CHARSET{C1}->{ $f2s->{C1}->{$C1D} || $C1D }
307 || $CHARSET{C1}->{ "\x7E" }; '';
308 } elsif ($sI) { ## 3F = ESC 02/03 [I] F
309 chr ($CHARSET{single_control}->{'3F'.$sI}->{ucs} + (ord ($sF) - 0x30));
310 } elsif ($ACS) { ## 0F (Announcer) = ESC 02/00 F
311 if ($ACS eq "\x4A") { $C->{bit} = 7 }
312 elsif ($ACS eq "\x4B") { $C->{bit} = 8 }
313 '';
314 }
315 }gex;
316 $C->{_irr} = undef;
317 }
318 $esc;
319 } elsif ($ls) { ## Locking shifts = LS0 / LS1
320 if ($ls eq $C->{$C->{CL}}->{LS0}) {
321 $C->{GL} = 'G0'; '';
322 } elsif ($ls eq $C->{$C->{CL}}->{LS1}) {
323 $C->{GL} = 'G1'; '';
324 }
325 } elsif ($csi) { ## Control sequence = CSI [P..] [I] F
326 $csi =~ tr/\xA0-\xFF/\x20-\x7F/d;
327 $csi =~ s/$C->{$C->{CL}}->{LS0}//g if $C->{$C->{CL}}->{LS0};
328 $csi =~ s/$C->{$C->{CL}}->{LS1}//g if $C->{$C->{CL}}->{LS1};
329 "\x9B".$csi;
330 }
331 }gesx;
332 $s;
333 } # __iso2022_to_internal
334
335 };
336 &__iso2022_to_internal (@_) if defined $_[0];
337
338 } # _iso2022_to_internal
339
340 sub internal_to_iso2022 ($;%) {
341 my ($s, $C) = @_;
342 $C ||= &new_object;
343
344 my $r = '';
345 my @c = split //, $s;
346 for my $i (0..$#c) {
347 my $c = $c[$i]; my $cc = ord $c; Encode::_utf8_off ($c);
348 my $t;
349 if ($cc <= 0x1F) {
350 $t = _i2c ($c, $C, type => 'C0', charset => '@');
351 } elsif ($cc == 0x20 || $cc == 0x7F) {
352 $t = _back2ascii ($C) . $c;
353 } elsif ($cc < 0x7F) {
354 $t = _i2g ($c, $C, type => 'G94', charset => 'B');
355 } elsif ($cc <= 0x9F) {
356 $t = _i2c (pack ('C', $cc), $C, type => 'C1', charset_id => '64291991C1',
357 charset => $C->{option}->{private_set}->{XC1}->{'64291991C1'});
358 } elsif ($cc <= 0xFF) {
359 $t = _i2g (pack ('C', $cc-0x80), $C, type => 'G96', charset => 'A');
360 } elsif ($cc <= 0x24FF) {
361 my $c = $cc - 0x100;
362 my $final = $C->{option}->{private_set}->{U96n}->[0];
363 if (length $final) {
364 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
365 type => 'G96n', charset => $final);
366 }
367 } elsif ($cc <= 0x33FF) {
368 my $c = $cc - 0x2500;
369 my $final = $C->{option}->{private_set}->{U96n}->[1];
370 if (length $final) {
371 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
372 type => 'G96n', charset => $final);
373 }
374 } elsif (0xE000 <= $cc && $cc <= 0xFFFF) {
375 my $c = $cc - 0xE000;
376 my $final = $C->{option}->{private_set}->{U96n}->[2];
377 if (length $final) {
378 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
379 type => 'G96n', charset => $final);
380 }
381
382 } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
383 my $c = $cc - 0xE9F6C0;
384 $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
385 type => 'G94n', charset => chr(($c / 8836)+0x30));
386 } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
387 my $c = $cc - 0xF49D7C;
388 $t = _i2g (chr(($c / 94)+0x21).chr(($c % 94)+0x21), $C,
389 type => 'G94n', charset => 'B', revision => '@');
390
391 } elsif (0xF0000 <= $cc && $cc <= 0x10F7FF) {
392 my $c = $cc - 0xF0000;
393 $t = _i2g (chr((($c % 9216) / 96)+0x20).chr(($c % 96)+0x20), $C,
394 type => 'G96n', charset => "\x20".chr(($c / 9216)+0x40));
395 } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
396 my $c = $cc - 0xE90940;
397 $t = _i2g (chr(($c % 94)+0x21), $C,
398 type => 'G94', charset => chr(($c / 94)+0x30));
399 } elsif (0xE92642 <= $cc && $cc <= 0xE9269F) {
400 my $c = $cc - 0xE92642;
401 $t = _i2g (chr($c+0x21),$C,type => 'G94', charset => "\x64", revision => '@');
402 } elsif (0xE926A0 <= $cc && $cc <= 0xE9443F) {
403 my $c = $cc - 0xE926A0;
404 $t = _i2g (chr(($c % 96)+0x20), $C,
405 type => 'G96', charset => chr(($c / 96)+0x30));
406 } elsif (0xE944A0 <= $cc && $cc <= 0xE961A1) {
407 my $c = $cc - 0xE944A0;
408 $t = _i2g (chr(($c % 94)+0x21), $C,
409 type => 'G94', charset => '!'.chr(($c / 94)+0x30));
410 } elsif (0xE96200 <= $cc && $cc <= 0xE97F9F) {
411 my $c = $cc - 0xE96200;
412 $t = _i2g (chr(($c % 96)+0x20), $C,
413 type => 'G96', charset => '!'.chr(($c / 96)+0x30));
414 } elsif (0xE98000 <= $cc && $cc <= 0xE99D01) {
415 my $c = $cc - 0xE98000;
416 $t = _i2g (chr(($c % 94)+0x21), $C,
417 type => 'G94', charset => '"'.chr(($c / 94)+0x30));
418 } elsif (0xE99D60 <= $cc && $cc <= 0xE9BAFF) {
419 my $c = $cc - 0xE99D60;
420 $t = _i2g (chr(($c % 96)+0x20), $C,
421 type => 'G96', charset => '"'.chr(($c / 96)+0x30));
422 } elsif (0xE9BB60 <= $cc && $cc <= 0xE9D861) {
423 my $c = $cc - 0xE9BB60;
424 $t = _i2g (chr(($c % 94)+0x21), $C,
425 type => 'G94', charset => '#'.chr(($c / 94)+0x30));
426 } elsif (0xE9D8C0 <= $cc && $cc <= 0xE9F65F) {
427 my $c = $cc - 0xE9D8C0;
428 $t = _i2g (chr(($c % 96)+0x20), $C,
429 type => 'G96', charset => '#'.chr(($c / 96)+0x30));
430 } elsif (0x70090940 <= $cc && $cc <= 0x70092641) {
431 my $c = $cc - 0x70090940;
432 $t = _i2g (chr(($c % 94)+0x21), $C,
433 type => 'G94', charset => "\x20".chr(($c / 94)+0x30));
434 } elsif (0x700926A0 <= $cc && $cc <= 0x7009443F) {
435 my $c = $cc - 0x700926A0;
436 $t = _i2g (chr(($c % 96)+0x20), $C,
437 type => 'G96', charset => "\x20".chr(($c / 96)+0x30));
438 ## TODO: DRCS with I byte: U+700944A0-U+7009F6BF
439 } elsif (0x70400000 <= $cc && $cc <= 0x7040FFED) {
440 my $c = $cc - 0x70400000;
441 $t = _i2g (chr(($c % 94)+0x21), $C, charset_id => 'P'.int ($c / 94),
442 type => 'G94', charset => $C->{option}->{private_set}->{G94}->[ $c / 94 ]);
443 } elsif (0x70410000 <= $cc && $cc <= 0x7041FFBF) {
444 my $c = $cc - 0x70410000;
445 $t = _i2g (chr(($c % 96)+0x20), $C, charset_id => 'P'.int ($c / 96),
446 type => 'G96', charset => $C->{option}->{private_set}->{G96}->[ $c / 96 ]);
447 } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
448 my $c = $cc % 0x10000;
449 $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
450 type => 'G94n',
451 charset_id => 'P'.int(($cc / 0x10000) - 0x7042).'_'.int($c / 8836),
452 charset => $C->{option}->{private_set}->{G94n}
453 ->[ ($cc / 0x10000) - 0x7042 ]->[ $c / 8836 ]);
454 }
455 if (defined $t) {
456 ## Back to ISO/IEC 2022 if necessary
457 $t = _i2o ($t, $C, cs_F => "\x40")
458 if $C->{coding_system} ne $CODING_SYSTEM{"\x40"};
459 } else {
460 ## Output in UCS-n or UTF-n if character can't be represented in ISO/IEC 2022
461 my $F; my @F = qw~G /G /H /I B /A /D /F~;
462 push @F, qw~/J /K /L~ if $cc <= 0x10FFFF;
463 push @F, qw~/@ /C /E~ if $cc <= 0xFFFF;
464 for (@F) {
465 if (defined $C->{option}->{designate_to}->{coding_system}->{$_}
466 && $C->{option}->{designate_to}->{coding_system}->{$_} > -1) {
467 $F = $_; last;
468 } elsif ($C->{option}->{designate_to}->{coding_system}->{default} > -1) {
469 $F = $_; last;
470 }
471 }
472 $t = _i2o ($c, $C, cs_F => $F) if $F;
473 }
474 if (defined $t) { ## Output the character itself
475 $r .= $t;
476 } elsif ($C->{option}->{fallback_from_ucs} =~ /quiet/) {
477 $r .= _back2ascii ($C) if $C->{option}->{fallback_from_ucs} =~ /back/;
478 return ($r, halfway => 1, converted_length => $i,
479 warn => $C->{option}->{fallback_from_ucs} =~ /warn/ ? 1 : 0,
480 reason => sprintf (q(U+%04X: There is no character mapped to), $cc));
481 } elsif ($C->{option}->{fallback_from_ucs} eq 'croak') {
482 return ($r, halfway => 1, die => 1,
483 reason => sprintf (q(U+%04X: There is no character mapped to), $cc));
484 } else {
485 ## Try to output with fallback escape sequence (if specified)
486 my $t = Encode::Charset::fallback_escape ($C, $c);
487 if (defined $t) {
488 my %D = (fallback => $C->{option}->{fallback_from_ucs}, reset => $C->{option}->{reset});
489 $C->{option}->{fallback_from_ucs} = 'croak';
490 $C->{option}->{reset} = {Gdesignation => 0, Ginvoke => 0};
491 eval q{$t = $C->{_encoder}->_encode_internal ($t, $C)} or undef $t;
492 $C->{option}->{fallback_from_ucs} = $D{fallback};
493 $C->{option}->{reset} = $D{reset};
494 }
495 if (defined $t) {
496 $r .= $t;
497 } else { ## Replacement character specified in charset definition
498 unless ($C->{option}->{undef_char}->[0] eq "\x20") { ## A graphic character
499 $t = _i2g ($C->{option}->{undef_char}->[0], $C,
500 %{ $C->{option}->{undef_char}->[1] });
501 } else { ## SPACE
502 $t = _back2ascii ($C) . "\x20";
503 }
504 $r .= $C->{coding_system} eq $CODING_SYSTEM{"\x40"} ?
505 $t : _i2o ($t, $C, cs_F => "\x40");
506 }
507 }
508 }
509 ($r . _back2ascii ($C)); ## Back to ASCII at the end of document if specified
510 }
511
512 ## $O{charset} eq undef means that charset is same as the current designated one.
513 sub _i2c ($%%) {
514 my ($s, $C, %O) = @_;
515 my $r = '';
516 if ($O{type} eq 'C0') {
517 if (defined $O{charset}) {
518 if ( $C->{C0} ne $CHARSET{C0}->{$O{charset}}
519 && $C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
520 for ($C->{option}->{designate_to}->{C0}->{$O{charset}},
521 $C->{option}->{designate_to}->{C0}->{default}) {
522 if (defined $_) { return undef if $_ == -1; last }
523 }
524 $r .= "\x1B\x21".$O{charset};
525 $C->{C0} = $CHARSET{C0}->{$O{charset}};
526 }
527 } elsif (defined $O{charset_id}) {
528 if ($C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
529 return undef; ## Control set is not designated nor has F byte
530 }
531 }
532 $r .= _back2ascii ($C, reset_all => $C->{C0}->{reset_all}->{$s});
533 return $r . $s;
534 } elsif ($O{type} eq 'C1') {
535 if (defined $O{charset}) {
536 if ( $C->{C1} ne $CHARSET{C1}->{$O{charset}}
537 && $C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
538 for ($C->{option}->{designate_to}->{C1}->{$O{charset}},
539 $C->{option}->{designate_to}->{C1}->{default}) {
540 if (defined $_) { return undef if $_ == -1; last }
541 }
542 $r .= "\x1B\x22".$O{charset};
543 $C->{C1} = $CHARSET{C1}->{$O{charset}};
544 }
545 } elsif (defined $O{charset_id}) {
546 if ($C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
547 return undef; ## Control set is not designated nor has F byte
548 }
549 }
550 $r .= _back2ascii ($C, reset_all => $C->{C1}->{reset_all}->{$s});
551 unless ($C->{option}->{C1invoke_to_right}) { ## ESC Fe
552 $s =~ s/([\x80-\x9F])/"\x1B" . pack ('C', ord ($1) - 0x40)/ge;
553 }
554 return $r . $s;
555 }
556 }
557 sub _i2g ($%%) {
558 my ($s, $C, %O) = @_;
559 my $r = '';
560 my $set = $CHARSET{$O{type}}->{$O{charset}.
561 ($O{revision}&&$C->{option}->{use_revision}?$O{revision}:'')};
562 my $set0 = $CHARSET{$O{type}}->{$O{charset_id}};
563 ## -- designate character set
564 my $G = 0;
565 if ($C->{G0} eq $set || $C->{G0} eq $set0) { $G = 0 }
566 elsif ($C->{G1} eq $set || $C->{G1} eq $set0) { $G = 1 }
567 elsif ($C->{G2} eq $set || $C->{G2} eq $set0) { $G = 2 }
568 elsif ($C->{G3} eq $set || $C->{G3} eq $set0) { $G = 3 }
569 else {
570 return undef unless $set; ## charset does not have F byte
571 $G = 1 if $O{type} eq 'G96' || $O{type} eq 'G96n';
572 for ($C->{option}->{designate_to}->{$O{type}}->{$O{charset}},
573 $C->{option}->{designate_to}->{$O{type}}->{default}) {
574 if (defined $_) {
575 $G = $_; last;
576 }
577 }
578 if ($G == -1) {
579 return undef;
580 }
581 if ($O{type} eq 'G94') {
582 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
583 ."\x1B".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
584 } elsif ($O{type} eq 'G94n') {
585 if ($G == 0 && !$C->{option}->{G94n_designate_long}
586 && ($O{charset} eq '@' || $O{charset} eq 'A' || $O{charset} eq 'B')) {
587 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
588 ."\x1B\x24".$O{charset};
589 } else {
590 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
591 ."\x1B\x24".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
592 }
593 } elsif ($O{type} eq 'G96') {
594 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
595 ."\x1B".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
596 } elsif ($O{type} eq 'G96n') {
597 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
598 ."\x1B\x24".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
599 }
600 $C->{'G'.$G} = $CHARSET{$O{type}}->{$O{charset}};
601 }
602 ## -- invoke G buffer
603 my $left = $C->{option}->{Ginvoke_to_left}->[$G];
604 if ($C->{GL} eq 'G'.$G) {
605 $left = 1;
606 } elsif ($C->{GR} eq 'G'.$G) {
607 $left = 0;
608 } else {
609 if ($C->{option}->{Ginvoke_by_single_shift}->[$G]) {
610 if ($C->{C1}->{'C_SS'.$G}) {
611 $r .= _i2c ($C->{C1}->{'C_SS'.$G}, $C, type => 'C1') || return undef;
612 } elsif ($C->{C0}->{'C_SS'.$G}) {
613 $r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef;
614 } else { ## Both C0 and C1 set do not have SS2/3.
615 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
616 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
617 }
618 } else {
619 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
620 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
621 }
622 }
623 $s =~ tr/\x00-\x7F/\x80-\xFF/ unless $left;
624 $r . $s;
625 }
626 sub _back2ascii (%) {
627 my ($C, %O) = @_;
628 my $r = '';
629 if ($C->{option}->{reset}->{Gdesignation}) {
630 my $F = $C->{option}->{reset}->{Gdesignation}; # \x42
631 $r .= "\x1B\x28".$F unless $C->{G0} eq $CHARSET{G94}->{$F};
632 $C->{G0} = $CHARSET{G94}->{$F};
633 if ($O{reset_all}) {
634 $C->{G1} = $CHARSET{G94}->{"\x7E"};
635 $C->{G2} = $CHARSET{G94}->{"\x7E"};
636 $C->{G3} = $CHARSET{G94}->{"\x7E"};
637 }
638 }
639 if ($C->{option}->{reset}->{Ginvoke}) {
640 if ($C->{GL} ne 'G0') {
641 $r .= $C->{C0}->{C_LS0} || ($C->{C0} = $CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F");
642 $C->{GL} = 'G0';
643 }
644 $C->{GR} = undef if $O{reset_all};
645 }
646 $r;
647 }
648 ## __invoke (\%C, $G, $left_or_right)
649 sub __invoke (\%$$) {
650 my ($C, $G) = @_;
651 if ($_[2]) {
652 $C->{GL} = 'G'.$G;
653 return ($C->{C0}->{C_LS0}
654 || scalar ($C->{C0}=$CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F"),
655 $C->{C0}->{C_LS1}, "\x1B\x6E", "\x1B\x6F")[$G];
656 } else {
657 $C->{GR} = 'G'.$G;
658 return ("", "\x1B\x7E", "\x1B\x7D", "\x1B\x7C")[$G];
659 }
660 '';
661 }
662 sub _i2o ($\%%) {
663 my ($s, $C, %O) = @_;
664 my $CS = $CODING_SYSTEM{ $O{cs_F} } || $CODING_SYSTEM{ $O{cs_id} } || return undef;
665 my $r = '';
666 if ($CS ne $C->{coding_system}) {
667 my $e = '';
668 $e .= "\x1B\x25";
669 $e .= $O{cs_F} || $C->{option}->{private_set}->{coding_system}->{ $O{cs_id} }
670 || return undef;
671 if ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x40"}
672 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x43"}
673 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x45"}
674 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4A"}
675 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4B"}
676 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4C"}) {
677 $e =~ s/(.)/\x00$1/go;
678 } elsif ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x41"}
679 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x44"}
680 || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x46"}) {
681 $e =~ s/(.)/\x00\x00\x00$1/go;
682 }
683 $r .= $e;
684 $C->{coding_system} = $CS;
685 if ($CS->{reset_state}) {
686 $C->{GL} = undef; $C->{GR} = undef;
687 $C->{C0} = $CHARSET{C0}->{"\x7E"};
688 $C->{C1} = $CHARSET{C1}->{"\x7E"};
689 $C->{G0} = $CHARSET{G94}->{"\x7E"};
690 $C->{G1} = $CHARSET{G94}->{"\x7E"};
691 $C->{G2} = $CHARSET{G94}->{"\x7E"};
692 $C->{G3} = $CHARSET{G94}->{"\x7E"};
693 }
694 }
695 if ($CS eq $CODING_SYSTEM{"\x40"}) {
696 #
697 } elsif ($CS eq $CODING_SYSTEM{G} || $CS eq $CODING_SYSTEM{'/G'}
698 || $CS eq $CODING_SYSTEM{'/H'} || $CS eq $CODING_SYSTEM{'/I'}) {
699 Encode::_utf8_off ($s);
700 } elsif ($CS eq $CODING_SYSTEM{'/@'} || $CS eq $CODING_SYSTEM{'/C'}
701 || $CS eq $CODING_SYSTEM{'/E'}) {
702 $s = Encode::encode ('ucs-2be', $s);
703 } elsif ($CS eq $CODING_SYSTEM{'/A'} || $CS eq $CODING_SYSTEM{'/D'}
704 || $CS eq $CODING_SYSTEM{'/F'}) {
705 $s = Encode::encode ('ucs-4be', $s);
706 } elsif ($CS eq $CODING_SYSTEM{'/J'} || $CS eq $CODING_SYSTEM{'/K'}
707 || $CS eq $CODING_SYSTEM{'/L'}) {
708 $s = Encode::encode ('UTF-16BE', $s);
709 } elsif ($CS eq $CODING_SYSTEM{B}) {
710 $s = Encode::encode ('utf-1', $s);
711 } else {
712 return undef;
713 }
714 $r . $s;
715 }
716
717 =head1 SEE ALSO
718
719 ISO/IEC 646:1991, "7-bit coded graphic character set for intormation interchange".
720
721 ISO/IEC 2022:1994, "Character Code Structure and Extension Techniques".
722 (IDT with ECMA 35, JIS X 0202:1998)
723
724 ISO/IEC 4873:1991, "8-Bit Coded Character Set Structure and Rules".
725 (IDT with ECMA 43)
726
727 ISO/IEC 6429:1992, "Control Functions for Coded Character Sets".
728 (IDT with ECMA 48:1991, JIS X 0211:1998)
729
730 ISO/IEC 8859, "8-Bit Single-Byte Coded Graphic Character Sets".
731
732 L<Encode>, perlunicode
733
734 =head1 TODO
735
736 =over 4
737
738 =item NCR (coding system other than ISO/IEC 2022) support
739
740 =over 2
741
742 =item ESC 02/05 02/15 03/x of X Compound Text
743
744 =back
745
746 =item Output of control character sets, single control functions
747
748 =item Designation sequence of control character sets (input)
749
750 =item Special graphic character sets such as G3 of EUC-TW
751
752 =item SUPER SHIFT (SS) invoke function of old control character set
753
754 =item Safe transparent of control string (ISO/IEC 6429)
755
756 =item Output of unoutputable characters as alternative notation such as SGML-like entity
757
758 =item C0 set invoked to CR area like ISIRI code
759
760 Really need?
761
762 =item special treatment of 0x20, 0x7E, 0xA0, 0xFF
763
764 For example, GB mongolian sets use MSP (MONGOLIAN SPACE)
765 with these code positions.
766
767 And, no less coding systems does not use (or does ban using) DEL.
768
769 =item A lot of character sets don't have pseudo-UCS mapping.
770
771 Most of 9m^n (n >= 3) sets, 9m^n sets with I byte, 9m^n
772 DRCSes do not have pseudo-UCS mapping area. It is
773 questionable to allocate lots of code positions to these
774 rarely-(or no-)used character sets.
775
776 =item Even character sets that have pseudo-UCS mapping, some of them can't be outputed in ISO/IEC 2022.
777
778 Because output of rarely-used character sets is
779 not implemented yet.
780
781 =back
782
783 =head1 AUTHORS
784
785 Nanashi-san <nanashi.san@nanashi.invalid>
786
787 Wakaba <w@suika.fam.cx>
788
789 =head1 LICENSE
790
791 Copyright 2002 AUTHORS, all rights reserved.
792
793 This library is free software; you can redistribute it
794 and/or modify it under the same terms as Perl itself.
795
796 =cut
797
798 1; # $Date: 2002/12/14 11:02:25 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24