/[suikacvs]/perl/lib/Encode/ISO2022.pm
Suika

Contents of /perl/lib/Encode/ISO2022.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (hide annotations) (download)
Mon Oct 14 06:58:35 2002 UTC (22 years ago) by wakaba
Branch: MAIN
Changes since 1.8: +32 -29 lines
2002-10-14  Nanashi-san

	* ISO2022.pm, SJIS.pm: Bug fix of utf8 flag control.
	(Committed by Wakaba <w@suika.fam.cx>.)

1 wakaba 1.1
2     =head1 NAME
3    
4     Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder
5    
6 wakaba 1.6 =head1 ENCODINGS
7    
8     =over 4
9    
10     =item iso2022
11    
12     ISO/IEC 2022:1994. Default status is:
13    
14     =over 2
15    
16     =item CL = C0 = ISO/IEC 6429:1991 C0 set
17    
18     =item CR = C1 = ISO/IEC 6429:1991 C1 set
19    
20     =item GL = G0 = ISO/IEC 646:1991 IRV GL(G0) set
21    
22     =item GR = G1 = empty set
23    
24     =item G2 = empty set
25    
26     =item G3 = empty set
27    
28     =back
29    
30     (Alias: iso/iec2022, iso-2022, 2022, cp2022)
31    
32     =back
33    
34     Note that ISO/IEC 2022 based encodings are found in
35     Encode::ISO2022::* modules. This module, Encode::ISO2022
36     only provides a general ISO/IEC 2022 encoder/decoder.
37    
38 wakaba 1.1 =cut
39    
40     require v5.7.3;
41     package Encode::ISO2022;
42     use strict;
43 wakaba 1.5 use vars qw(%CHARSET %CODING_SYSTEM $VERSION);
44 wakaba 1.9 $VERSION=do{my @r=(q$Revision: 1.8 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
45 wakaba 1.1 use base qw(Encode::Encoding);
46 wakaba 1.6 __PACKAGE__->Define (qw!iso-2022 iso/iec2022 iso2022 2022 cp2022!);
47 wakaba 1.5 require Encode::Charset;
48     *CHARSET = \%Encode::Charset::CHARSET;
49     *CODING_SYSTEM = \%Encode::Charset::CODING_SYSTEM;
50 wakaba 1.1
51     ### --- Perl Encode module common functions
52    
53     sub encode ($$;$) {
54     my ($obj, $str, $chk) = @_;
55     $_[1] = '' if $chk;
56     $str = &internal_to_iso2022 ($str);
57     return $str;
58     }
59    
60     sub decode ($$;$) {
61     my ($obj, $str, $chk) = @_;
62     $_[1] = '' if $chk;
63     return &iso2022_to_internal ($str);
64     }
65    
66     ### --- Encode::ISO2022 unique functions
67 wakaba 1.6 *new_object = \&Encode::Charset::new_object;
68 wakaba 1.1
69     sub iso2022_to_internal ($;\%) {
70     my ($s, $C) = @_;
71 wakaba 1.6 $C ||= &new_object;
72 wakaba 1.5 my $t = '';
73 wakaba 1.9 $s =~ s{^((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)}{
74 wakaba 1.5 my $i2 = $1;
75     $t = _iso2022_to_internal ($i2, $C);
76     '';
77 wakaba 1.9 }es;
78 wakaba 1.6 my $pad = '';
79     use re 'eval';
80 wakaba 1.5 $s =~ s{
81     ## ISO/IEC 2022
82 wakaba 1.6 (??{"$pad\x1B$pad\x25$pad\x40"})((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
83 wakaba 1.5 ## UTF-8
84 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad(?:\x47|\x2F$pad"."[\x47-\x49])"})
85     ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
86 wakaba 1.5 ## UCS-2, UTF-16
87 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x40\x43\x45\x4A-\x4C])
88     ((?:(?!\x00\x1B\x00\x25(?:\x00\x2F)?\x00[\x30-\x7E])..)*)
89 wakaba 1.5 ## UCS-4
90 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})[\x41\x44\x46]
91     ((?:(?!\x00\x00\x00\x1B\x00\x00\x00\x25(?:\x00\x00\x00\x2F)?
92     \x00\x00\x00[\x30-\x7E])....)*)
93 wakaba 1.5 ## with standard return
94 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad"})([\x30-\x7E])
95     ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
96 wakaba 1.5 ## without standard return
97 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x30-\x7E])(.*)
98 wakaba 1.5 }{
99     my ($i2,$u8,$Fu2,$u2,$u4,$Fsr,$sr,$Fnsr,$nsr) = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
100     my $r = '';
101     if (defined $i2) {
102 wakaba 1.6 $r = _iso2022_to_internal ($i2, $C); $pad = '';
103 wakaba 1.5 } elsif (defined $u8) {
104 wakaba 1.6 $r = Encode::decode ('utf8', $u8); $pad = '';
105 wakaba 1.5 } elsif ($Fu2) {
106     if (ord ($Fu2) > 0x49) {
107     $r = Encode::decode ('utf-16be', $u2);
108     } else {
109     $r = Encode::decode ('ucs-2be', $u2);
110     }
111 wakaba 1.6 $pad = "\x00";
112 wakaba 1.5 } elsif (defined $u4) {
113 wakaba 1.6 $r = Encode::decode ('ucs-4be', $u2); $pad = "\x00\x00\x00";
114     } elsif (defined $Fsr && $CODING_SYSTEM{$Fsr}->{perl_name}) {
115     $r = Encode::decode ($CODING_SYSTEM{$Fsr}->{perl_name}, $sr); $pad = '';
116     } elsif (defined $Fnsr && $CODING_SYSTEM{$Fnsr}->{perl_name}) {
117     $r = Encode::decode ($CODING_SYSTEM{$Fnsr}->{perl_name}, $nsr); $pad = '';
118 wakaba 1.5 } else { ## temporary
119 wakaba 1.6 $r = '?' x length ($sr.$nsr); $pad = '';
120 wakaba 1.5 }
121     $r;
122     }gesx;
123     $t . $s;
124     }
125    
126 wakaba 1.9 # this is very very trickey. my perl 5.8.0 does not process
127     # regex with eval except the first time (i think it's a bug
128     # of perl), so we redefine this function whenever being called!
129     # when this unexpected behavior is fixed or someone finds
130     # better way to avoid it, we will rewrite this code.
131     &_iso2022_to_internal (undef);
132 wakaba 1.5 sub _iso2022_to_internal ($;\%) {
133 wakaba 1.9 eval q{ sub __iso2022_to_internal ($;\%) { 0 } };
134     eval q{
135     sub __iso2022_to_internal ($;\%) {
136     use re 'eval';
137 wakaba 1.5 my ($s, $C) = @_;
138 wakaba 1.1 my %_GB_to_GN = (
139     "\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3',
140     "\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3',
141     );
142 wakaba 1.9 my %_CHARS_to_RANGE = (
143     l94 => q/[\x21-\x7E]/, l96 => q/[\x20-\x7F]/,
144     l128 => q/[\x00-\x7F]/, l256 => q/[\x00-\xFF]/,
145     r94 => q/[\xA1-\xFE]/, r96 => q/[\xA0-\xFF]/,
146     r128 => q/[\x80-\xFF]/, r256 => q/[\x80-\xFF]/,
147     b94 => q/[\x21-\x7E\xA1-\xFE]/, b96 => q/[\x20-\x7F\xA0-\xFF]/,
148     b128 => q/[\x00-\xFF]/, b256 => q/[\x00-\xFF]/,
149     );
150 wakaba 1.1
151     $s =~ s{
152     ((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}}
153     . qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ }))
154 wakaba 1.6 |((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GR}}->{chars}}
155 wakaba 1.9 . qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ }))
156 wakaba 1.1 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)')
157     . ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ?
158     qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '')
159     . ($C->{$C->{CL}}->{r_SS2} ? qq/|$C->{$C->{CL}}->{r_SS2}/ : '') . q/)/
160     . ( $C->{$C->{CL}}->{r_LS0}
161     ||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
162     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
163     })
164 wakaba 1.6 ((??{ $_CHARS_to_RANGE{'b'.$C->{G2}->{chars}}
165 wakaba 1.9 . qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ }))
166 wakaba 1.1 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)')
167     . ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ?
168     qq/|$C->{$C->{ESC_Fe}}->{r_SS3_ESC}/ : '')
169     . ($C->{$C->{CL}}->{r_SS3} ? qq/|$C->{$C->{CL}}->{r_SS3}/ : '') . q/)/
170     . ( $C->{$C->{CL}}->{r_LS0}
171     || $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
172     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
173     })
174 wakaba 1.6 ((??{ $_CHARS_to_RANGE{'b'.$C->{G3}->{chars}}
175 wakaba 1.1 . qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ }))
176    
177 wakaba 1.3 ## Locking shift
178 wakaba 1.6 |( (??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })
179 wakaba 1.3 |(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })
180     )
181 wakaba 1.1
182     ## Control sequence
183     |(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)')
184     .($C->{$C->{ESC_Fe}}->{r_CSI_ESC} ?
185     qq/|$C->{$C->{ESC_Fe}}->{r_CSI_ESC}/: '')
186     .')'
187     })
188     ((??{ qq/[\x30-\x3F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xB0-\xBF]*/
189     .qq/[\x20-\x2F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xA0-\xAF]*/
190     }) [\x40-\x7E\xD0-\xFE])
191    
192     ## Other escape sequence
193     |(\x1B[\x20-\x2F]*[\x30-\x7E])
194    
195     ## Misc. sequence (SP, control, or broken data)
196     |([\x00-\xFF])
197     }{
198 wakaba 1.3 my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc)
199     = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
200 wakaba 1.1 $C->{_irr} = undef unless defined $esc;
201     ## GL graphic character
202     if (defined $gl) {
203     my $c = 0;
204     my $m = $C->{$C->{GL}}->{chars}==94?0x21:$C->{$C->{GL}}->{chars}==96?0x20:0;
205     for (split //, $gl) {
206     $c = $c * $C->{$C->{GL}}->{chars} + unpack ('C', $_) - $m;
207     }
208     chr ($C->{$C->{GL}}->{ucs} + $c);
209     ## Control, SP, or broken data
210     ## TODO: support control sets other than ISO/IEC 6429's
211     } elsif (defined $misc) {
212     $misc;
213     ## GR graphic character
214     } elsif ($gr) {
215     my $c = 0;
216     my $m = $C->{$C->{GR}}->{chars}==94?0xA1:$C->{$C->{GR}}->{chars}==96?0xA0:0x80;
217     for (split //, $gr) {
218     $c = $c * $C->{$C->{GR}}->{chars} + unpack ('C', $_) - $m;
219     }
220     chr ($C->{$C->{GR}}->{ucs} + $c);
221     ## Graphic character with SS2
222     } elsif ($ss2) {
223     $ss2 =~ tr/\x80-\xFF/\x00-\x7F/;
224     my $c = 0; my $m = $C->{G2}->{chars}==94?0x21:$C->{G2}->{chars}==96?0x20:0;
225     for (split //, $ss2) {
226     $c = $c * $C->{G2}->{chars} + unpack ('C', $_) - $m;
227     }
228     chr ($C->{G2}->{ucs} + $c);
229     ## Graphic character with SS3
230     } elsif ($ss3) {
231     $ss3 =~ tr/\x80-\xFF/\x00-\x7F/;
232     my $c = 0; my $m = $C->{G3}->{chars}==94?0x21:$C->{G3}->{chars}==96?0x20:0;
233     for (split //, $ss3) {
234     $c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m;
235     }
236     chr ($C->{G3}->{ucs} + $c);
237     ## Escape sequence
238     } elsif ($esc) {
239     ## IRR (revision number)
240     if ($esc =~ /\x1B\x26([\x40-\x7E])/) {
241     $C->{_irr} = $1; $esc = '';
242     } else {
243     $esc =~ s{
244     \x1B([\x28-\x2B])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^1
245     |\x1B\x24([\x28-\x2B]?)(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^n
246    
247     |\x1B([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^1
248     |\x1B\x24([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^n
249    
250     |\x1B([\x40-\x5F]) ## ESC Fe
251    
252     |\x1B\x21([\x21-\x23]?[\x30-\x7E]) ## CL = C0
253     |\x1B\x22([\x21-\x23]?[\x30-\x7E]) ## CR & ESC Fe = C1
254    
255     |\x1B([\x60-\x7E]) ## Single control functions
256     |\x1B\x23([\x21-\x23]?)([\x30-\x7E])
257    
258     |\x1B\x20([\x40-\x7E]) ## Announcer
259     }{
260     my ($g94_g,$g94_f,$g94n_g,$g94n_f,$g96_g,$g96_f,$g96n_g,$g96n_f,$Fe,
261     $CZD, $C1D, $Fs, $sI, $sF,$ACS)
262     = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15);
263     my $rev = $C->{_irr} || '';
264     if ($g94_g) {
265     $C->{ $_GB_to_GN{ $g94_g } } = $CHARSET{G94}->{ $g94_f.$rev }
266     || $CHARSET{G94}->{ $g94_f }
267     || $CHARSET{G94}->{ "\x7E" }; '';
268     } elsif (defined $g94n_f) {
269     $C->{ $_GB_to_GN{ $g94n_g } || 'G0' } = $CHARSET{G94n}->{ $g94n_f.$rev }
270     || $CHARSET{G94n}->{ $g94n_f }
271     || $CHARSET{G94n}->{ "\x7E" }; '';
272     } elsif ($g96_g) {
273     $C->{ $_GB_to_GN{ $g96_g } } = $CHARSET{G96}->{ $g96_f.$rev }
274     || $CHARSET{G96}->{ $g96_f }
275     || $CHARSET{G96}->{ "\x7E" }; '';
276     } elsif (defined $g96n_f) {
277     $C->{ $_GB_to_GN{ $g96n_g } } = $CHARSET{G96n}->{ $g96n_f.$rev }
278     || $CHARSET{G96n}->{ $g96n_f }
279     || $CHARSET{G96n}->{ "\x7E" }; '';
280     } elsif ($Fe) { ## ESC Fe => C1
281     chr ($C->{ $C->{ESC_Fe} }->{ucs} + (ord ($Fe) - 0x40));
282     } elsif ($CZD) {
283     $C->{C0} = $CHARSET{C0}->{ $CZD.$rev }
284     || $CHARSET{C0}->{ $CZD } || $CHARSET{C0}->{ "\x7E" }; '';
285     } elsif ($C1D) {
286     $C->{C1} = $CHARSET{C1}->{ $C1D.$rev }
287     || $CHARSET{C1}->{ $C1D } || $CHARSET{C1}->{ "\x7E" }; '';
288     } elsif ($Fs) {
289     if ($Fs eq "\x6E") { ## LS2
290     $C->{GL} = 'G2'; '';
291     } elsif ($Fs eq "\x6F") { ## LS3
292     $C->{GL} = 'G3'; '';
293 wakaba 1.7 } elsif ($Fs eq "\x7E" || $Fs eq "\x6B") { ## LS1R
294 wakaba 1.1 $C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; '';
295 wakaba 1.7 } elsif ($Fs eq "\x7D" || $Fs eq "\x6C") { ## LS2R
296 wakaba 1.1 $C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; '';
297 wakaba 1.7 } elsif ($Fs eq "\x7C" || $Fs eq "\x6D") { ## LS3R
298 wakaba 1.1 $C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; '';
299     } else {
300     chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60));
301     }
302     } elsif ($sI) {
303     chr ($CHARSET{single_control}->{'3F'.$sI}->{ucs} + (ord ($sF) - 0x30));
304     } elsif ($ACS) { ## Announcer
305     if ($ACS eq "\x4A") { $C->{bit} = 7 }
306     elsif ($ACS eq "\x4B") { $C->{bit} = 8 }
307     '';
308     }
309     }gex;
310     $C->{_irr} = undef;
311     }
312     $esc;
313 wakaba 1.3 ## Locking shifts
314     } elsif ($ls) {
315     if ($ls eq $C->{$C->{CL}}->{LS0}) {
316     $C->{GL} = 'G0'; '';
317     } elsif ($ls eq $C->{$C->{CL}}->{LS1}) {
318     $C->{GL} = 'G1'; '';
319     }
320 wakaba 1.1 ## Control sequence
321     } elsif ($csi) {
322     $csi =~ tr/\xA0-\xFF/\x20-\x7F/d;
323     $csi =~ s/$C->{$C->{CL}}->{LS0}//g if $C->{$C->{CL}}->{LS0};
324     $csi =~ s/$C->{$C->{CL}}->{LS1}//g if $C->{$C->{CL}}->{LS1};
325     "\x9B".$csi;
326     }
327 wakaba 1.9 }gesx;
328 wakaba 1.1 $s;
329 wakaba 1.9 } # __iso2022_to_internal
330    
331     };
332     &__iso2022_to_internal (@_) if defined $_[0];
333    
334     } # _iso2022_to_internal
335 wakaba 1.1
336     sub internal_to_iso2022 ($\%) {
337     my ($s, $C) = @_;
338     $C ||= &new_object;
339    
340     my $r = '';
341     for my $c (split //, $s) {
342 wakaba 1.9 my $cc = ord $c; Encode::_utf8_off ($c);
343 wakaba 1.1 my $t;
344     if ($cc <= 0x1F) {
345     $t = _i2c ($c, $C, type => 'C0', charset => '@');
346     } elsif ($cc == 0x20 || $cc == 0x7F) {
347     $t = _back2ascii ($C) . $c;
348     } elsif ($cc < 0x7F) {
349     $t = _i2g ($c, $C, type => 'G94', charset => 'B');
350     } elsif ($cc <= 0x9F) {
351     $t = _i2c ($c, $C, type => 'C1', charset_id => '64291991C1',
352 wakaba 1.8 charset => $C->{option}->{private_set}->{XC1}->{'64291991C1'});
353 wakaba 1.1 } elsif ($cc <= 0xFF) {
354     $t = _i2g (chr($cc-0x80), $C, type => 'G96', charset => 'A');
355     } elsif ($cc <= 0x24FF) {
356     my $c = $cc - 0x100;
357 wakaba 1.8 my $final = $C->{option}->{private_set}->{U96n}->[0];
358 wakaba 1.1 if (length $final) {
359     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
360     type => 'G96n', charset => $final);
361     }
362     } elsif ($cc <= 0x33FF) {
363     my $c = $cc - 0x2500;
364 wakaba 1.8 my $final = $C->{option}->{private_set}->{U96n}->[1];
365 wakaba 1.1 if (length $final) {
366     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
367     type => 'G96n', charset => $final);
368     }
369     } elsif (0xE000 <= $cc && $cc <= 0xFFFF) {
370     my $c = $cc - 0xE000;
371 wakaba 1.8 my $final = $C->{option}->{private_set}->{U96n}->[2];
372 wakaba 1.1 if (length $final) {
373     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
374     type => 'G96n', charset => $final);
375     }
376    
377     } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
378     my $c = $cc - 0xE9F6C0;
379     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
380     type => 'G94n', charset => chr(($c / 8836)+0x30));
381     } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
382     my $c = $cc - 0xF49D7C;
383     $t = _i2g (chr(($c / 94)+0x21).chr(($c % 94)+0x21), $C,
384     type => 'G94n', charset => 'B', revision => '@');
385    
386     } elsif (0xF0000 <= $cc && $cc <= 0x10F7FF) {
387     my $c = $cc - 0xF0000;
388     $t = _i2g (chr((($c % 9216) / 96)+0x20).chr(($c % 96)+0x20), $C,
389     type => 'G96n', charset => "\x20".chr(($c / 9216)+0x40));
390     } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
391     my $c = $cc - 0xE90940;
392     $t = _i2g (chr(($c % 94)+0x21), $C,
393     type => 'G94', charset => chr(($c / 94)+0x30));
394     } elsif (0xE92642 <= $cc && $cc <= 0xE9269F) {
395     my $c = $cc - 0xE92642;
396     $t = _i2g (chr($c+0x21),$C,type => 'G94', charset => "\x64", revision => '@');
397     } elsif (0xE926A0 <= $cc && $cc <= 0xE9443F) {
398     my $c = $cc - 0xE926A0;
399     $t = _i2g (chr(($c % 96)+0x20), $C,
400     type => 'G96', charset => chr(($c / 96)+0x30));
401     } elsif (0xE944A0 <= $cc && $cc <= 0xE961A1) {
402     my $c = $cc - 0xE944A0;
403     $t = _i2g (chr(($c % 94)+0x21), $C,
404     type => 'G94', charset => '!'.chr(($c / 94)+0x30));
405     } elsif (0xE96200 <= $cc && $cc <= 0xE97F9F) {
406     my $c = $cc - 0xE96200;
407     $t = _i2g (chr(($c % 96)+0x20), $C,
408     type => 'G96', charset => '!'.chr(($c / 96)+0x30));
409     } elsif (0xE98000 <= $cc && $cc <= 0xE99D01) {
410     my $c = $cc - 0xE98000;
411     $t = _i2g (chr(($c % 94)+0x21), $C,
412     type => 'G94', charset => '"'.chr(($c / 94)+0x30));
413     } elsif (0xE99D60 <= $cc && $cc <= 0xE9BAFF) {
414     my $c = $cc - 0xE99D60;
415     $t = _i2g (chr(($c % 96)+0x20), $C,
416     type => 'G96', charset => '"'.chr(($c / 96)+0x30));
417     } elsif (0xE9BB60 <= $cc && $cc <= 0xE9D861) {
418     my $c = $cc - 0xE9BB60;
419     $t = _i2g (chr(($c % 94)+0x21), $C,
420     type => 'G94', charset => '#'.chr(($c / 94)+0x30));
421     } elsif (0xE9D8C0 <= $cc && $cc <= 0xE9F65F) {
422     my $c = $cc - 0xE9D8C0;
423     $t = _i2g (chr(($c % 96)+0x20), $C,
424     type => 'G96', charset => '#'.chr(($c / 96)+0x30));
425     } elsif (0x70090940 <= $cc && $cc <= 0x70092641) {
426     my $c = $cc - 0x70090940;
427     $t = _i2g (chr(($c % 94)+0x21), $C,
428     type => 'G94', charset => "\x20".chr(($c / 94)+0x30));
429     } elsif (0x700926A0 <= $cc && $cc <= 0x7009443F) {
430     my $c = $cc - 0x700926A0;
431     $t = _i2g (chr(($c % 96)+0x20), $C,
432     type => 'G96', charset => "\x20".chr(($c / 96)+0x30));
433     ## TODO: DRCS with I byte: U+700944A0-U+7009F6BF
434     } elsif (0x70400000 <= $cc && $cc <= 0x7040FFED) {
435     my $c = $cc - 0x70400000;
436     $t = _i2g (chr(($c % 94)+0x21), $C, charset_id => 'P'.int ($c / 94),
437 wakaba 1.8 type => 'G94', charset => $C->{option}->{private_set}->{G94}->[ $c / 94 ]);
438 wakaba 1.1 } elsif (0x70410000 <= $cc && $cc <= 0x7041FFBF) {
439     my $c = $cc - 0x70410000;
440     $t = _i2g (chr(($c % 96)+0x20), $C, charset_id => 'P'.int ($c / 96),
441 wakaba 1.8 type => 'G96', charset => $C->{option}->{private_set}->{G96}->[ $c / 96 ]);
442 wakaba 1.1 } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
443     my $c = $cc % 0x10000;
444     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
445     type => 'G94n',
446 wakaba 1.2 charset_id => 'P'.int(($cc / 0x10000) - 0x7042).'_'.int($c / 8836),
447 wakaba 1.8 charset => $C->{option}->{private_set}->{G94n}
448     ->[ ($cc / 0x10000) - 0x7042 ]->[ $c / 8836 ]);
449 wakaba 1.1 }
450     if (defined $t) {
451 wakaba 1.5 $t = _i2o ($t, $C, cs_F => "\x40")
452     if $C->{coding_system} ne $CODING_SYSTEM{"\x40"};
453     } else {
454     my $F; my @F = qw~G /G /H /I B /A /D /F~;
455     push @F, qw~/J /K /L~ if $cc <= 0x10FFFF;
456     push @F, qw~/@ /C /E~ if $cc <= 0xFFFF;
457     for (@F) {
458     if (defined $C->{option}->{designate_to}->{coding_system}->{$_}
459     && $C->{option}->{designate_to}->{coding_system}->{$_} > -1) {
460     $F = $_; last;
461     } elsif ($C->{option}->{designate_to}->{coding_system}->{default} > -1) {
462     $F = $_; last;
463     }
464     }
465     $t = _i2o ($c, $C, cs_F => $F) if $F;
466     }
467     if (defined $t) {
468 wakaba 1.1 $r .= $t;
469     } else {
470 wakaba 1.7 unless ($C->{option}->{undef_char}->[0] eq "\x20") {
471     $t = _i2g ($C->{option}->{undef_char}->[0], $C,
472     %{ $C->{option}->{undef_char}->[1] });
473     } else { ## SP
474     $t = _back2ascii ($C) . "\x20";
475     }
476 wakaba 1.5 $r .= $C->{coding_system} eq $CODING_SYSTEM{"\x40"} ?
477     $t : _i2o ($t, $C, cs_F => "\x40");
478 wakaba 1.1 }
479     }
480     $r . _back2ascii ($C);
481     }
482    
483     ## $O{charset} eq undef means that charset is same as the current designated one.
484     sub _i2c ($%%) {
485     my ($s, $C, %O) = @_;
486     my $r = '';
487     if ($O{type} eq 'C0') {
488     if (defined $O{charset}) {
489     if ( $C->{C0} ne $CHARSET{C0}->{$O{charset}}
490     && $C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
491     for ($C->{option}->{designate_to}->{C0}->{$O{charset}},
492     $C->{option}->{designate_to}->{C0}->{default}) {
493     if (defined $_) { return undef if $_ == -1; last }
494     }
495     $r .= "\x1B\x21".$O{charset};
496     $C->{C0} = $CHARSET{C0}->{$O{charset}};
497     }
498     } elsif (defined $O{charset_id}) {
499     if ($C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
500     return undef; ## Control set is not designated nor has F byte
501     }
502     }
503     $r .= _back2ascii ($C, reset_all => $C->{C0}->{reset_all}->{$s});
504     return $r . $s;
505     } elsif ($O{type} eq 'C1') {
506     if (defined $O{charset}) {
507     if ( $C->{C1} ne $CHARSET{C1}->{$O{charset}}
508     && $C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
509     for ($C->{option}->{designate_to}->{C1}->{$O{charset}},
510     $C->{option}->{designate_to}->{C1}->{default}) {
511     if (defined $_) { return undef if $_ == -1; last }
512     }
513     $r .= "\x1B\x22".$O{charset};
514     $C->{C1} = $CHARSET{C1}->{$O{charset}};
515     }
516     } elsif (defined $O{charset_id}) {
517     if ($C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
518     return undef; ## Control set is not designated nor has F byte
519     }
520     }
521     $r .= _back2ascii ($C, reset_all => $C->{C1}->{reset_all}->{$s});
522     unless ($C->{option}->{C1invoke_to_right}) { ## ESC Fe
523     $s =~ s/([\x80-\x9F])/"\x1B" . chr (ord ($1) - 0x40)/ge;
524     }
525     return $r . $s;
526     }
527     }
528     sub _i2g ($%%) {
529     my ($s, $C, %O) = @_;
530     my $r = '';
531 wakaba 1.4 my $set = $CHARSET{$O{type}}->{$O{charset}.
532     ($O{revision}&&$C->{option}->{use_revision}?$O{revision}:'')};
533 wakaba 1.1 my $set0 = $CHARSET{$O{type}}->{$O{charset_id}};
534     ## -- designate character set
535     my $G = 0;
536     if ($C->{G0} eq $set || $C->{G0} eq $set0) { $G = 0 }
537     elsif ($C->{G1} eq $set || $C->{G1} eq $set0) { $G = 1 }
538     elsif ($C->{G2} eq $set || $C->{G2} eq $set0) { $G = 2 }
539     elsif ($C->{G3} eq $set || $C->{G3} eq $set0) { $G = 3 }
540     else {
541     return undef unless $set; ## charset does not have F byte
542     $G = 1 if $O{type} eq 'G96' || $O{type} eq 'G96n';
543     for ($C->{option}->{designate_to}->{$O{type}}->{$O{charset}},
544     $C->{option}->{designate_to}->{$O{type}}->{default}) {
545     if (defined $_) {
546     $G = $_; last;
547     }
548     }
549     if ($G == -1) {
550     return undef;
551     }
552     if ($O{type} eq 'G94') {
553     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
554     ."\x1B".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
555     } elsif ($O{type} eq 'G94n') {
556     if ($G == 0 && !$C->{option}->{G94n_designate_long}
557     && ($O{charset} eq '@' || $O{charset} eq 'A' || $O{charset} eq 'B')) {
558     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
559     ."\x1B\x24".$O{charset};
560     } else {
561     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
562     ."\x1B\x24".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
563     }
564     } elsif ($O{type} eq 'G96') {
565     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
566     ."\x1B".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
567     } elsif ($O{type} eq 'G96n') {
568     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
569     ."\x1B\x24".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
570     }
571     $C->{'G'.$G} = $CHARSET{$O{type}}->{$O{charset}};
572     }
573     ## -- invoke G buffer
574     my $left = $C->{option}->{Ginvoke_to_left}->[$G];
575     if ($C->{GL} eq 'G'.$G) {
576     $left = 1;
577     } elsif ($C->{GR} eq 'G'.$G) {
578     $left = 0;
579     } else {
580     if ($C->{option}->{Ginvoke_by_single_shift}->[$G]) {
581     if ($C->{C1}->{'C_SS'.$G}) {
582     $r .= _i2c ($C->{C1}->{'C_SS'.$G}, $C, type => 'C1') || return undef;
583     } elsif ($C->{C0}->{'C_SS'.$G}) {
584     $r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef;
585     } else { ## Both C0 and C1 set do not have SS2/3.
586 wakaba 1.4 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
587 wakaba 1.1 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
588     }
589     } else {
590     $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
591     $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
592     }
593     }
594     $s =~ tr/\x00-\x7F/\x80-\xFF/ unless $left;
595     $r . $s;
596     }
597     sub _back2ascii (%) {
598     my ($C, %O) = @_;
599     my $r = '';
600     if ($C->{option}->{reset}->{Gdesignation}) {
601     my $F = $C->{option}->{reset}->{Gdesignation}; # \x42
602     $r .= "\x1B\x28".$F unless $C->{G0} eq $CHARSET{G94}->{$F};
603     $C->{G0} = $CHARSET{G94}->{$F};
604     if ($O{reset_all}) {
605     $C->{G1} = $CHARSET{G94}->{"\x7E"};
606     $C->{G2} = $CHARSET{G94}->{"\x7E"};
607     $C->{G3} = $CHARSET{G94}->{"\x7E"};
608     }
609     }
610     if ($C->{option}->{reset}->{Ginvoke}) {
611     if ($C->{GL} ne 'G0') {
612     $r .= $C->{C0}->{C_LS0} || ($C->{C0} = $CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F");
613     $C->{GL} = 'G0';
614     }
615     $C->{GR} = undef if $O{reset_all};
616     }
617     $r;
618     }
619     ## __invoke (\%C, $G, $left_or_right)
620     sub __invoke (\%$$) {
621     my ($C, $G) = @_;
622     if ($_[2]) {
623     $C->{GL} = 'G'.$G;
624     return ($C->{C0}->{C_LS0}
625     || scalar ($C->{C0}=$CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F"),
626     $C->{C0}->{C_LS1}, "\x1B\x6E", "\x1B\x6F")[$G];
627     } else {
628     $C->{GR} = 'G'.$G;
629     return ("", "\x1B\x7E", "\x1B\x7D", "\x1B\x7C")[$G];
630     }
631     '';
632     }
633 wakaba 1.5 sub _i2o ($\%%) {
634     my ($s, $C, %O) = @_;
635     my $CS = $CODING_SYSTEM{ $O{cs_F} } || $CODING_SYSTEM{ $O{cs_id} } || return undef;
636     my $r = '';
637     if ($CS ne $C->{coding_system}) {
638     my $e = '';
639     $e .= "\x1B\x25";
640 wakaba 1.8 $e .= $O{cs_F} || $C->{option}->{private_set}->{coding_system}->{ $O{cs_id} }
641 wakaba 1.5 || return undef;
642     if ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x40"}
643     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x43"}
644     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x45"}
645     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4A"}
646     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4B"}
647     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4C"}) {
648     $e =~ s/(.)/\x00$1/go;
649     } elsif ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x41"}
650     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x44"}
651     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x46"}) {
652     $e =~ s/(.)/\x00\x00\x00$1/go;
653     }
654     $r .= $e;
655     $C->{coding_system} = $CS;
656     if ($CS->{reset_state}) {
657     $C->{GL} = undef; $C->{GR} = undef;
658     $C->{C0} = $CHARSET{C0}->{"\x7E"};
659     $C->{C1} = $CHARSET{C1}->{"\x7E"};
660     $C->{G0} = $CHARSET{G94}->{"\x7E"};
661     $C->{G1} = $CHARSET{G94}->{"\x7E"};
662     $C->{G2} = $CHARSET{G94}->{"\x7E"};
663     $C->{G3} = $CHARSET{G94}->{"\x7E"};
664     }
665     }
666     if ($CS eq $CODING_SYSTEM{"\x40"}) {
667     #
668     } elsif ($CS eq $CODING_SYSTEM{G} || $CS eq $CODING_SYSTEM{'/G'}
669     || $CS eq $CODING_SYSTEM{'/H'} || $CS eq $CODING_SYSTEM{'/I'}) {
670     Encode::_utf8_off ($s);
671     } elsif ($CS eq $CODING_SYSTEM{'/@'} || $CS eq $CODING_SYSTEM{'/C'}
672     || $CS eq $CODING_SYSTEM{'/E'}) {
673     $s = Encode::encode ('ucs-2be', $s);
674     } elsif ($CS eq $CODING_SYSTEM{'/A'} || $CS eq $CODING_SYSTEM{'/D'}
675     || $CS eq $CODING_SYSTEM{'/F'}) {
676     $s = Encode::encode ('ucs-4be', $s);
677     } elsif ($CS eq $CODING_SYSTEM{'/J'} || $CS eq $CODING_SYSTEM{'/K'}
678     || $CS eq $CODING_SYSTEM{'/L'}) {
679     $s = Encode::encode ('UTF-16BE', $s);
680     } elsif ($CS eq $CODING_SYSTEM{B}) {
681     $s = Encode::encode ('utf-1', $s);
682     } else {
683     return undef;
684     }
685     $r . $s;
686 wakaba 1.1 }
687    
688     1;
689     __END__
690    
691     =head1 SEE ALSO
692    
693     ISO/IEC 646:1991, "7-bit coded graphic character set for intormation interchange".
694    
695     ISO/IEC 2022:1994, "Character Code Structure and Extension Techniques".
696     (IDT with ECMA 35, JIS X 0202:1998)
697    
698     ISO/IEC 4873:1991, "8-Bit Coded Character Set Structure and Rules".
699     (IDT with ECMA 43)
700    
701     ISO/IEC 6429:1992, "Control Functions for Coded Character Sets".
702     (IDT with ECMA 48:1991, JIS X 0211:1998)
703    
704     ISO/IEC 8859, "8-Bit Single-Byte Coded Graphic Character Sets".
705    
706     Encode, perlunicode
707    
708 wakaba 1.4 =head1 TODO
709    
710     =over 4
711    
712     =item NCR (coding system other than ISO/IEC 2022) support
713    
714     =over 2
715    
716     =item ESC 02/05 02/15 03/x of X Compound Text
717    
718     =back
719    
720     =item Output of control character sets, single control functions
721    
722     =item Designation sequence of control character sets (input)
723    
724     =item Special graphic character sets such as G3 of EUC-TW
725    
726     =item SUPER SHIFT (SS) invoke function of old control character set
727    
728     =item Safe transparent of control string (ISO/IEC 6429)
729    
730     =item Output of unoutputable characters as alternative notation such as SGML-like entity
731    
732     =item C0 set invoked to CR area like ISIRI code
733    
734     Really need?
735    
736     =item special treatment of 0x20, 0x7E, 0xA0, 0xFF
737    
738     For example, GB mongolian sets use MSP (MONGOLIAN SPACE)
739     with these code positions.
740    
741     And, no less coding systems does not use (or does ban using) DEL.
742    
743     =item A lot of character sets don't have pseudo-UCS mapping.
744    
745     Most of 9m^n (n >= 3) sets, 9m^n sets with I byte, 9m^n
746     DRCSes do not have pseudo-UCS mapping area. It is
747     questionable to allocate lots of code positions to these
748     rarely-(or no-)used character sets.
749    
750     =item Even character sets that have pseudo-UCS mapping, some of them can't be outputed in ISO/IEC 2022.
751    
752     Because output of rarely-used character sets is
753     not implemented yet.
754    
755     =back
756    
757 wakaba 1.5 =head1 AUTHORS
758    
759     Nanashi-san
760    
761     Wakaba <w@suika.fam.cx>
762    
763 wakaba 1.1 =head1 LICENSE
764    
765 wakaba 1.5 Copyright 2002 AUTHORS
766 wakaba 1.1
767     This library is free software; you can redistribute it
768     and/or modify it under the same terms as Perl itself.
769    
770     =cut
771    
772 wakaba 1.9 # $Date: 2002/10/12 11:03:00 $
773 wakaba 1.1 ### ISO2022.pm ends here

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24