/[suikacvs]/perl/lib/Encode/ISO2022.pm
Suika

Contents of /perl/lib/Encode/ISO2022.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (hide annotations) (download)
Sat Sep 21 01:34:08 2002 UTC (22 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.5: +63 -92 lines
2002-09-21  Wakaba <w@suika.fam.cx>

	* ISO2022.pm:
	- More DOCS support.
	- (new_object): Redefined as alias of Encode::Charset's.
	- (pod:ENCODINGS): New section.
	- Regists 'iso/iec2022' as encoding name.
	* Charset.pm:
	- (new_object): Moved from Encode::ISO2022.
	- (make_initial_coding_system): Define 'Csjis' coding system.

1 wakaba 1.1
2     =head1 NAME
3    
4     Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder
5    
6 wakaba 1.6 =head1 ENCODINGS
7    
8     =over 4
9    
10     =item iso2022
11    
12     ISO/IEC 2022:1994. Default status is:
13    
14     =over 2
15    
16     =item CL = C0 = ISO/IEC 6429:1991 C0 set
17    
18     =item CR = C1 = ISO/IEC 6429:1991 C1 set
19    
20     =item GL = G0 = ISO/IEC 646:1991 IRV GL(G0) set
21    
22     =item GR = G1 = empty set
23    
24     =item G2 = empty set
25    
26     =item G3 = empty set
27    
28     =back
29    
30     (Alias: iso/iec2022, iso-2022, 2022, cp2022)
31    
32     =back
33    
34     Note that ISO/IEC 2022 based encodings are found in
35     Encode::ISO2022::* modules. This module, Encode::ISO2022
36     only provides a general ISO/IEC 2022 encoder/decoder.
37    
38 wakaba 1.1 =cut
39    
40     require v5.7.3;
41     package Encode::ISO2022;
42     use strict;
43 wakaba 1.5 use vars qw(%CHARSET %CODING_SYSTEM $VERSION);
44 wakaba 1.6 $VERSION=do{my @r=(q$Revision: 1.5 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
45 wakaba 1.1 use base qw(Encode::Encoding);
46 wakaba 1.6 __PACKAGE__->Define (qw!iso-2022 iso/iec2022 iso2022 2022 cp2022!);
47 wakaba 1.5 require Encode::Charset;
48     *CHARSET = \%Encode::Charset::CHARSET;
49     *CODING_SYSTEM = \%Encode::Charset::CODING_SYSTEM;
50 wakaba 1.1
51     ### --- Intialization
52    
53     my %_CHARS_to_RANGE = (
54     l94 => q/[\x21-\x7E]/,
55     l96 => q/[\x20-\x7F]/,
56     l128 => q/[\x00-\x7F]/,
57     l256 => q/[\x00-\xFF]/,
58     r94 => q/[\xA1-\xFE]/,
59     r96 => q/[\xA0-\xFF]/,
60     r128 => q/[\x80-\xFF]/,
61     r256 => q/[\x80-\xFF]/,
62     b94 => q/[\x21-\x7E\xA1-\xFE]/,
63     b96 => q/[\x20-\x7F\xA0-\xFF]/,
64     b128 => q/[\x00-\xFF]/,
65     b256 => q/[\x00-\xFF]/,
66     );
67    
68     ### --- Perl Encode module common functions
69    
70     sub encode ($$;$) {
71     my ($obj, $str, $chk) = @_;
72     $_[1] = '' if $chk;
73     $str = &internal_to_iso2022 ($str);
74     return $str;
75     }
76    
77     sub decode ($$;$) {
78     my ($obj, $str, $chk) = @_;
79     $_[1] = '' if $chk;
80     return &iso2022_to_internal ($str);
81     }
82    
83     ### --- Encode::ISO2022 unique functions
84 wakaba 1.6 *new_object = \&Encode::Charset::new_object;
85 wakaba 1.1
86     sub iso2022_to_internal ($;\%) {
87     my ($s, $C) = @_;
88 wakaba 1.6 $C ||= &new_object;
89 wakaba 1.5 my $t = '';
90     $s =~ s{
91     ^((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
92     }{
93     my $i2 = $1;
94     $t = _iso2022_to_internal ($i2, $C);
95     '';
96     }gesx;
97 wakaba 1.6 my $pad = '';
98     use re 'eval';
99 wakaba 1.5 $s =~ s{
100     ## ISO/IEC 2022
101 wakaba 1.6 (??{"$pad\x1B$pad\x25$pad\x40"})((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
102 wakaba 1.5 ## UTF-8
103 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad(?:\x47|\x2F$pad"."[\x47-\x49])"})
104     ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
105 wakaba 1.5 ## UCS-2, UTF-16
106 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x40\x43\x45\x4A-\x4C])
107     ((?:(?!\x00\x1B\x00\x25(?:\x00\x2F)?\x00[\x30-\x7E])..)*)
108 wakaba 1.5 ## UCS-4
109 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})[\x41\x44\x46]
110     ((?:(?!\x00\x00\x00\x1B\x00\x00\x00\x25(?:\x00\x00\x00\x2F)?
111     \x00\x00\x00[\x30-\x7E])....)*)
112 wakaba 1.5 ## with standard return
113 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad"})([\x30-\x7E])
114     ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
115 wakaba 1.5 ## without standard return
116 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x30-\x7E])(.*)
117 wakaba 1.5 }{
118     my ($i2,$u8,$Fu2,$u2,$u4,$Fsr,$sr,$Fnsr,$nsr) = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
119     my $r = '';
120     if (defined $i2) {
121 wakaba 1.6 $r = _iso2022_to_internal ($i2, $C); $pad = '';
122 wakaba 1.5 } elsif (defined $u8) {
123 wakaba 1.6 $r = Encode::decode ('utf8', $u8); $pad = '';
124 wakaba 1.5 } elsif ($Fu2) {
125     if (ord ($Fu2) > 0x49) {
126     $r = Encode::decode ('utf-16be', $u2);
127     } else {
128     $r = Encode::decode ('ucs-2be', $u2);
129     }
130 wakaba 1.6 $pad = "\x00";
131 wakaba 1.5 } elsif (defined $u4) {
132 wakaba 1.6 $r = Encode::decode ('ucs-4be', $u2); $pad = "\x00\x00\x00";
133     } elsif (defined $Fsr && $CODING_SYSTEM{$Fsr}->{perl_name}) {
134     $r = Encode::decode ($CODING_SYSTEM{$Fsr}->{perl_name}, $sr); $pad = '';
135     } elsif (defined $Fnsr && $CODING_SYSTEM{$Fnsr}->{perl_name}) {
136     $r = Encode::decode ($CODING_SYSTEM{$Fnsr}->{perl_name}, $nsr); $pad = '';
137 wakaba 1.5 } else { ## temporary
138 wakaba 1.6 $r = '?' x length ($sr.$nsr); $pad = '';
139 wakaba 1.5 }
140     $r;
141     }gesx;
142     $t . $s;
143     }
144    
145     sub _iso2022_to_internal ($;\%) {
146     my ($s, $C) = @_;
147 wakaba 1.1 my %_GB_to_GN = (
148     "\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3',
149     "\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3',
150     );
151    
152     use re 'eval';
153     $s =~ s{
154     ((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}}
155     . qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ }))
156 wakaba 1.6 |((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GR}}->{chars}}
157 wakaba 1.1 . qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ }))
158     | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)')
159     . ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ?
160     qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '')
161     . ($C->{$C->{CL}}->{r_SS2} ? qq/|$C->{$C->{CL}}->{r_SS2}/ : '') . q/)/
162     . ( $C->{$C->{CL}}->{r_LS0}
163     ||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
164     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
165     })
166 wakaba 1.6 ((??{ $_CHARS_to_RANGE{'b'.$C->{G2}->{chars}}
167 wakaba 1.1 . qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ }))
168     | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)')
169     . ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ?
170     qq/|$C->{$C->{ESC_Fe}}->{r_SS3_ESC}/ : '')
171     . ($C->{$C->{CL}}->{r_SS3} ? qq/|$C->{$C->{CL}}->{r_SS3}/ : '') . q/)/
172     . ( $C->{$C->{CL}}->{r_LS0}
173     || $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
174     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
175     })
176 wakaba 1.6 ((??{ $_CHARS_to_RANGE{'b'.$C->{G3}->{chars}}
177 wakaba 1.1 . qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ }))
178    
179 wakaba 1.3 ## Locking shift
180 wakaba 1.6 |( (??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })
181 wakaba 1.3 |(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })
182     )
183 wakaba 1.1
184     ## Control sequence
185     |(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)')
186     .($C->{$C->{ESC_Fe}}->{r_CSI_ESC} ?
187     qq/|$C->{$C->{ESC_Fe}}->{r_CSI_ESC}/: '')
188     .')'
189     })
190     ((??{ qq/[\x30-\x3F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xB0-\xBF]*/
191     .qq/[\x20-\x2F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xA0-\xAF]*/
192     }) [\x40-\x7E\xD0-\xFE])
193    
194     ## Other escape sequence
195     |(\x1B[\x20-\x2F]*[\x30-\x7E])
196    
197     ## Misc. sequence (SP, control, or broken data)
198     |([\x00-\xFF])
199     }{
200 wakaba 1.3 my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc)
201     = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
202 wakaba 1.1 $C->{_irr} = undef unless defined $esc;
203     ## GL graphic character
204     if (defined $gl) {
205     my $c = 0;
206     my $m = $C->{$C->{GL}}->{chars}==94?0x21:$C->{$C->{GL}}->{chars}==96?0x20:0;
207     for (split //, $gl) {
208     $c = $c * $C->{$C->{GL}}->{chars} + unpack ('C', $_) - $m;
209     }
210     chr ($C->{$C->{GL}}->{ucs} + $c);
211     ## Control, SP, or broken data
212     ## TODO: support control sets other than ISO/IEC 6429's
213     } elsif (defined $misc) {
214     $misc;
215     ## GR graphic character
216     } elsif ($gr) {
217     my $c = 0;
218     my $m = $C->{$C->{GR}}->{chars}==94?0xA1:$C->{$C->{GR}}->{chars}==96?0xA0:0x80;
219     for (split //, $gr) {
220     $c = $c * $C->{$C->{GR}}->{chars} + unpack ('C', $_) - $m;
221     }
222     chr ($C->{$C->{GR}}->{ucs} + $c);
223     ## Graphic character with SS2
224     } elsif ($ss2) {
225     $ss2 =~ tr/\x80-\xFF/\x00-\x7F/;
226     my $c = 0; my $m = $C->{G2}->{chars}==94?0x21:$C->{G2}->{chars}==96?0x20:0;
227     for (split //, $ss2) {
228     $c = $c * $C->{G2}->{chars} + unpack ('C', $_) - $m;
229     }
230     chr ($C->{G2}->{ucs} + $c);
231     ## Graphic character with SS3
232     } elsif ($ss3) {
233     $ss3 =~ tr/\x80-\xFF/\x00-\x7F/;
234     my $c = 0; my $m = $C->{G3}->{chars}==94?0x21:$C->{G3}->{chars}==96?0x20:0;
235     for (split //, $ss3) {
236     $c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m;
237     }
238     chr ($C->{G3}->{ucs} + $c);
239     ## Escape sequence
240     } elsif ($esc) {
241     ## IRR (revision number)
242     if ($esc =~ /\x1B\x26([\x40-\x7E])/) {
243     $C->{_irr} = $1; $esc = '';
244     } else {
245     $esc =~ s{
246     \x1B([\x28-\x2B])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^1
247     |\x1B\x24([\x28-\x2B]?)(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^n
248    
249     |\x1B([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^1
250     |\x1B\x24([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^n
251    
252     |\x1B([\x40-\x5F]) ## ESC Fe
253    
254     |\x1B\x21([\x21-\x23]?[\x30-\x7E]) ## CL = C0
255     |\x1B\x22([\x21-\x23]?[\x30-\x7E]) ## CR & ESC Fe = C1
256    
257     |\x1B([\x60-\x7E]) ## Single control functions
258     |\x1B\x23([\x21-\x23]?)([\x30-\x7E])
259    
260     |\x1B\x20([\x40-\x7E]) ## Announcer
261     }{
262     my ($g94_g,$g94_f,$g94n_g,$g94n_f,$g96_g,$g96_f,$g96n_g,$g96n_f,$Fe,
263     $CZD, $C1D, $Fs, $sI, $sF,$ACS)
264     = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15);
265     my $rev = $C->{_irr} || '';
266     if ($g94_g) {
267     $C->{ $_GB_to_GN{ $g94_g } } = $CHARSET{G94}->{ $g94_f.$rev }
268     || $CHARSET{G94}->{ $g94_f }
269     || $CHARSET{G94}->{ "\x7E" }; '';
270     } elsif (defined $g94n_f) {
271     $C->{ $_GB_to_GN{ $g94n_g } || 'G0' } = $CHARSET{G94n}->{ $g94n_f.$rev }
272     || $CHARSET{G94n}->{ $g94n_f }
273     || $CHARSET{G94n}->{ "\x7E" }; '';
274     } elsif ($g96_g) {
275     $C->{ $_GB_to_GN{ $g96_g } } = $CHARSET{G96}->{ $g96_f.$rev }
276     || $CHARSET{G96}->{ $g96_f }
277     || $CHARSET{G96}->{ "\x7E" }; '';
278     } elsif (defined $g96n_f) {
279     $C->{ $_GB_to_GN{ $g96n_g } } = $CHARSET{G96n}->{ $g96n_f.$rev }
280     || $CHARSET{G96n}->{ $g96n_f }
281     || $CHARSET{G96n}->{ "\x7E" }; '';
282     } elsif ($Fe) { ## ESC Fe => C1
283     chr ($C->{ $C->{ESC_Fe} }->{ucs} + (ord ($Fe) - 0x40));
284     } elsif ($CZD) {
285     $C->{C0} = $CHARSET{C0}->{ $CZD.$rev }
286     || $CHARSET{C0}->{ $CZD } || $CHARSET{C0}->{ "\x7E" }; '';
287     } elsif ($C1D) {
288     $C->{C1} = $CHARSET{C1}->{ $C1D.$rev }
289     || $CHARSET{C1}->{ $C1D } || $CHARSET{C1}->{ "\x7E" }; '';
290     } elsif ($Fs) {
291     if ($Fs eq "\x6E") { ## LS2
292     $C->{GL} = 'G2'; '';
293     } elsif ($Fs eq "\x6F") { ## LS3
294     $C->{GL} = 'G3'; '';
295     } elsif ($Fs eq "\x7E") { ## LS1R
296     $C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; '';
297     } elsif ($Fs eq "\x7D") { ## LS2R
298     $C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; '';
299     } elsif ($Fs eq "\x7C") { ## LS3R
300     $C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; '';
301     } else {
302     chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60));
303     }
304     } elsif ($sI) {
305     chr ($CHARSET{single_control}->{'3F'.$sI}->{ucs} + (ord ($sF) - 0x30));
306     } elsif ($ACS) { ## Announcer
307     if ($ACS eq "\x4A") { $C->{bit} = 7 }
308     elsif ($ACS eq "\x4B") { $C->{bit} = 8 }
309     '';
310     }
311     }gex;
312     $C->{_irr} = undef;
313     }
314     $esc;
315 wakaba 1.3 ## Locking shifts
316     } elsif ($ls) {
317     if ($ls eq $C->{$C->{CL}}->{LS0}) {
318     $C->{GL} = 'G0'; '';
319     } elsif ($ls eq $C->{$C->{CL}}->{LS1}) {
320     $C->{GL} = 'G1'; '';
321     }
322 wakaba 1.1 ## Control sequence
323     } elsif ($csi) {
324     $csi =~ tr/\xA0-\xFF/\x20-\x7F/d;
325     $csi =~ s/$C->{$C->{CL}}->{LS0}//g if $C->{$C->{CL}}->{LS0};
326     $csi =~ s/$C->{$C->{CL}}->{LS1}//g if $C->{$C->{CL}}->{LS1};
327     "\x9B".$csi;
328     }
329     }gex;
330     $s;
331     }
332    
333     sub internal_to_iso2022 ($\%) {
334     my ($s, $C) = @_;
335     $C ||= &new_object;
336    
337     my $r = '';
338     for my $c (split //, $s) {
339     my $cc = ord $c;
340     my $t;
341     if ($cc <= 0x1F) {
342     $t = _i2c ($c, $C, type => 'C0', charset => '@');
343     } elsif ($cc == 0x20 || $cc == 0x7F) {
344     $t = _back2ascii ($C) . $c;
345     } elsif ($cc < 0x7F) {
346     $t = _i2g ($c, $C, type => 'G94', charset => 'B');
347     } elsif ($cc <= 0x9F) {
348     $t = _i2c ($c, $C, type => 'C1', charset_id => '64291991C1',
349     charset => $C->{private_set}->{XC1}->{'64291991C1'});
350     } elsif ($cc <= 0xFF) {
351     $t = _i2g (chr($cc-0x80), $C, type => 'G96', charset => 'A');
352     } elsif ($cc <= 0x24FF) {
353     my $c = $cc - 0x100;
354     my $final = $C->{private_set}->{U96n}->[0];
355     if (length $final) {
356     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
357     type => 'G96n', charset => $final);
358     }
359     } elsif ($cc <= 0x33FF) {
360     my $c = $cc - 0x2500;
361     my $final = $C->{private_set}->{U96n}->[1];
362     if (length $final) {
363     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
364     type => 'G96n', charset => $final);
365     }
366     } elsif (0xE000 <= $cc && $cc <= 0xFFFF) {
367     my $c = $cc - 0xE000;
368     my $final = $C->{private_set}->{U96n}->[2];
369     if (length $final) {
370     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
371     type => 'G96n', charset => $final);
372     }
373    
374     } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
375     my $c = $cc - 0xE9F6C0;
376     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
377     type => 'G94n', charset => chr(($c / 8836)+0x30));
378     } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
379     my $c = $cc - 0xF49D7C;
380     $t = _i2g (chr(($c / 94)+0x21).chr(($c % 94)+0x21), $C,
381     type => 'G94n', charset => 'B', revision => '@');
382    
383     } elsif (0xF0000 <= $cc && $cc <= 0x10F7FF) {
384     my $c = $cc - 0xF0000;
385     $t = _i2g (chr((($c % 9216) / 96)+0x20).chr(($c % 96)+0x20), $C,
386     type => 'G96n', charset => "\x20".chr(($c / 9216)+0x40));
387     } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
388     my $c = $cc - 0xE90940;
389     $t = _i2g (chr(($c % 94)+0x21), $C,
390     type => 'G94', charset => chr(($c / 94)+0x30));
391     } elsif (0xE92642 <= $cc && $cc <= 0xE9269F) {
392     my $c = $cc - 0xE92642;
393     $t = _i2g (chr($c+0x21),$C,type => 'G94', charset => "\x64", revision => '@');
394     } elsif (0xE926A0 <= $cc && $cc <= 0xE9443F) {
395     my $c = $cc - 0xE926A0;
396     $t = _i2g (chr(($c % 96)+0x20), $C,
397     type => 'G96', charset => chr(($c / 96)+0x30));
398     } elsif (0xE944A0 <= $cc && $cc <= 0xE961A1) {
399     my $c = $cc - 0xE944A0;
400     $t = _i2g (chr(($c % 94)+0x21), $C,
401     type => 'G94', charset => '!'.chr(($c / 94)+0x30));
402     } elsif (0xE96200 <= $cc && $cc <= 0xE97F9F) {
403     my $c = $cc - 0xE96200;
404     $t = _i2g (chr(($c % 96)+0x20), $C,
405     type => 'G96', charset => '!'.chr(($c / 96)+0x30));
406     } elsif (0xE98000 <= $cc && $cc <= 0xE99D01) {
407     my $c = $cc - 0xE98000;
408     $t = _i2g (chr(($c % 94)+0x21), $C,
409     type => 'G94', charset => '"'.chr(($c / 94)+0x30));
410     } elsif (0xE99D60 <= $cc && $cc <= 0xE9BAFF) {
411     my $c = $cc - 0xE99D60;
412     $t = _i2g (chr(($c % 96)+0x20), $C,
413     type => 'G96', charset => '"'.chr(($c / 96)+0x30));
414     } elsif (0xE9BB60 <= $cc && $cc <= 0xE9D861) {
415     my $c = $cc - 0xE9BB60;
416     $t = _i2g (chr(($c % 94)+0x21), $C,
417     type => 'G94', charset => '#'.chr(($c / 94)+0x30));
418     } elsif (0xE9D8C0 <= $cc && $cc <= 0xE9F65F) {
419     my $c = $cc - 0xE9D8C0;
420     $t = _i2g (chr(($c % 96)+0x20), $C,
421     type => 'G96', charset => '#'.chr(($c / 96)+0x30));
422     } elsif (0x70090940 <= $cc && $cc <= 0x70092641) {
423     my $c = $cc - 0x70090940;
424     $t = _i2g (chr(($c % 94)+0x21), $C,
425     type => 'G94', charset => "\x20".chr(($c / 94)+0x30));
426     } elsif (0x700926A0 <= $cc && $cc <= 0x7009443F) {
427     my $c = $cc - 0x700926A0;
428     $t = _i2g (chr(($c % 96)+0x20), $C,
429     type => 'G96', charset => "\x20".chr(($c / 96)+0x30));
430     ## TODO: DRCS with I byte: U+700944A0-U+7009F6BF
431     } elsif (0x70400000 <= $cc && $cc <= 0x7040FFED) {
432     my $c = $cc - 0x70400000;
433     $t = _i2g (chr(($c % 94)+0x21), $C, charset_id => 'P'.int ($c / 94),
434     type => 'G94', charset => $C->{private_set}->{G94}->[ $c / 94 ]);
435     } elsif (0x70410000 <= $cc && $cc <= 0x7041FFBF) {
436     my $c = $cc - 0x70410000;
437     $t = _i2g (chr(($c % 96)+0x20), $C, charset_id => 'P'.int ($c / 96),
438     type => 'G96', charset => $C->{private_set}->{G96}->[ $c / 96 ]);
439     } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
440     my $c = $cc % 0x10000;
441     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
442     type => 'G94n',
443 wakaba 1.2 charset_id => 'P'.int(($cc / 0x10000) - 0x7042).'_'.int($c / 8836),
444 wakaba 1.1 charset => $C->{private_set}->{G94n}->[ ($cc / 0x10000) - 0x7042 ]
445     ->[ $c / 8836 ]);
446     }
447     if (defined $t) {
448 wakaba 1.5 $t = _i2o ($t, $C, cs_F => "\x40")
449     if $C->{coding_system} ne $CODING_SYSTEM{"\x40"};
450     } else {
451     my $F; my @F = qw~G /G /H /I B /A /D /F~;
452     push @F, qw~/J /K /L~ if $cc <= 0x10FFFF;
453     push @F, qw~/@ /C /E~ if $cc <= 0xFFFF;
454     for (@F) {
455     if (defined $C->{option}->{designate_to}->{coding_system}->{$_}
456     && $C->{option}->{designate_to}->{coding_system}->{$_} > -1) {
457     $F = $_; last;
458     } elsif ($C->{option}->{designate_to}->{coding_system}->{default} > -1) {
459     $F = $_; last;
460     }
461     }
462     $t = _i2o ($c, $C, cs_F => $F) if $F;
463     }
464     if (defined $t) {
465 wakaba 1.1 $r .= $t;
466     } else {
467 wakaba 1.5 $t = _i2g ($C->{option}->{undef_char}->[0], $C,
468 wakaba 1.4 %{ $C->{option}->{undef_char}->[1] });
469 wakaba 1.5 $r .= $C->{coding_system} eq $CODING_SYSTEM{"\x40"} ?
470     $t : _i2o ($t, $C, cs_F => "\x40");
471 wakaba 1.1 }
472     }
473     $r . _back2ascii ($C);
474     }
475    
476     ## $O{charset} eq undef means that charset is same as the current designated one.
477     sub _i2c ($%%) {
478     my ($s, $C, %O) = @_;
479     my $r = '';
480     if ($O{type} eq 'C0') {
481     if (defined $O{charset}) {
482     if ( $C->{C0} ne $CHARSET{C0}->{$O{charset}}
483     && $C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
484     for ($C->{option}->{designate_to}->{C0}->{$O{charset}},
485     $C->{option}->{designate_to}->{C0}->{default}) {
486     if (defined $_) { return undef if $_ == -1; last }
487     }
488     $r .= "\x1B\x21".$O{charset};
489     $C->{C0} = $CHARSET{C0}->{$O{charset}};
490     }
491     } elsif (defined $O{charset_id}) {
492     if ($C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
493     return undef; ## Control set is not designated nor has F byte
494     }
495     }
496     $r .= _back2ascii ($C, reset_all => $C->{C0}->{reset_all}->{$s});
497     return $r . $s;
498     } elsif ($O{type} eq 'C1') {
499     if (defined $O{charset}) {
500     if ( $C->{C1} ne $CHARSET{C1}->{$O{charset}}
501     && $C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
502     for ($C->{option}->{designate_to}->{C1}->{$O{charset}},
503     $C->{option}->{designate_to}->{C1}->{default}) {
504     if (defined $_) { return undef if $_ == -1; last }
505     }
506     $r .= "\x1B\x22".$O{charset};
507     $C->{C1} = $CHARSET{C1}->{$O{charset}};
508     }
509     } elsif (defined $O{charset_id}) {
510     if ($C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
511     return undef; ## Control set is not designated nor has F byte
512     }
513     }
514     $r .= _back2ascii ($C, reset_all => $C->{C1}->{reset_all}->{$s});
515     unless ($C->{option}->{C1invoke_to_right}) { ## ESC Fe
516     $s =~ s/([\x80-\x9F])/"\x1B" . chr (ord ($1) - 0x40)/ge;
517     }
518     return $r . $s;
519     }
520     }
521     sub _i2g ($%%) {
522     my ($s, $C, %O) = @_;
523     my $r = '';
524 wakaba 1.4 my $set = $CHARSET{$O{type}}->{$O{charset}.
525     ($O{revision}&&$C->{option}->{use_revision}?$O{revision}:'')};
526 wakaba 1.1 my $set0 = $CHARSET{$O{type}}->{$O{charset_id}};
527     ## -- designate character set
528     my $G = 0;
529     if ($C->{G0} eq $set || $C->{G0} eq $set0) { $G = 0 }
530     elsif ($C->{G1} eq $set || $C->{G1} eq $set0) { $G = 1 }
531     elsif ($C->{G2} eq $set || $C->{G2} eq $set0) { $G = 2 }
532     elsif ($C->{G3} eq $set || $C->{G3} eq $set0) { $G = 3 }
533     else {
534     return undef unless $set; ## charset does not have F byte
535     $G = 1 if $O{type} eq 'G96' || $O{type} eq 'G96n';
536     for ($C->{option}->{designate_to}->{$O{type}}->{$O{charset}},
537     $C->{option}->{designate_to}->{$O{type}}->{default}) {
538     if (defined $_) {
539     $G = $_; last;
540     }
541     }
542     if ($G == -1) {
543     return undef;
544     }
545     if ($O{type} eq 'G94') {
546     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
547     ."\x1B".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
548     } elsif ($O{type} eq 'G94n') {
549     if ($G == 0 && !$C->{option}->{G94n_designate_long}
550     && ($O{charset} eq '@' || $O{charset} eq 'A' || $O{charset} eq 'B')) {
551     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
552     ."\x1B\x24".$O{charset};
553     } else {
554     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
555     ."\x1B\x24".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
556     }
557     } elsif ($O{type} eq 'G96') {
558     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
559     ."\x1B".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
560     } elsif ($O{type} eq 'G96n') {
561     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
562     ."\x1B\x24".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
563     }
564     $C->{'G'.$G} = $CHARSET{$O{type}}->{$O{charset}};
565     }
566     ## -- invoke G buffer
567     my $left = $C->{option}->{Ginvoke_to_left}->[$G];
568     if ($C->{GL} eq 'G'.$G) {
569     $left = 1;
570     } elsif ($C->{GR} eq 'G'.$G) {
571     $left = 0;
572     } else {
573     if ($C->{option}->{Ginvoke_by_single_shift}->[$G]) {
574     if ($C->{C1}->{'C_SS'.$G}) {
575     $r .= _i2c ($C->{C1}->{'C_SS'.$G}, $C, type => 'C1') || return undef;
576     } elsif ($C->{C0}->{'C_SS'.$G}) {
577     $r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef;
578     } else { ## Both C0 and C1 set do not have SS2/3.
579 wakaba 1.4 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
580 wakaba 1.1 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
581     }
582     } else {
583     $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
584     $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
585     }
586     }
587     $s =~ tr/\x00-\x7F/\x80-\xFF/ unless $left;
588     $r . $s;
589     }
590     sub _back2ascii (%) {
591     my ($C, %O) = @_;
592     my $r = '';
593     if ($C->{option}->{reset}->{Gdesignation}) {
594     my $F = $C->{option}->{reset}->{Gdesignation}; # \x42
595     $r .= "\x1B\x28".$F unless $C->{G0} eq $CHARSET{G94}->{$F};
596     $C->{G0} = $CHARSET{G94}->{$F};
597     if ($O{reset_all}) {
598     $C->{G1} = $CHARSET{G94}->{"\x7E"};
599     $C->{G2} = $CHARSET{G94}->{"\x7E"};
600     $C->{G3} = $CHARSET{G94}->{"\x7E"};
601     }
602     }
603     if ($C->{option}->{reset}->{Ginvoke}) {
604     if ($C->{GL} ne 'G0') {
605     $r .= $C->{C0}->{C_LS0} || ($C->{C0} = $CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F");
606     $C->{GL} = 'G0';
607     }
608     $C->{GR} = undef if $O{reset_all};
609     }
610     $r;
611     }
612     ## __invoke (\%C, $G, $left_or_right)
613     sub __invoke (\%$$) {
614     my ($C, $G) = @_;
615     if ($_[2]) {
616     $C->{GL} = 'G'.$G;
617     return ($C->{C0}->{C_LS0}
618     || scalar ($C->{C0}=$CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F"),
619     $C->{C0}->{C_LS1}, "\x1B\x6E", "\x1B\x6F")[$G];
620     } else {
621     $C->{GR} = 'G'.$G;
622     return ("", "\x1B\x7E", "\x1B\x7D", "\x1B\x7C")[$G];
623     }
624     '';
625     }
626 wakaba 1.5 sub _i2o ($\%%) {
627     my ($s, $C, %O) = @_;
628     my $CS = $CODING_SYSTEM{ $O{cs_F} } || $CODING_SYSTEM{ $O{cs_id} } || return undef;
629     my $r = '';
630     if ($CS ne $C->{coding_system}) {
631     my $e = '';
632     $e .= "\x1B\x25";
633     $e .= $O{cs_F} || $C->{private_set}->{coding_system}->{ $O{cs_id} }
634     || return undef;
635     if ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x40"}
636     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x43"}
637     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x45"}
638     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4A"}
639     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4B"}
640     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4C"}) {
641     $e =~ s/(.)/\x00$1/go;
642     } elsif ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x41"}
643     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x44"}
644     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x46"}) {
645     $e =~ s/(.)/\x00\x00\x00$1/go;
646     }
647     $r .= $e;
648     $C->{coding_system} = $CS;
649     if ($CS->{reset_state}) {
650     $C->{GL} = undef; $C->{GR} = undef;
651     $C->{C0} = $CHARSET{C0}->{"\x7E"};
652     $C->{C1} = $CHARSET{C1}->{"\x7E"};
653     $C->{G0} = $CHARSET{G94}->{"\x7E"};
654     $C->{G1} = $CHARSET{G94}->{"\x7E"};
655     $C->{G2} = $CHARSET{G94}->{"\x7E"};
656     $C->{G3} = $CHARSET{G94}->{"\x7E"};
657     }
658     }
659     if ($CS eq $CODING_SYSTEM{"\x40"}) {
660     #
661     } elsif ($CS eq $CODING_SYSTEM{G} || $CS eq $CODING_SYSTEM{'/G'}
662     || $CS eq $CODING_SYSTEM{'/H'} || $CS eq $CODING_SYSTEM{'/I'}) {
663     Encode::_utf8_off ($s);
664     } elsif ($CS eq $CODING_SYSTEM{'/@'} || $CS eq $CODING_SYSTEM{'/C'}
665     || $CS eq $CODING_SYSTEM{'/E'}) {
666     $s = Encode::encode ('ucs-2be', $s);
667     } elsif ($CS eq $CODING_SYSTEM{'/A'} || $CS eq $CODING_SYSTEM{'/D'}
668     || $CS eq $CODING_SYSTEM{'/F'}) {
669     $s = Encode::encode ('ucs-4be', $s);
670     } elsif ($CS eq $CODING_SYSTEM{'/J'} || $CS eq $CODING_SYSTEM{'/K'}
671     || $CS eq $CODING_SYSTEM{'/L'}) {
672     $s = Encode::encode ('UTF-16BE', $s);
673     } elsif ($CS eq $CODING_SYSTEM{B}) {
674     $s = Encode::encode ('utf-1', $s);
675     } else {
676     return undef;
677     }
678     $r . $s;
679 wakaba 1.1 }
680    
681     1;
682     __END__
683    
684     =head1 SEE ALSO
685    
686     ISO/IEC 646:1991, "7-bit coded graphic character set for intormation interchange".
687    
688     ISO/IEC 2022:1994, "Character Code Structure and Extension Techniques".
689     (IDT with ECMA 35, JIS X 0202:1998)
690    
691     ISO/IEC 4873:1991, "8-Bit Coded Character Set Structure and Rules".
692     (IDT with ECMA 43)
693    
694     ISO/IEC 6429:1992, "Control Functions for Coded Character Sets".
695     (IDT with ECMA 48:1991, JIS X 0211:1998)
696    
697     ISO/IEC 8859, "8-Bit Single-Byte Coded Graphic Character Sets".
698    
699     Encode, perlunicode
700    
701 wakaba 1.4 =head1 TODO
702    
703     =over 4
704    
705     =item NCR (coding system other than ISO/IEC 2022) support
706    
707     =over 2
708    
709     =item ESC 02/05 02/15 03/x of X Compound Text
710    
711     =back
712    
713     =item Output of control character sets, single control functions
714    
715     =item Designation sequence of control character sets (input)
716    
717     =item Special graphic character sets such as G3 of EUC-TW
718    
719     =item SUPER SHIFT (SS) invoke function of old control character set
720    
721     =item Safe transparent of control string (ISO/IEC 6429)
722    
723     =item Output of unoutputable characters as alternative notation such as SGML-like entity
724    
725     =item C0 set invoked to CR area like ISIRI code
726    
727     Really need?
728    
729     =item special treatment of 0x20, 0x7E, 0xA0, 0xFF
730    
731     For example, GB mongolian sets use MSP (MONGOLIAN SPACE)
732     with these code positions.
733    
734     And, no less coding systems does not use (or does ban using) DEL.
735    
736     =item A lot of character sets don't have pseudo-UCS mapping.
737    
738     Most of 9m^n (n >= 3) sets, 9m^n sets with I byte, 9m^n
739     DRCSes do not have pseudo-UCS mapping area. It is
740     questionable to allocate lots of code positions to these
741     rarely-(or no-)used character sets.
742    
743     =item Even character sets that have pseudo-UCS mapping, some of them can't be outputed in ISO/IEC 2022.
744    
745     Because output of rarely-used character sets is
746     not implemented yet.
747    
748     =back
749    
750 wakaba 1.5 =head1 AUTHORS
751    
752     Nanashi-san
753    
754     Wakaba <w@suika.fam.cx>
755    
756 wakaba 1.1 =head1 LICENSE
757    
758 wakaba 1.5 Copyright 2002 AUTHORS
759 wakaba 1.1
760     This library is free software; you can redistribute it
761     and/or modify it under the same terms as Perl itself.
762    
763     =cut
764    
765 wakaba 1.6 # $Date: 2002/09/20 14:01:45 $
766 wakaba 1.1 ### ISO2022.pm ends here

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24