/[suikacvs]/perl/lib/Encode/ISO2022.pm
Suika

Contents of /perl/lib/Encode/ISO2022.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (hide annotations) (download)
Sun Sep 22 11:09:38 2002 UTC (22 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.6: +11 -7 lines
2002-09-22  Wakaba <w@suika.fam.cx>

	* ISO2022.pm (_internal_to_iso2022): Allow SP as a
	replacement character.

1 wakaba 1.1
2     =head1 NAME
3    
4     Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder
5    
6 wakaba 1.6 =head1 ENCODINGS
7    
8     =over 4
9    
10     =item iso2022
11    
12     ISO/IEC 2022:1994. Default status is:
13    
14     =over 2
15    
16     =item CL = C0 = ISO/IEC 6429:1991 C0 set
17    
18     =item CR = C1 = ISO/IEC 6429:1991 C1 set
19    
20     =item GL = G0 = ISO/IEC 646:1991 IRV GL(G0) set
21    
22     =item GR = G1 = empty set
23    
24     =item G2 = empty set
25    
26     =item G3 = empty set
27    
28     =back
29    
30     (Alias: iso/iec2022, iso-2022, 2022, cp2022)
31    
32     =back
33    
34     Note that ISO/IEC 2022 based encodings are found in
35     Encode::ISO2022::* modules. This module, Encode::ISO2022
36     only provides a general ISO/IEC 2022 encoder/decoder.
37    
38 wakaba 1.1 =cut
39    
40     require v5.7.3;
41     package Encode::ISO2022;
42     use strict;
43 wakaba 1.5 use vars qw(%CHARSET %CODING_SYSTEM $VERSION);
44 wakaba 1.7 $VERSION=do{my @r=(q$Revision: 1.6 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
45 wakaba 1.1 use base qw(Encode::Encoding);
46 wakaba 1.6 __PACKAGE__->Define (qw!iso-2022 iso/iec2022 iso2022 2022 cp2022!);
47 wakaba 1.5 require Encode::Charset;
48     *CHARSET = \%Encode::Charset::CHARSET;
49     *CODING_SYSTEM = \%Encode::Charset::CODING_SYSTEM;
50 wakaba 1.1
51     ### --- Intialization
52    
53     my %_CHARS_to_RANGE = (
54     l94 => q/[\x21-\x7E]/,
55     l96 => q/[\x20-\x7F]/,
56     l128 => q/[\x00-\x7F]/,
57     l256 => q/[\x00-\xFF]/,
58     r94 => q/[\xA1-\xFE]/,
59     r96 => q/[\xA0-\xFF]/,
60     r128 => q/[\x80-\xFF]/,
61     r256 => q/[\x80-\xFF]/,
62     b94 => q/[\x21-\x7E\xA1-\xFE]/,
63     b96 => q/[\x20-\x7F\xA0-\xFF]/,
64     b128 => q/[\x00-\xFF]/,
65     b256 => q/[\x00-\xFF]/,
66     );
67    
68     ### --- Perl Encode module common functions
69    
70     sub encode ($$;$) {
71     my ($obj, $str, $chk) = @_;
72     $_[1] = '' if $chk;
73     $str = &internal_to_iso2022 ($str);
74     return $str;
75     }
76    
77     sub decode ($$;$) {
78     my ($obj, $str, $chk) = @_;
79     $_[1] = '' if $chk;
80     return &iso2022_to_internal ($str);
81     }
82    
83     ### --- Encode::ISO2022 unique functions
84 wakaba 1.6 *new_object = \&Encode::Charset::new_object;
85 wakaba 1.1
86     sub iso2022_to_internal ($;\%) {
87     my ($s, $C) = @_;
88 wakaba 1.6 $C ||= &new_object;
89 wakaba 1.5 my $t = '';
90     $s =~ s{
91     ^((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
92     }{
93     my $i2 = $1;
94     $t = _iso2022_to_internal ($i2, $C);
95     '';
96     }gesx;
97 wakaba 1.6 my $pad = '';
98     use re 'eval';
99 wakaba 1.5 $s =~ s{
100     ## ISO/IEC 2022
101 wakaba 1.6 (??{"$pad\x1B$pad\x25$pad\x40"})((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
102 wakaba 1.5 ## UTF-8
103 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad(?:\x47|\x2F$pad"."[\x47-\x49])"})
104     ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
105 wakaba 1.5 ## UCS-2, UTF-16
106 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x40\x43\x45\x4A-\x4C])
107     ((?:(?!\x00\x1B\x00\x25(?:\x00\x2F)?\x00[\x30-\x7E])..)*)
108 wakaba 1.5 ## UCS-4
109 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})[\x41\x44\x46]
110     ((?:(?!\x00\x00\x00\x1B\x00\x00\x00\x25(?:\x00\x00\x00\x2F)?
111     \x00\x00\x00[\x30-\x7E])....)*)
112 wakaba 1.5 ## with standard return
113 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad"})([\x30-\x7E])
114     ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
115 wakaba 1.5 ## without standard return
116 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x30-\x7E])(.*)
117 wakaba 1.5 }{
118     my ($i2,$u8,$Fu2,$u2,$u4,$Fsr,$sr,$Fnsr,$nsr) = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
119     my $r = '';
120     if (defined $i2) {
121 wakaba 1.6 $r = _iso2022_to_internal ($i2, $C); $pad = '';
122 wakaba 1.5 } elsif (defined $u8) {
123 wakaba 1.6 $r = Encode::decode ('utf8', $u8); $pad = '';
124 wakaba 1.5 } elsif ($Fu2) {
125     if (ord ($Fu2) > 0x49) {
126     $r = Encode::decode ('utf-16be', $u2);
127     } else {
128     $r = Encode::decode ('ucs-2be', $u2);
129     }
130 wakaba 1.6 $pad = "\x00";
131 wakaba 1.5 } elsif (defined $u4) {
132 wakaba 1.6 $r = Encode::decode ('ucs-4be', $u2); $pad = "\x00\x00\x00";
133     } elsif (defined $Fsr && $CODING_SYSTEM{$Fsr}->{perl_name}) {
134     $r = Encode::decode ($CODING_SYSTEM{$Fsr}->{perl_name}, $sr); $pad = '';
135     } elsif (defined $Fnsr && $CODING_SYSTEM{$Fnsr}->{perl_name}) {
136     $r = Encode::decode ($CODING_SYSTEM{$Fnsr}->{perl_name}, $nsr); $pad = '';
137 wakaba 1.5 } else { ## temporary
138 wakaba 1.6 $r = '?' x length ($sr.$nsr); $pad = '';
139 wakaba 1.5 }
140     $r;
141     }gesx;
142     $t . $s;
143     }
144    
145     sub _iso2022_to_internal ($;\%) {
146     my ($s, $C) = @_;
147 wakaba 1.1 my %_GB_to_GN = (
148     "\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3',
149     "\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3',
150     );
151    
152     use re 'eval';
153     $s =~ s{
154     ((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}}
155     . qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ }))
156 wakaba 1.6 |((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GR}}->{chars}}
157 wakaba 1.1 . qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ }))
158     | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)')
159     . ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ?
160     qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '')
161     . ($C->{$C->{CL}}->{r_SS2} ? qq/|$C->{$C->{CL}}->{r_SS2}/ : '') . q/)/
162     . ( $C->{$C->{CL}}->{r_LS0}
163     ||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
164     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
165     })
166 wakaba 1.6 ((??{ $_CHARS_to_RANGE{'b'.$C->{G2}->{chars}}
167 wakaba 1.1 . qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ }))
168     | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)')
169     . ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ?
170     qq/|$C->{$C->{ESC_Fe}}->{r_SS3_ESC}/ : '')
171     . ($C->{$C->{CL}}->{r_SS3} ? qq/|$C->{$C->{CL}}->{r_SS3}/ : '') . q/)/
172     . ( $C->{$C->{CL}}->{r_LS0}
173     || $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
174     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
175     })
176 wakaba 1.6 ((??{ $_CHARS_to_RANGE{'b'.$C->{G3}->{chars}}
177 wakaba 1.1 . qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ }))
178    
179 wakaba 1.3 ## Locking shift
180 wakaba 1.6 |( (??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })
181 wakaba 1.3 |(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })
182     )
183 wakaba 1.1
184     ## Control sequence
185     |(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)')
186     .($C->{$C->{ESC_Fe}}->{r_CSI_ESC} ?
187     qq/|$C->{$C->{ESC_Fe}}->{r_CSI_ESC}/: '')
188     .')'
189     })
190     ((??{ qq/[\x30-\x3F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xB0-\xBF]*/
191     .qq/[\x20-\x2F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xA0-\xAF]*/
192     }) [\x40-\x7E\xD0-\xFE])
193    
194     ## Other escape sequence
195     |(\x1B[\x20-\x2F]*[\x30-\x7E])
196    
197     ## Misc. sequence (SP, control, or broken data)
198     |([\x00-\xFF])
199     }{
200 wakaba 1.3 my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc)
201     = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
202 wakaba 1.1 $C->{_irr} = undef unless defined $esc;
203     ## GL graphic character
204     if (defined $gl) {
205     my $c = 0;
206     my $m = $C->{$C->{GL}}->{chars}==94?0x21:$C->{$C->{GL}}->{chars}==96?0x20:0;
207     for (split //, $gl) {
208     $c = $c * $C->{$C->{GL}}->{chars} + unpack ('C', $_) - $m;
209     }
210     chr ($C->{$C->{GL}}->{ucs} + $c);
211     ## Control, SP, or broken data
212     ## TODO: support control sets other than ISO/IEC 6429's
213     } elsif (defined $misc) {
214     $misc;
215     ## GR graphic character
216     } elsif ($gr) {
217     my $c = 0;
218     my $m = $C->{$C->{GR}}->{chars}==94?0xA1:$C->{$C->{GR}}->{chars}==96?0xA0:0x80;
219     for (split //, $gr) {
220     $c = $c * $C->{$C->{GR}}->{chars} + unpack ('C', $_) - $m;
221     }
222     chr ($C->{$C->{GR}}->{ucs} + $c);
223     ## Graphic character with SS2
224     } elsif ($ss2) {
225     $ss2 =~ tr/\x80-\xFF/\x00-\x7F/;
226     my $c = 0; my $m = $C->{G2}->{chars}==94?0x21:$C->{G2}->{chars}==96?0x20:0;
227     for (split //, $ss2) {
228     $c = $c * $C->{G2}->{chars} + unpack ('C', $_) - $m;
229     }
230     chr ($C->{G2}->{ucs} + $c);
231     ## Graphic character with SS3
232     } elsif ($ss3) {
233     $ss3 =~ tr/\x80-\xFF/\x00-\x7F/;
234     my $c = 0; my $m = $C->{G3}->{chars}==94?0x21:$C->{G3}->{chars}==96?0x20:0;
235     for (split //, $ss3) {
236     $c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m;
237     }
238     chr ($C->{G3}->{ucs} + $c);
239     ## Escape sequence
240     } elsif ($esc) {
241     ## IRR (revision number)
242     if ($esc =~ /\x1B\x26([\x40-\x7E])/) {
243     $C->{_irr} = $1; $esc = '';
244     } else {
245     $esc =~ s{
246     \x1B([\x28-\x2B])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^1
247     |\x1B\x24([\x28-\x2B]?)(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^n
248    
249     |\x1B([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^1
250     |\x1B\x24([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^n
251    
252     |\x1B([\x40-\x5F]) ## ESC Fe
253    
254     |\x1B\x21([\x21-\x23]?[\x30-\x7E]) ## CL = C0
255     |\x1B\x22([\x21-\x23]?[\x30-\x7E]) ## CR & ESC Fe = C1
256    
257     |\x1B([\x60-\x7E]) ## Single control functions
258     |\x1B\x23([\x21-\x23]?)([\x30-\x7E])
259    
260     |\x1B\x20([\x40-\x7E]) ## Announcer
261     }{
262     my ($g94_g,$g94_f,$g94n_g,$g94n_f,$g96_g,$g96_f,$g96n_g,$g96n_f,$Fe,
263     $CZD, $C1D, $Fs, $sI, $sF,$ACS)
264     = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15);
265     my $rev = $C->{_irr} || '';
266     if ($g94_g) {
267     $C->{ $_GB_to_GN{ $g94_g } } = $CHARSET{G94}->{ $g94_f.$rev }
268     || $CHARSET{G94}->{ $g94_f }
269     || $CHARSET{G94}->{ "\x7E" }; '';
270     } elsif (defined $g94n_f) {
271     $C->{ $_GB_to_GN{ $g94n_g } || 'G0' } = $CHARSET{G94n}->{ $g94n_f.$rev }
272     || $CHARSET{G94n}->{ $g94n_f }
273     || $CHARSET{G94n}->{ "\x7E" }; '';
274     } elsif ($g96_g) {
275     $C->{ $_GB_to_GN{ $g96_g } } = $CHARSET{G96}->{ $g96_f.$rev }
276     || $CHARSET{G96}->{ $g96_f }
277     || $CHARSET{G96}->{ "\x7E" }; '';
278     } elsif (defined $g96n_f) {
279     $C->{ $_GB_to_GN{ $g96n_g } } = $CHARSET{G96n}->{ $g96n_f.$rev }
280     || $CHARSET{G96n}->{ $g96n_f }
281     || $CHARSET{G96n}->{ "\x7E" }; '';
282     } elsif ($Fe) { ## ESC Fe => C1
283     chr ($C->{ $C->{ESC_Fe} }->{ucs} + (ord ($Fe) - 0x40));
284     } elsif ($CZD) {
285     $C->{C0} = $CHARSET{C0}->{ $CZD.$rev }
286     || $CHARSET{C0}->{ $CZD } || $CHARSET{C0}->{ "\x7E" }; '';
287     } elsif ($C1D) {
288     $C->{C1} = $CHARSET{C1}->{ $C1D.$rev }
289     || $CHARSET{C1}->{ $C1D } || $CHARSET{C1}->{ "\x7E" }; '';
290     } elsif ($Fs) {
291     if ($Fs eq "\x6E") { ## LS2
292     $C->{GL} = 'G2'; '';
293     } elsif ($Fs eq "\x6F") { ## LS3
294     $C->{GL} = 'G3'; '';
295 wakaba 1.7 } elsif ($Fs eq "\x7E" || $Fs eq "\x6B") { ## LS1R
296 wakaba 1.1 $C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; '';
297 wakaba 1.7 } elsif ($Fs eq "\x7D" || $Fs eq "\x6C") { ## LS2R
298 wakaba 1.1 $C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; '';
299 wakaba 1.7 } elsif ($Fs eq "\x7C" || $Fs eq "\x6D") { ## LS3R
300 wakaba 1.1 $C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; '';
301     } else {
302     chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60));
303     }
304     } elsif ($sI) {
305     chr ($CHARSET{single_control}->{'3F'.$sI}->{ucs} + (ord ($sF) - 0x30));
306     } elsif ($ACS) { ## Announcer
307     if ($ACS eq "\x4A") { $C->{bit} = 7 }
308     elsif ($ACS eq "\x4B") { $C->{bit} = 8 }
309     '';
310     }
311     }gex;
312     $C->{_irr} = undef;
313     }
314     $esc;
315 wakaba 1.3 ## Locking shifts
316     } elsif ($ls) {
317     if ($ls eq $C->{$C->{CL}}->{LS0}) {
318     $C->{GL} = 'G0'; '';
319     } elsif ($ls eq $C->{$C->{CL}}->{LS1}) {
320     $C->{GL} = 'G1'; '';
321     }
322 wakaba 1.1 ## Control sequence
323     } elsif ($csi) {
324     $csi =~ tr/\xA0-\xFF/\x20-\x7F/d;
325     $csi =~ s/$C->{$C->{CL}}->{LS0}//g if $C->{$C->{CL}}->{LS0};
326     $csi =~ s/$C->{$C->{CL}}->{LS1}//g if $C->{$C->{CL}}->{LS1};
327     "\x9B".$csi;
328     }
329     }gex;
330     $s;
331     }
332    
333     sub internal_to_iso2022 ($\%) {
334     my ($s, $C) = @_;
335     $C ||= &new_object;
336    
337     my $r = '';
338     for my $c (split //, $s) {
339     my $cc = ord $c;
340     my $t;
341     if ($cc <= 0x1F) {
342     $t = _i2c ($c, $C, type => 'C0', charset => '@');
343     } elsif ($cc == 0x20 || $cc == 0x7F) {
344     $t = _back2ascii ($C) . $c;
345     } elsif ($cc < 0x7F) {
346     $t = _i2g ($c, $C, type => 'G94', charset => 'B');
347     } elsif ($cc <= 0x9F) {
348     $t = _i2c ($c, $C, type => 'C1', charset_id => '64291991C1',
349     charset => $C->{private_set}->{XC1}->{'64291991C1'});
350     } elsif ($cc <= 0xFF) {
351     $t = _i2g (chr($cc-0x80), $C, type => 'G96', charset => 'A');
352     } elsif ($cc <= 0x24FF) {
353     my $c = $cc - 0x100;
354     my $final = $C->{private_set}->{U96n}->[0];
355     if (length $final) {
356     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
357     type => 'G96n', charset => $final);
358     }
359     } elsif ($cc <= 0x33FF) {
360     my $c = $cc - 0x2500;
361     my $final = $C->{private_set}->{U96n}->[1];
362     if (length $final) {
363     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
364     type => 'G96n', charset => $final);
365     }
366     } elsif (0xE000 <= $cc && $cc <= 0xFFFF) {
367     my $c = $cc - 0xE000;
368     my $final = $C->{private_set}->{U96n}->[2];
369     if (length $final) {
370     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
371     type => 'G96n', charset => $final);
372     }
373    
374     } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
375     my $c = $cc - 0xE9F6C0;
376     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
377     type => 'G94n', charset => chr(($c / 8836)+0x30));
378     } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
379     my $c = $cc - 0xF49D7C;
380     $t = _i2g (chr(($c / 94)+0x21).chr(($c % 94)+0x21), $C,
381     type => 'G94n', charset => 'B', revision => '@');
382    
383     } elsif (0xF0000 <= $cc && $cc <= 0x10F7FF) {
384     my $c = $cc - 0xF0000;
385     $t = _i2g (chr((($c % 9216) / 96)+0x20).chr(($c % 96)+0x20), $C,
386     type => 'G96n', charset => "\x20".chr(($c / 9216)+0x40));
387     } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
388     my $c = $cc - 0xE90940;
389     $t = _i2g (chr(($c % 94)+0x21), $C,
390     type => 'G94', charset => chr(($c / 94)+0x30));
391     } elsif (0xE92642 <= $cc && $cc <= 0xE9269F) {
392     my $c = $cc - 0xE92642;
393     $t = _i2g (chr($c+0x21),$C,type => 'G94', charset => "\x64", revision => '@');
394     } elsif (0xE926A0 <= $cc && $cc <= 0xE9443F) {
395     my $c = $cc - 0xE926A0;
396     $t = _i2g (chr(($c % 96)+0x20), $C,
397     type => 'G96', charset => chr(($c / 96)+0x30));
398     } elsif (0xE944A0 <= $cc && $cc <= 0xE961A1) {
399     my $c = $cc - 0xE944A0;
400     $t = _i2g (chr(($c % 94)+0x21), $C,
401     type => 'G94', charset => '!'.chr(($c / 94)+0x30));
402     } elsif (0xE96200 <= $cc && $cc <= 0xE97F9F) {
403     my $c = $cc - 0xE96200;
404     $t = _i2g (chr(($c % 96)+0x20), $C,
405     type => 'G96', charset => '!'.chr(($c / 96)+0x30));
406     } elsif (0xE98000 <= $cc && $cc <= 0xE99D01) {
407     my $c = $cc - 0xE98000;
408     $t = _i2g (chr(($c % 94)+0x21), $C,
409     type => 'G94', charset => '"'.chr(($c / 94)+0x30));
410     } elsif (0xE99D60 <= $cc && $cc <= 0xE9BAFF) {
411     my $c = $cc - 0xE99D60;
412     $t = _i2g (chr(($c % 96)+0x20), $C,
413     type => 'G96', charset => '"'.chr(($c / 96)+0x30));
414     } elsif (0xE9BB60 <= $cc && $cc <= 0xE9D861) {
415     my $c = $cc - 0xE9BB60;
416     $t = _i2g (chr(($c % 94)+0x21), $C,
417     type => 'G94', charset => '#'.chr(($c / 94)+0x30));
418     } elsif (0xE9D8C0 <= $cc && $cc <= 0xE9F65F) {
419     my $c = $cc - 0xE9D8C0;
420     $t = _i2g (chr(($c % 96)+0x20), $C,
421     type => 'G96', charset => '#'.chr(($c / 96)+0x30));
422     } elsif (0x70090940 <= $cc && $cc <= 0x70092641) {
423     my $c = $cc - 0x70090940;
424     $t = _i2g (chr(($c % 94)+0x21), $C,
425     type => 'G94', charset => "\x20".chr(($c / 94)+0x30));
426     } elsif (0x700926A0 <= $cc && $cc <= 0x7009443F) {
427     my $c = $cc - 0x700926A0;
428     $t = _i2g (chr(($c % 96)+0x20), $C,
429     type => 'G96', charset => "\x20".chr(($c / 96)+0x30));
430     ## TODO: DRCS with I byte: U+700944A0-U+7009F6BF
431     } elsif (0x70400000 <= $cc && $cc <= 0x7040FFED) {
432     my $c = $cc - 0x70400000;
433     $t = _i2g (chr(($c % 94)+0x21), $C, charset_id => 'P'.int ($c / 94),
434     type => 'G94', charset => $C->{private_set}->{G94}->[ $c / 94 ]);
435     } elsif (0x70410000 <= $cc && $cc <= 0x7041FFBF) {
436     my $c = $cc - 0x70410000;
437     $t = _i2g (chr(($c % 96)+0x20), $C, charset_id => 'P'.int ($c / 96),
438     type => 'G96', charset => $C->{private_set}->{G96}->[ $c / 96 ]);
439     } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
440     my $c = $cc % 0x10000;
441     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
442     type => 'G94n',
443 wakaba 1.2 charset_id => 'P'.int(($cc / 0x10000) - 0x7042).'_'.int($c / 8836),
444 wakaba 1.1 charset => $C->{private_set}->{G94n}->[ ($cc / 0x10000) - 0x7042 ]
445     ->[ $c / 8836 ]);
446     }
447     if (defined $t) {
448 wakaba 1.5 $t = _i2o ($t, $C, cs_F => "\x40")
449     if $C->{coding_system} ne $CODING_SYSTEM{"\x40"};
450     } else {
451     my $F; my @F = qw~G /G /H /I B /A /D /F~;
452     push @F, qw~/J /K /L~ if $cc <= 0x10FFFF;
453     push @F, qw~/@ /C /E~ if $cc <= 0xFFFF;
454     for (@F) {
455     if (defined $C->{option}->{designate_to}->{coding_system}->{$_}
456     && $C->{option}->{designate_to}->{coding_system}->{$_} > -1) {
457     $F = $_; last;
458     } elsif ($C->{option}->{designate_to}->{coding_system}->{default} > -1) {
459     $F = $_; last;
460     }
461     }
462     $t = _i2o ($c, $C, cs_F => $F) if $F;
463     }
464     if (defined $t) {
465 wakaba 1.1 $r .= $t;
466     } else {
467 wakaba 1.7 unless ($C->{option}->{undef_char}->[0] eq "\x20") {
468     $t = _i2g ($C->{option}->{undef_char}->[0], $C,
469     %{ $C->{option}->{undef_char}->[1] });
470     } else { ## SP
471     $t = _back2ascii ($C) . "\x20";
472     }
473 wakaba 1.5 $r .= $C->{coding_system} eq $CODING_SYSTEM{"\x40"} ?
474     $t : _i2o ($t, $C, cs_F => "\x40");
475 wakaba 1.1 }
476     }
477     $r . _back2ascii ($C);
478     }
479    
480     ## $O{charset} eq undef means that charset is same as the current designated one.
481     sub _i2c ($%%) {
482     my ($s, $C, %O) = @_;
483     my $r = '';
484     if ($O{type} eq 'C0') {
485     if (defined $O{charset}) {
486     if ( $C->{C0} ne $CHARSET{C0}->{$O{charset}}
487     && $C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
488     for ($C->{option}->{designate_to}->{C0}->{$O{charset}},
489     $C->{option}->{designate_to}->{C0}->{default}) {
490     if (defined $_) { return undef if $_ == -1; last }
491     }
492     $r .= "\x1B\x21".$O{charset};
493     $C->{C0} = $CHARSET{C0}->{$O{charset}};
494     }
495     } elsif (defined $O{charset_id}) {
496     if ($C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
497     return undef; ## Control set is not designated nor has F byte
498     }
499     }
500     $r .= _back2ascii ($C, reset_all => $C->{C0}->{reset_all}->{$s});
501     return $r . $s;
502     } elsif ($O{type} eq 'C1') {
503     if (defined $O{charset}) {
504     if ( $C->{C1} ne $CHARSET{C1}->{$O{charset}}
505     && $C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
506     for ($C->{option}->{designate_to}->{C1}->{$O{charset}},
507     $C->{option}->{designate_to}->{C1}->{default}) {
508     if (defined $_) { return undef if $_ == -1; last }
509     }
510     $r .= "\x1B\x22".$O{charset};
511     $C->{C1} = $CHARSET{C1}->{$O{charset}};
512     }
513     } elsif (defined $O{charset_id}) {
514     if ($C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
515     return undef; ## Control set is not designated nor has F byte
516     }
517     }
518     $r .= _back2ascii ($C, reset_all => $C->{C1}->{reset_all}->{$s});
519     unless ($C->{option}->{C1invoke_to_right}) { ## ESC Fe
520     $s =~ s/([\x80-\x9F])/"\x1B" . chr (ord ($1) - 0x40)/ge;
521     }
522     return $r . $s;
523     }
524     }
525     sub _i2g ($%%) {
526     my ($s, $C, %O) = @_;
527     my $r = '';
528 wakaba 1.4 my $set = $CHARSET{$O{type}}->{$O{charset}.
529     ($O{revision}&&$C->{option}->{use_revision}?$O{revision}:'')};
530 wakaba 1.1 my $set0 = $CHARSET{$O{type}}->{$O{charset_id}};
531     ## -- designate character set
532     my $G = 0;
533     if ($C->{G0} eq $set || $C->{G0} eq $set0) { $G = 0 }
534     elsif ($C->{G1} eq $set || $C->{G1} eq $set0) { $G = 1 }
535     elsif ($C->{G2} eq $set || $C->{G2} eq $set0) { $G = 2 }
536     elsif ($C->{G3} eq $set || $C->{G3} eq $set0) { $G = 3 }
537     else {
538     return undef unless $set; ## charset does not have F byte
539     $G = 1 if $O{type} eq 'G96' || $O{type} eq 'G96n';
540     for ($C->{option}->{designate_to}->{$O{type}}->{$O{charset}},
541     $C->{option}->{designate_to}->{$O{type}}->{default}) {
542     if (defined $_) {
543     $G = $_; last;
544     }
545     }
546     if ($G == -1) {
547     return undef;
548     }
549     if ($O{type} eq 'G94') {
550     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
551     ."\x1B".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
552     } elsif ($O{type} eq 'G94n') {
553     if ($G == 0 && !$C->{option}->{G94n_designate_long}
554     && ($O{charset} eq '@' || $O{charset} eq 'A' || $O{charset} eq 'B')) {
555     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
556     ."\x1B\x24".$O{charset};
557     } else {
558     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
559     ."\x1B\x24".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
560     }
561     } elsif ($O{type} eq 'G96') {
562     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
563     ."\x1B".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
564     } elsif ($O{type} eq 'G96n') {
565     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
566     ."\x1B\x24".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
567     }
568     $C->{'G'.$G} = $CHARSET{$O{type}}->{$O{charset}};
569     }
570     ## -- invoke G buffer
571     my $left = $C->{option}->{Ginvoke_to_left}->[$G];
572     if ($C->{GL} eq 'G'.$G) {
573     $left = 1;
574     } elsif ($C->{GR} eq 'G'.$G) {
575     $left = 0;
576     } else {
577     if ($C->{option}->{Ginvoke_by_single_shift}->[$G]) {
578     if ($C->{C1}->{'C_SS'.$G}) {
579     $r .= _i2c ($C->{C1}->{'C_SS'.$G}, $C, type => 'C1') || return undef;
580     } elsif ($C->{C0}->{'C_SS'.$G}) {
581     $r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef;
582     } else { ## Both C0 and C1 set do not have SS2/3.
583 wakaba 1.4 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
584 wakaba 1.1 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
585     }
586     } else {
587     $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
588     $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
589     }
590     }
591     $s =~ tr/\x00-\x7F/\x80-\xFF/ unless $left;
592     $r . $s;
593     }
594     sub _back2ascii (%) {
595     my ($C, %O) = @_;
596     my $r = '';
597     if ($C->{option}->{reset}->{Gdesignation}) {
598     my $F = $C->{option}->{reset}->{Gdesignation}; # \x42
599     $r .= "\x1B\x28".$F unless $C->{G0} eq $CHARSET{G94}->{$F};
600     $C->{G0} = $CHARSET{G94}->{$F};
601     if ($O{reset_all}) {
602     $C->{G1} = $CHARSET{G94}->{"\x7E"};
603     $C->{G2} = $CHARSET{G94}->{"\x7E"};
604     $C->{G3} = $CHARSET{G94}->{"\x7E"};
605     }
606     }
607     if ($C->{option}->{reset}->{Ginvoke}) {
608     if ($C->{GL} ne 'G0') {
609     $r .= $C->{C0}->{C_LS0} || ($C->{C0} = $CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F");
610     $C->{GL} = 'G0';
611     }
612     $C->{GR} = undef if $O{reset_all};
613     }
614     $r;
615     }
616     ## __invoke (\%C, $G, $left_or_right)
617     sub __invoke (\%$$) {
618     my ($C, $G) = @_;
619     if ($_[2]) {
620     $C->{GL} = 'G'.$G;
621     return ($C->{C0}->{C_LS0}
622     || scalar ($C->{C0}=$CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F"),
623     $C->{C0}->{C_LS1}, "\x1B\x6E", "\x1B\x6F")[$G];
624     } else {
625     $C->{GR} = 'G'.$G;
626     return ("", "\x1B\x7E", "\x1B\x7D", "\x1B\x7C")[$G];
627     }
628     '';
629     }
630 wakaba 1.5 sub _i2o ($\%%) {
631     my ($s, $C, %O) = @_;
632     my $CS = $CODING_SYSTEM{ $O{cs_F} } || $CODING_SYSTEM{ $O{cs_id} } || return undef;
633     my $r = '';
634     if ($CS ne $C->{coding_system}) {
635     my $e = '';
636     $e .= "\x1B\x25";
637     $e .= $O{cs_F} || $C->{private_set}->{coding_system}->{ $O{cs_id} }
638     || return undef;
639     if ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x40"}
640     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x43"}
641     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x45"}
642     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4A"}
643     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4B"}
644     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4C"}) {
645     $e =~ s/(.)/\x00$1/go;
646     } elsif ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x41"}
647     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x44"}
648     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x46"}) {
649     $e =~ s/(.)/\x00\x00\x00$1/go;
650     }
651     $r .= $e;
652     $C->{coding_system} = $CS;
653     if ($CS->{reset_state}) {
654     $C->{GL} = undef; $C->{GR} = undef;
655     $C->{C0} = $CHARSET{C0}->{"\x7E"};
656     $C->{C1} = $CHARSET{C1}->{"\x7E"};
657     $C->{G0} = $CHARSET{G94}->{"\x7E"};
658     $C->{G1} = $CHARSET{G94}->{"\x7E"};
659     $C->{G2} = $CHARSET{G94}->{"\x7E"};
660     $C->{G3} = $CHARSET{G94}->{"\x7E"};
661     }
662     }
663     if ($CS eq $CODING_SYSTEM{"\x40"}) {
664     #
665     } elsif ($CS eq $CODING_SYSTEM{G} || $CS eq $CODING_SYSTEM{'/G'}
666     || $CS eq $CODING_SYSTEM{'/H'} || $CS eq $CODING_SYSTEM{'/I'}) {
667     Encode::_utf8_off ($s);
668     } elsif ($CS eq $CODING_SYSTEM{'/@'} || $CS eq $CODING_SYSTEM{'/C'}
669     || $CS eq $CODING_SYSTEM{'/E'}) {
670     $s = Encode::encode ('ucs-2be', $s);
671     } elsif ($CS eq $CODING_SYSTEM{'/A'} || $CS eq $CODING_SYSTEM{'/D'}
672     || $CS eq $CODING_SYSTEM{'/F'}) {
673     $s = Encode::encode ('ucs-4be', $s);
674     } elsif ($CS eq $CODING_SYSTEM{'/J'} || $CS eq $CODING_SYSTEM{'/K'}
675     || $CS eq $CODING_SYSTEM{'/L'}) {
676     $s = Encode::encode ('UTF-16BE', $s);
677     } elsif ($CS eq $CODING_SYSTEM{B}) {
678     $s = Encode::encode ('utf-1', $s);
679     } else {
680     return undef;
681     }
682     $r . $s;
683 wakaba 1.1 }
684    
685     1;
686     __END__
687    
688     =head1 SEE ALSO
689    
690     ISO/IEC 646:1991, "7-bit coded graphic character set for intormation interchange".
691    
692     ISO/IEC 2022:1994, "Character Code Structure and Extension Techniques".
693     (IDT with ECMA 35, JIS X 0202:1998)
694    
695     ISO/IEC 4873:1991, "8-Bit Coded Character Set Structure and Rules".
696     (IDT with ECMA 43)
697    
698     ISO/IEC 6429:1992, "Control Functions for Coded Character Sets".
699     (IDT with ECMA 48:1991, JIS X 0211:1998)
700    
701     ISO/IEC 8859, "8-Bit Single-Byte Coded Graphic Character Sets".
702    
703     Encode, perlunicode
704    
705 wakaba 1.4 =head1 TODO
706    
707     =over 4
708    
709     =item NCR (coding system other than ISO/IEC 2022) support
710    
711     =over 2
712    
713     =item ESC 02/05 02/15 03/x of X Compound Text
714    
715     =back
716    
717     =item Output of control character sets, single control functions
718    
719     =item Designation sequence of control character sets (input)
720    
721     =item Special graphic character sets such as G3 of EUC-TW
722    
723     =item SUPER SHIFT (SS) invoke function of old control character set
724    
725     =item Safe transparent of control string (ISO/IEC 6429)
726    
727     =item Output of unoutputable characters as alternative notation such as SGML-like entity
728    
729     =item C0 set invoked to CR area like ISIRI code
730    
731     Really need?
732    
733     =item special treatment of 0x20, 0x7E, 0xA0, 0xFF
734    
735     For example, GB mongolian sets use MSP (MONGOLIAN SPACE)
736     with these code positions.
737    
738     And, no less coding systems does not use (or does ban using) DEL.
739    
740     =item A lot of character sets don't have pseudo-UCS mapping.
741    
742     Most of 9m^n (n >= 3) sets, 9m^n sets with I byte, 9m^n
743     DRCSes do not have pseudo-UCS mapping area. It is
744     questionable to allocate lots of code positions to these
745     rarely-(or no-)used character sets.
746    
747     =item Even character sets that have pseudo-UCS mapping, some of them can't be outputed in ISO/IEC 2022.
748    
749     Because output of rarely-used character sets is
750     not implemented yet.
751    
752     =back
753    
754 wakaba 1.5 =head1 AUTHORS
755    
756     Nanashi-san
757    
758     Wakaba <w@suika.fam.cx>
759    
760 wakaba 1.1 =head1 LICENSE
761    
762 wakaba 1.5 Copyright 2002 AUTHORS
763 wakaba 1.1
764     This library is free software; you can redistribute it
765     and/or modify it under the same terms as Perl itself.
766    
767     =cut
768    
769 wakaba 1.7 # $Date: 2002/09/21 01:34:08 $
770 wakaba 1.1 ### ISO2022.pm ends here

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24