/[suikacvs]/perl/lib/Encode/ISO2022.pm
Suika

Contents of /perl/lib/Encode/ISO2022.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (hide annotations) (download)
Fri Sep 20 14:01:45 2002 UTC (22 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.4: +139 -165 lines
2002-09-20  Wakaba <w@suika.fam.cx>

	* ISO2022.pm:
	- (iso2022_to_internal): New function.
	- (_iso2022_to_internal): Renamed from iso2022_to_internal.
	- (iso2022_to_internal): Experimental support of DOCS.
	- (internal_to_iso2022): Output in UCS coding systems
	if the character is unable to be encoded in ISO/IEC 2022
	coded character sets.
	- (_i2o): New procedure.
	- ($C->{option}->{designate_to}->{coding_system}): New option
	property object.
	- ($C->{coding_system}): New property.
	- (%CODING_SYSTEM): New hash.  (Alias to Encode::Charset's one.)
	* Charset.pm (make_initial_coding_system): Set 'reset_state'
	property with 1 value to coding systems of DOCS with 02/14 I byte.

1 wakaba 1.1
2     =head1 NAME
3    
4     Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder
5    
6     =cut
7    
8     require v5.7.3;
9     package Encode::ISO2022;
10     use strict;
11 wakaba 1.5 use vars qw(%CHARSET %CODING_SYSTEM $VERSION);
12     $VERSION=do{my @r=(q$Revision: 1.4 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
13 wakaba 1.1 use base qw(Encode::Encoding);
14 wakaba 1.2 __PACKAGE__->Define (qw/iso-2022 iso2022 2022 cp2022/);
15 wakaba 1.5 require Encode::Charset;
16     *CHARSET = \%Encode::Charset::CHARSET;
17     *CODING_SYSTEM = \%Encode::Charset::CODING_SYSTEM;
18 wakaba 1.1
19     ### --- Intialization
20    
21     my %_CHARS_to_RANGE = (
22     l94 => q/[\x21-\x7E]/,
23     l96 => q/[\x20-\x7F]/,
24     l128 => q/[\x00-\x7F]/,
25     l256 => q/[\x00-\xFF]/,
26     r94 => q/[\xA1-\xFE]/,
27     r96 => q/[\xA0-\xFF]/,
28     r128 => q/[\x80-\xFF]/,
29     r256 => q/[\x80-\xFF]/,
30     b94 => q/[\x21-\x7E\xA1-\xFE]/,
31     b96 => q/[\x20-\x7F\xA0-\xFF]/,
32     b128 => q/[\x00-\xFF]/,
33     b256 => q/[\x00-\xFF]/,
34     );
35    
36     ### --- Perl Encode module common functions
37    
38     sub encode ($$;$) {
39     my ($obj, $str, $chk) = @_;
40     $_[1] = '' if $chk;
41     $str = &internal_to_iso2022 ($str);
42     return $str;
43     }
44    
45     sub decode ($$;$) {
46     my ($obj, $str, $chk) = @_;
47     $_[1] = '' if $chk;
48     return &iso2022_to_internal ($str);
49     }
50    
51     ### --- Encode::ISO2022 unique functions
52    
53     ## Make a new ISO/IEC 2022-buffers object with default status
54     sub new_object {
55     my %C;
56     $C{bit} = 8;
57     $C{CL} = 'C0'; $C{CR} = 'C1'; $C{ESC_Fe} = 'C1';
58     $C{C0} = $CHARSET{C0}->{"\x40"}; ## ISO/IEC 6429:1991 C0
59     $C{C1} = $CHARSET{C1}->{'64291991C1'}; ## ISO/IEC 6429:1991 C1
60     $C{GL} = 'G0'; $C{GR} = 'G1';
61     $C{G0} = $CHARSET{G94}->{"\x42"}; ## ISO/IEC 646:1991 IRV
62     #$C{G1} = $CHARSET{G96}->{"\x41"}; ## ISO/IEC 8859-1 GR
63     $C{G1} = $CHARSET{G94}->{"\x7E"}; ## empty set
64     $C{G2} = $CHARSET{G94}->{"\x7E"}; ## empty set
65     $C{G3} = $CHARSET{G94}->{"\x7E"}; ## empty set
66 wakaba 1.5 $C{coding_system} = $CODING_SYSTEM{"\x40"}; ## ISO/IEC 2022
67 wakaba 1.1 $C{option} = {
68     C1invoke_to_right => 0, ## C1 invoked to: (0: ESC Fe, 1: CR)
69     G94n_designate_long => 0, ## (1: ESC 02/04 02/08 04/00..02)
70     designate_to => { ## Designated G buffer (-1: not be outputed)
71     C0 => {
72     default => 0,
73     },
74     C1 => {
75     default => 1,
76     },
77     G94 => {
78     "\x42" => 0,
79     default => 0,
80     },
81     G96 => {
82     default => 1,
83     },
84     G94n => {
85     default => 0,
86     },
87     G96n => {
88     default => 1,
89     },
90 wakaba 1.5 coding_system => {
91     default => -1,
92     },
93 wakaba 1.1 },
94     Ginvoke_by_single_shift => [0,0,0,0], ## Invoked by SS
95     Ginvoke_to_left => [1,1,1,1], ## Which invoked to? (1: L, 0: R)
96     private_set => { ## Private set vs Final byte
97     C0 => [],
98     C1 => [],
99     G94 => [],
100     G94n => [[],[],[],[],[]],
101     G96 => [],
102     #G96n => [], ## (not implemented)
103     U96n => [], ## mule-unicode sets
104     XC1 => {
105     '64291991C1' => undef, ## ISO/IEC 6429:1991 C1
106     },
107     },
108     reset => { ## Reset status at top of line
109     Gdesignation => "\x42", ## F of designation or 0
110     Ginvoke => 1,
111     },
112 wakaba 1.4 undef_char => ["\x3F", {type => 'G94', charset => 'B'}],
113 wakaba 1.1 use_revision => 1, ## Output IRR
114     };
115     \%C;
116     }
117    
118     sub iso2022_to_internal ($;\%) {
119     my ($s, $C) = @_;
120 wakaba 1.5 my $t = '';
121     $s =~ s{
122     ^((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
123     }{
124     my $i2 = $1;
125     $t = _iso2022_to_internal ($i2, $C);
126     '';
127     }gesx;
128     $s =~ s{
129     ## ISO/IEC 2022
130     \x1B\x25\x40((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
131     ## UTF-8
132     |\x1B\x25(?:\x47|\x2F[\x47-\x49])((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
133     ## UCS-2, UTF-16
134     |\x1B\x25\x2F[\x40\x43\x45\x4A-\x4C]
135     ((?!\x00\x1B\x00\x25\x00\x2F?\x00[\x30-\x7E].)*)
136     ## UCS-4
137     |\x1B\x25\x2F[\x41\x44\x46]
138     ((?!\x00\x00\x00\x1B\x00\x00\x00\x25\x00\x00\x00\x2F?
139     \x00\x00\x00[\x30-\x7E].)*)
140     ## with standard return
141     |\x1B\x25([\x30-\x7E])((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
142     ## without standard return
143     |\x1B\x25\x2F([\x30-\x7E])(.*)
144     }{
145     my ($i2,$u8,$Fu2,$u2,$u4,$Fsr,$sr,$Fnsr,$nsr) = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
146     my $r = '';
147     if (defined $i2) {
148     $r = _iso2022_to_internal ($i2, $C);
149     } elsif (defined $u8) {
150     $r = Encode::decode ('utf8', $u8);
151     } elsif ($Fu2) {
152     if (ord ($Fu2) > 0x49) {
153     $r = Encode::decode ('utf-16be', $u2);
154     } else {
155     $r = Encode::decode ('ucs-2be', $u2);
156     }
157     } elsif (defined $u4) {
158     $r = Encode::decode ('ucs-4be', $u2);
159     } else { ## temporary
160     $r = '?+';
161     }
162     $r;
163     }gesx;
164     $t . $s;
165     }
166    
167     sub _iso2022_to_internal ($;\%) {
168     my ($s, $C) = @_;
169 wakaba 1.1 my %_GB_to_GN = (
170     "\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3',
171     "\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3',
172     );
173     $C ||= &new_object;
174    
175     use re 'eval';
176     $s =~ s{
177     ((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}}
178     . qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ }))
179     |((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GL}}->{chars}}
180     . qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ }))
181    
182     | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)')
183     . ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ?
184     qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '')
185     . ($C->{$C->{CL}}->{r_SS2} ? qq/|$C->{$C->{CL}}->{r_SS2}/ : '') . q/)/
186     . ( $C->{$C->{CL}}->{r_LS0}
187     ||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
188     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
189     })
190     ((??{ $_CHARS_to_RANGE{'b'.$C->{$C->{GL}}->{chars}}
191     . qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ }))
192     | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)')
193     . ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ?
194     qq/|$C->{$C->{ESC_Fe}}->{r_SS3_ESC}/ : '')
195     . ($C->{$C->{CL}}->{r_SS3} ? qq/|$C->{$C->{CL}}->{r_SS3}/ : '') . q/)/
196     . ( $C->{$C->{CL}}->{r_LS0}
197     || $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
198     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
199     })
200     ((??{ $_CHARS_to_RANGE{'b'.$C->{$C->{GL}}->{chars}}
201     . qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ }))
202    
203 wakaba 1.3 ## Locking shift
204     |( \x1B[\x6E\x6F\x7C-\x7E]
205     |(??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })
206     |(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })
207     )
208 wakaba 1.1
209     ## Control sequence
210     |(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)')
211     .($C->{$C->{ESC_Fe}}->{r_CSI_ESC} ?
212     qq/|$C->{$C->{ESC_Fe}}->{r_CSI_ESC}/: '')
213     .')'
214     })
215     ((??{ qq/[\x30-\x3F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xB0-\xBF]*/
216     .qq/[\x20-\x2F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xA0-\xAF]*/
217     }) [\x40-\x7E\xD0-\xFE])
218    
219     ## Other escape sequence
220     |(\x1B[\x20-\x2F]*[\x30-\x7E])
221    
222     ## Misc. sequence (SP, control, or broken data)
223     |([\x00-\xFF])
224     }{
225 wakaba 1.3 my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc)
226     = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
227 wakaba 1.1 $C->{_irr} = undef unless defined $esc;
228     ## GL graphic character
229     if (defined $gl) {
230     my $c = 0;
231     my $m = $C->{$C->{GL}}->{chars}==94?0x21:$C->{$C->{GL}}->{chars}==96?0x20:0;
232     for (split //, $gl) {
233     $c = $c * $C->{$C->{GL}}->{chars} + unpack ('C', $_) - $m;
234     }
235     chr ($C->{$C->{GL}}->{ucs} + $c);
236     ## Control, SP, or broken data
237     ## TODO: support control sets other than ISO/IEC 6429's
238     } elsif (defined $misc) {
239     $misc;
240     ## GR graphic character
241     } elsif ($gr) {
242     my $c = 0;
243     my $m = $C->{$C->{GR}}->{chars}==94?0xA1:$C->{$C->{GR}}->{chars}==96?0xA0:0x80;
244     for (split //, $gr) {
245     $c = $c * $C->{$C->{GR}}->{chars} + unpack ('C', $_) - $m;
246     }
247     chr ($C->{$C->{GR}}->{ucs} + $c);
248     ## Graphic character with SS2
249     } elsif ($ss2) {
250     $ss2 =~ tr/\x80-\xFF/\x00-\x7F/;
251     my $c = 0; my $m = $C->{G2}->{chars}==94?0x21:$C->{G2}->{chars}==96?0x20:0;
252     for (split //, $ss2) {
253     $c = $c * $C->{G2}->{chars} + unpack ('C', $_) - $m;
254     }
255     chr ($C->{G2}->{ucs} + $c);
256     ## Graphic character with SS3
257     } elsif ($ss3) {
258     $ss3 =~ tr/\x80-\xFF/\x00-\x7F/;
259     my $c = 0; my $m = $C->{G3}->{chars}==94?0x21:$C->{G3}->{chars}==96?0x20:0;
260     for (split //, $ss3) {
261     $c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m;
262     }
263     chr ($C->{G3}->{ucs} + $c);
264     ## Escape sequence
265     } elsif ($esc) {
266     ## IRR (revision number)
267     if ($esc =~ /\x1B\x26([\x40-\x7E])/) {
268     $C->{_irr} = $1; $esc = '';
269     } else {
270     $esc =~ s{
271     \x1B([\x28-\x2B])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^1
272     |\x1B\x24([\x28-\x2B]?)(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^n
273    
274     |\x1B([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^1
275     |\x1B\x24([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^n
276    
277     |\x1B([\x40-\x5F]) ## ESC Fe
278    
279     |\x1B\x21([\x21-\x23]?[\x30-\x7E]) ## CL = C0
280     |\x1B\x22([\x21-\x23]?[\x30-\x7E]) ## CR & ESC Fe = C1
281    
282     |\x1B([\x60-\x7E]) ## Single control functions
283     |\x1B\x23([\x21-\x23]?)([\x30-\x7E])
284    
285     |\x1B\x20([\x40-\x7E]) ## Announcer
286     }{
287     my ($g94_g,$g94_f,$g94n_g,$g94n_f,$g96_g,$g96_f,$g96n_g,$g96n_f,$Fe,
288     $CZD, $C1D, $Fs, $sI, $sF,$ACS)
289     = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15);
290     my $rev = $C->{_irr} || '';
291     if ($g94_g) {
292     $C->{ $_GB_to_GN{ $g94_g } } = $CHARSET{G94}->{ $g94_f.$rev }
293     || $CHARSET{G94}->{ $g94_f }
294     || $CHARSET{G94}->{ "\x7E" }; '';
295     } elsif (defined $g94n_f) {
296     $C->{ $_GB_to_GN{ $g94n_g } || 'G0' } = $CHARSET{G94n}->{ $g94n_f.$rev }
297     || $CHARSET{G94n}->{ $g94n_f }
298     || $CHARSET{G94n}->{ "\x7E" }; '';
299     } elsif ($g96_g) {
300     $C->{ $_GB_to_GN{ $g96_g } } = $CHARSET{G96}->{ $g96_f.$rev }
301     || $CHARSET{G96}->{ $g96_f }
302     || $CHARSET{G96}->{ "\x7E" }; '';
303     } elsif (defined $g96n_f) {
304     $C->{ $_GB_to_GN{ $g96n_g } } = $CHARSET{G96n}->{ $g96n_f.$rev }
305     || $CHARSET{G96n}->{ $g96n_f }
306     || $CHARSET{G96n}->{ "\x7E" }; '';
307     } elsif ($Fe) { ## ESC Fe => C1
308     chr ($C->{ $C->{ESC_Fe} }->{ucs} + (ord ($Fe) - 0x40));
309     } elsif ($CZD) {
310     $C->{C0} = $CHARSET{C0}->{ $CZD.$rev }
311     || $CHARSET{C0}->{ $CZD } || $CHARSET{C0}->{ "\x7E" }; '';
312     } elsif ($C1D) {
313     $C->{C1} = $CHARSET{C1}->{ $C1D.$rev }
314     || $CHARSET{C1}->{ $C1D } || $CHARSET{C1}->{ "\x7E" }; '';
315     } elsif ($Fs) {
316     if ($Fs eq "\x6E") { ## LS2
317     $C->{GL} = 'G2'; '';
318     } elsif ($Fs eq "\x6F") { ## LS3
319     $C->{GL} = 'G3'; '';
320     } elsif ($Fs eq "\x7E") { ## LS1R
321     $C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; '';
322     } elsif ($Fs eq "\x7D") { ## LS2R
323     $C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; '';
324     } elsif ($Fs eq "\x7C") { ## LS3R
325     $C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; '';
326     } else {
327     chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60));
328     }
329     } elsif ($sI) {
330     chr ($CHARSET{single_control}->{'3F'.$sI}->{ucs} + (ord ($sF) - 0x30));
331     } elsif ($ACS) { ## Announcer
332     if ($ACS eq "\x4A") { $C->{bit} = 7 }
333     elsif ($ACS eq "\x4B") { $C->{bit} = 8 }
334     '';
335     }
336     }gex;
337     $C->{_irr} = undef;
338     }
339     $esc;
340 wakaba 1.3 ## Locking shifts
341     } elsif ($ls) {
342     if ($ls eq $C->{$C->{CL}}->{LS0}) {
343     $C->{GL} = 'G0'; '';
344     } elsif ($ls eq $C->{$C->{CL}}->{LS1}) {
345     $C->{GL} = 'G1'; '';
346     } elsif ($ls =~ /\x1B([\x6E\x6F])/) {
347     $C->{GL} = {"\x6E"=>2, "\x6F"=>3}->{$1}; '';
348     } elsif ($ls =~ /\x1B([\x7C-\x7E])/) {
349     $C->{GR} = {"\x7E"=>1, "\x7D"=>2, "\x7C"=>3}->{$1}; '';
350     }
351 wakaba 1.1 ## Control sequence
352     } elsif ($csi) {
353     $csi =~ tr/\xA0-\xFF/\x20-\x7F/d;
354     $csi =~ s/$C->{$C->{CL}}->{LS0}//g if $C->{$C->{CL}}->{LS0};
355     $csi =~ s/$C->{$C->{CL}}->{LS1}//g if $C->{$C->{CL}}->{LS1};
356     "\x9B".$csi;
357     }
358     }gex;
359     $s;
360     }
361    
362     sub internal_to_iso2022 ($\%) {
363     my ($s, $C) = @_;
364     $C ||= &new_object;
365    
366     my $r = '';
367     for my $c (split //, $s) {
368     my $cc = ord $c;
369     my $t;
370     if ($cc <= 0x1F) {
371     $t = _i2c ($c, $C, type => 'C0', charset => '@');
372     } elsif ($cc == 0x20 || $cc == 0x7F) {
373     $t = _back2ascii ($C) . $c;
374     } elsif ($cc < 0x7F) {
375     $t = _i2g ($c, $C, type => 'G94', charset => 'B');
376     } elsif ($cc <= 0x9F) {
377     $t = _i2c ($c, $C, type => 'C1', charset_id => '64291991C1',
378     charset => $C->{private_set}->{XC1}->{'64291991C1'});
379     } elsif ($cc <= 0xFF) {
380     $t = _i2g (chr($cc-0x80), $C, type => 'G96', charset => 'A');
381     } elsif ($cc <= 0x24FF) {
382     my $c = $cc - 0x100;
383     my $final = $C->{private_set}->{U96n}->[0];
384     if (length $final) {
385     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
386     type => 'G96n', charset => $final);
387     }
388     } elsif ($cc <= 0x33FF) {
389     my $c = $cc - 0x2500;
390     my $final = $C->{private_set}->{U96n}->[1];
391     if (length $final) {
392     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
393     type => 'G96n', charset => $final);
394     }
395     } elsif (0xE000 <= $cc && $cc <= 0xFFFF) {
396     my $c = $cc - 0xE000;
397     my $final = $C->{private_set}->{U96n}->[2];
398     if (length $final) {
399     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
400     type => 'G96n', charset => $final);
401     }
402    
403     } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
404     my $c = $cc - 0xE9F6C0;
405     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
406     type => 'G94n', charset => chr(($c / 8836)+0x30));
407     } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
408     my $c = $cc - 0xF49D7C;
409     $t = _i2g (chr(($c / 94)+0x21).chr(($c % 94)+0x21), $C,
410     type => 'G94n', charset => 'B', revision => '@');
411    
412     } elsif (0xF0000 <= $cc && $cc <= 0x10F7FF) {
413     my $c = $cc - 0xF0000;
414     $t = _i2g (chr((($c % 9216) / 96)+0x20).chr(($c % 96)+0x20), $C,
415     type => 'G96n', charset => "\x20".chr(($c / 9216)+0x40));
416     } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
417     my $c = $cc - 0xE90940;
418     $t = _i2g (chr(($c % 94)+0x21), $C,
419     type => 'G94', charset => chr(($c / 94)+0x30));
420     } elsif (0xE92642 <= $cc && $cc <= 0xE9269F) {
421     my $c = $cc - 0xE92642;
422     $t = _i2g (chr($c+0x21),$C,type => 'G94', charset => "\x64", revision => '@');
423     } elsif (0xE926A0 <= $cc && $cc <= 0xE9443F) {
424     my $c = $cc - 0xE926A0;
425     $t = _i2g (chr(($c % 96)+0x20), $C,
426     type => 'G96', charset => chr(($c / 96)+0x30));
427     } elsif (0xE944A0 <= $cc && $cc <= 0xE961A1) {
428     my $c = $cc - 0xE944A0;
429     $t = _i2g (chr(($c % 94)+0x21), $C,
430     type => 'G94', charset => '!'.chr(($c / 94)+0x30));
431     } elsif (0xE96200 <= $cc && $cc <= 0xE97F9F) {
432     my $c = $cc - 0xE96200;
433     $t = _i2g (chr(($c % 96)+0x20), $C,
434     type => 'G96', charset => '!'.chr(($c / 96)+0x30));
435     } elsif (0xE98000 <= $cc && $cc <= 0xE99D01) {
436     my $c = $cc - 0xE98000;
437     $t = _i2g (chr(($c % 94)+0x21), $C,
438     type => 'G94', charset => '"'.chr(($c / 94)+0x30));
439     } elsif (0xE99D60 <= $cc && $cc <= 0xE9BAFF) {
440     my $c = $cc - 0xE99D60;
441     $t = _i2g (chr(($c % 96)+0x20), $C,
442     type => 'G96', charset => '"'.chr(($c / 96)+0x30));
443     } elsif (0xE9BB60 <= $cc && $cc <= 0xE9D861) {
444     my $c = $cc - 0xE9BB60;
445     $t = _i2g (chr(($c % 94)+0x21), $C,
446     type => 'G94', charset => '#'.chr(($c / 94)+0x30));
447     } elsif (0xE9D8C0 <= $cc && $cc <= 0xE9F65F) {
448     my $c = $cc - 0xE9D8C0;
449     $t = _i2g (chr(($c % 96)+0x20), $C,
450     type => 'G96', charset => '#'.chr(($c / 96)+0x30));
451     } elsif (0x70090940 <= $cc && $cc <= 0x70092641) {
452     my $c = $cc - 0x70090940;
453     $t = _i2g (chr(($c % 94)+0x21), $C,
454     type => 'G94', charset => "\x20".chr(($c / 94)+0x30));
455     } elsif (0x700926A0 <= $cc && $cc <= 0x7009443F) {
456     my $c = $cc - 0x700926A0;
457     $t = _i2g (chr(($c % 96)+0x20), $C,
458     type => 'G96', charset => "\x20".chr(($c / 96)+0x30));
459     ## TODO: DRCS with I byte: U+700944A0-U+7009F6BF
460     } elsif (0x70400000 <= $cc && $cc <= 0x7040FFED) {
461     my $c = $cc - 0x70400000;
462     $t = _i2g (chr(($c % 94)+0x21), $C, charset_id => 'P'.int ($c / 94),
463     type => 'G94', charset => $C->{private_set}->{G94}->[ $c / 94 ]);
464     } elsif (0x70410000 <= $cc && $cc <= 0x7041FFBF) {
465     my $c = $cc - 0x70410000;
466     $t = _i2g (chr(($c % 96)+0x20), $C, charset_id => 'P'.int ($c / 96),
467     type => 'G96', charset => $C->{private_set}->{G96}->[ $c / 96 ]);
468     } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
469     my $c = $cc % 0x10000;
470     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
471     type => 'G94n',
472 wakaba 1.2 charset_id => 'P'.int(($cc / 0x10000) - 0x7042).'_'.int($c / 8836),
473 wakaba 1.1 charset => $C->{private_set}->{G94n}->[ ($cc / 0x10000) - 0x7042 ]
474     ->[ $c / 8836 ]);
475     }
476     if (defined $t) {
477 wakaba 1.5 $t = _i2o ($t, $C, cs_F => "\x40")
478     if $C->{coding_system} ne $CODING_SYSTEM{"\x40"};
479     } else {
480     my $F; my @F = qw~G /G /H /I B /A /D /F~;
481     push @F, qw~/J /K /L~ if $cc <= 0x10FFFF;
482     push @F, qw~/@ /C /E~ if $cc <= 0xFFFF;
483     for (@F) {
484     if (defined $C->{option}->{designate_to}->{coding_system}->{$_}
485     && $C->{option}->{designate_to}->{coding_system}->{$_} > -1) {
486     $F = $_; last;
487     } elsif ($C->{option}->{designate_to}->{coding_system}->{default} > -1) {
488     $F = $_; last;
489     }
490     }
491     $t = _i2o ($c, $C, cs_F => $F) if $F;
492     }
493     if (defined $t) {
494 wakaba 1.1 $r .= $t;
495     } else {
496 wakaba 1.5 $t = _i2g ($C->{option}->{undef_char}->[0], $C,
497 wakaba 1.4 %{ $C->{option}->{undef_char}->[1] });
498 wakaba 1.5 $r .= $C->{coding_system} eq $CODING_SYSTEM{"\x40"} ?
499     $t : _i2o ($t, $C, cs_F => "\x40");
500 wakaba 1.1 }
501     }
502     $r . _back2ascii ($C);
503     }
504    
505     ## $O{charset} eq undef means that charset is same as the current designated one.
506     sub _i2c ($%%) {
507     my ($s, $C, %O) = @_;
508     my $r = '';
509     if ($O{type} eq 'C0') {
510     if (defined $O{charset}) {
511     if ( $C->{C0} ne $CHARSET{C0}->{$O{charset}}
512     && $C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
513     for ($C->{option}->{designate_to}->{C0}->{$O{charset}},
514     $C->{option}->{designate_to}->{C0}->{default}) {
515     if (defined $_) { return undef if $_ == -1; last }
516     }
517     $r .= "\x1B\x21".$O{charset};
518     $C->{C0} = $CHARSET{C0}->{$O{charset}};
519     }
520     } elsif (defined $O{charset_id}) {
521     if ($C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
522     return undef; ## Control set is not designated nor has F byte
523     }
524     }
525     $r .= _back2ascii ($C, reset_all => $C->{C0}->{reset_all}->{$s});
526     return $r . $s;
527     } elsif ($O{type} eq 'C1') {
528     if (defined $O{charset}) {
529     if ( $C->{C1} ne $CHARSET{C1}->{$O{charset}}
530     && $C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
531     for ($C->{option}->{designate_to}->{C1}->{$O{charset}},
532     $C->{option}->{designate_to}->{C1}->{default}) {
533     if (defined $_) { return undef if $_ == -1; last }
534     }
535     $r .= "\x1B\x22".$O{charset};
536     $C->{C1} = $CHARSET{C1}->{$O{charset}};
537     }
538     } elsif (defined $O{charset_id}) {
539     if ($C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
540     return undef; ## Control set is not designated nor has F byte
541     }
542     }
543     $r .= _back2ascii ($C, reset_all => $C->{C1}->{reset_all}->{$s});
544     unless ($C->{option}->{C1invoke_to_right}) { ## ESC Fe
545     $s =~ s/([\x80-\x9F])/"\x1B" . chr (ord ($1) - 0x40)/ge;
546     }
547     return $r . $s;
548     }
549     }
550     sub _i2g ($%%) {
551     my ($s, $C, %O) = @_;
552     my $r = '';
553 wakaba 1.4 my $set = $CHARSET{$O{type}}->{$O{charset}.
554     ($O{revision}&&$C->{option}->{use_revision}?$O{revision}:'')};
555 wakaba 1.1 my $set0 = $CHARSET{$O{type}}->{$O{charset_id}};
556     ## -- designate character set
557     my $G = 0;
558     if ($C->{G0} eq $set || $C->{G0} eq $set0) { $G = 0 }
559     elsif ($C->{G1} eq $set || $C->{G1} eq $set0) { $G = 1 }
560     elsif ($C->{G2} eq $set || $C->{G2} eq $set0) { $G = 2 }
561     elsif ($C->{G3} eq $set || $C->{G3} eq $set0) { $G = 3 }
562     else {
563     return undef unless $set; ## charset does not have F byte
564     $G = 1 if $O{type} eq 'G96' || $O{type} eq 'G96n';
565     for ($C->{option}->{designate_to}->{$O{type}}->{$O{charset}},
566     $C->{option}->{designate_to}->{$O{type}}->{default}) {
567     if (defined $_) {
568     $G = $_; last;
569     }
570     }
571     if ($G == -1) {
572     return undef;
573     }
574     if ($O{type} eq 'G94') {
575     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
576     ."\x1B".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
577     } elsif ($O{type} eq 'G94n') {
578     if ($G == 0 && !$C->{option}->{G94n_designate_long}
579     && ($O{charset} eq '@' || $O{charset} eq 'A' || $O{charset} eq 'B')) {
580     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
581     ."\x1B\x24".$O{charset};
582     } else {
583     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
584     ."\x1B\x24".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
585     }
586     } elsif ($O{type} eq 'G96') {
587     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
588     ."\x1B".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
589     } elsif ($O{type} eq 'G96n') {
590     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
591     ."\x1B\x24".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
592     }
593     $C->{'G'.$G} = $CHARSET{$O{type}}->{$O{charset}};
594     }
595     ## -- invoke G buffer
596     my $left = $C->{option}->{Ginvoke_to_left}->[$G];
597     if ($C->{GL} eq 'G'.$G) {
598     $left = 1;
599     } elsif ($C->{GR} eq 'G'.$G) {
600     $left = 0;
601     } else {
602     if ($C->{option}->{Ginvoke_by_single_shift}->[$G]) {
603     if ($C->{C1}->{'C_SS'.$G}) {
604     $r .= _i2c ($C->{C1}->{'C_SS'.$G}, $C, type => 'C1') || return undef;
605     } elsif ($C->{C0}->{'C_SS'.$G}) {
606     $r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef;
607     } else { ## Both C0 and C1 set do not have SS2/3.
608 wakaba 1.4 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
609 wakaba 1.1 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
610     }
611     } else {
612     $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
613     $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
614     }
615     }
616     $s =~ tr/\x00-\x7F/\x80-\xFF/ unless $left;
617     $r . $s;
618     }
619     sub _back2ascii (%) {
620     my ($C, %O) = @_;
621     my $r = '';
622     if ($C->{option}->{reset}->{Gdesignation}) {
623     my $F = $C->{option}->{reset}->{Gdesignation}; # \x42
624     $r .= "\x1B\x28".$F unless $C->{G0} eq $CHARSET{G94}->{$F};
625     $C->{G0} = $CHARSET{G94}->{$F};
626     if ($O{reset_all}) {
627     $C->{G1} = $CHARSET{G94}->{"\x7E"};
628     $C->{G2} = $CHARSET{G94}->{"\x7E"};
629     $C->{G3} = $CHARSET{G94}->{"\x7E"};
630     }
631     }
632     if ($C->{option}->{reset}->{Ginvoke}) {
633     if ($C->{GL} ne 'G0') {
634     $r .= $C->{C0}->{C_LS0} || ($C->{C0} = $CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F");
635     $C->{GL} = 'G0';
636     }
637     $C->{GR} = undef if $O{reset_all};
638     }
639     $r;
640     }
641     ## __invoke (\%C, $G, $left_or_right)
642     sub __invoke (\%$$) {
643     my ($C, $G) = @_;
644     if ($_[2]) {
645     $C->{GL} = 'G'.$G;
646     return ($C->{C0}->{C_LS0}
647     || scalar ($C->{C0}=$CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F"),
648     $C->{C0}->{C_LS1}, "\x1B\x6E", "\x1B\x6F")[$G];
649     } else {
650     $C->{GR} = 'G'.$G;
651     return ("", "\x1B\x7E", "\x1B\x7D", "\x1B\x7C")[$G];
652     }
653     '';
654     }
655 wakaba 1.5 sub _i2o ($\%%) {
656     my ($s, $C, %O) = @_;
657     my $CS = $CODING_SYSTEM{ $O{cs_F} } || $CODING_SYSTEM{ $O{cs_id} } || return undef;
658     my $r = '';
659     if ($CS ne $C->{coding_system}) {
660     my $e = '';
661     $e .= "\x1B\x25";
662     $e .= $O{cs_F} || $C->{private_set}->{coding_system}->{ $O{cs_id} }
663     || return undef;
664     if ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x40"}
665     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x43"}
666     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x45"}
667     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4A"}
668     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4B"}
669     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4C"}) {
670     $e =~ s/(.)/\x00$1/go;
671     } elsif ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x41"}
672     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x44"}
673     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x46"}) {
674     $e =~ s/(.)/\x00\x00\x00$1/go;
675     }
676     $r .= $e;
677     $C->{coding_system} = $CS;
678     if ($CS->{reset_state}) {
679     $C->{GL} = undef; $C->{GR} = undef;
680     $C->{C0} = $CHARSET{C0}->{"\x7E"};
681     $C->{C1} = $CHARSET{C1}->{"\x7E"};
682     $C->{G0} = $CHARSET{G94}->{"\x7E"};
683     $C->{G1} = $CHARSET{G94}->{"\x7E"};
684     $C->{G2} = $CHARSET{G94}->{"\x7E"};
685     $C->{G3} = $CHARSET{G94}->{"\x7E"};
686     }
687     }
688     if ($CS eq $CODING_SYSTEM{"\x40"}) {
689     #
690     } elsif ($CS eq $CODING_SYSTEM{G} || $CS eq $CODING_SYSTEM{'/G'}
691     || $CS eq $CODING_SYSTEM{'/H'} || $CS eq $CODING_SYSTEM{'/I'}) {
692     Encode::_utf8_off ($s);
693     } elsif ($CS eq $CODING_SYSTEM{'/@'} || $CS eq $CODING_SYSTEM{'/C'}
694     || $CS eq $CODING_SYSTEM{'/E'}) {
695     $s = Encode::encode ('ucs-2be', $s);
696     } elsif ($CS eq $CODING_SYSTEM{'/A'} || $CS eq $CODING_SYSTEM{'/D'}
697     || $CS eq $CODING_SYSTEM{'/F'}) {
698     $s = Encode::encode ('ucs-4be', $s);
699     } elsif ($CS eq $CODING_SYSTEM{'/J'} || $CS eq $CODING_SYSTEM{'/K'}
700     || $CS eq $CODING_SYSTEM{'/L'}) {
701     $s = Encode::encode ('UTF-16BE', $s);
702     } elsif ($CS eq $CODING_SYSTEM{B}) {
703     $s = Encode::encode ('utf-1', $s);
704     } else {
705     return undef;
706     }
707     $r . $s;
708 wakaba 1.1 }
709    
710     1;
711     __END__
712    
713     =head1 SEE ALSO
714    
715     ISO/IEC 646:1991, "7-bit coded graphic character set for intormation interchange".
716    
717     ISO/IEC 2022:1994, "Character Code Structure and Extension Techniques".
718     (IDT with ECMA 35, JIS X 0202:1998)
719    
720     ISO/IEC 4873:1991, "8-Bit Coded Character Set Structure and Rules".
721     (IDT with ECMA 43)
722    
723     ISO/IEC 6429:1992, "Control Functions for Coded Character Sets".
724     (IDT with ECMA 48:1991, JIS X 0211:1998)
725    
726     ISO/IEC 8859, "8-Bit Single-Byte Coded Graphic Character Sets".
727    
728     Encode, perlunicode
729    
730 wakaba 1.4 =head1 TODO
731    
732     =over 4
733    
734     =item NCR (coding system other than ISO/IEC 2022) support
735    
736     =over 2
737    
738     =item ESC 02/05 02/15 03/x of X Compound Text
739    
740     =back
741    
742     =item Output of control character sets, single control functions
743    
744     =item Designation sequence of control character sets (input)
745    
746     =item Special graphic character sets such as G3 of EUC-TW
747    
748     =item SUPER SHIFT (SS) invoke function of old control character set
749    
750     =item Safe transparent of control string (ISO/IEC 6429)
751    
752     =item Output of unoutputable characters as alternative notation such as SGML-like entity
753    
754     =item C0 set invoked to CR area like ISIRI code
755    
756     Really need?
757    
758     =item special treatment of 0x20, 0x7E, 0xA0, 0xFF
759    
760     For example, GB mongolian sets use MSP (MONGOLIAN SPACE)
761     with these code positions.
762    
763     And, no less coding systems does not use (or does ban using) DEL.
764    
765     =item A lot of character sets don't have pseudo-UCS mapping.
766    
767     Most of 9m^n (n >= 3) sets, 9m^n sets with I byte, 9m^n
768     DRCSes do not have pseudo-UCS mapping area. It is
769     questionable to allocate lots of code positions to these
770     rarely-(or no-)used character sets.
771    
772     =item Even character sets that have pseudo-UCS mapping, some of them can't be outputed in ISO/IEC 2022.
773    
774     Because output of rarely-used character sets is
775     not implemented yet.
776    
777     =back
778    
779 wakaba 1.5 =head1 AUTHORS
780    
781     Nanashi-san
782    
783     Wakaba <w@suika.fam.cx>
784    
785 wakaba 1.1 =head1 LICENSE
786    
787 wakaba 1.5 Copyright 2002 AUTHORS
788 wakaba 1.1
789     This library is free software; you can redistribute it
790     and/or modify it under the same terms as Perl itself.
791    
792     =cut
793    
794 wakaba 1.5 # $Date: 2002/09/16 06:35:16 $
795 wakaba 1.1 ### ISO2022.pm ends here

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24