/[suikacvs]/perl/lib/Encode/ISO2022.pm
Suika

Contents of /perl/lib/Encode/ISO2022.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (hide annotations) (download)
Thu Dec 12 08:17:16 2002 UTC (21 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.10: +2 -3 lines
]

1 wakaba 1.1
2     =head1 NAME
3    
4     Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder
5    
6 wakaba 1.6 =head1 ENCODINGS
7    
8     =over 4
9    
10     =item iso2022
11    
12     ISO/IEC 2022:1994. Default status is:
13    
14     =over 2
15    
16     =item CL = C0 = ISO/IEC 6429:1991 C0 set
17    
18     =item CR = C1 = ISO/IEC 6429:1991 C1 set
19    
20     =item GL = G0 = ISO/IEC 646:1991 IRV GL(G0) set
21    
22     =item GR = G1 = empty set
23    
24     =item G2 = empty set
25    
26     =item G3 = empty set
27    
28     =back
29    
30     (Alias: iso/iec2022, iso-2022, 2022, cp2022)
31    
32     =back
33    
34     Note that ISO/IEC 2022 based encodings are found in
35     Encode::ISO2022::* modules. This module, Encode::ISO2022
36     only provides a general ISO/IEC 2022 encoder/decoder.
37    
38 wakaba 1.1 =cut
39    
40     require v5.7.3;
41     package Encode::ISO2022;
42     use strict;
43 wakaba 1.5 use vars qw(%CHARSET %CODING_SYSTEM $VERSION);
44 wakaba 1.11 $VERSION=do{my @r=(q$Revision: 1.10 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
45 wakaba 1.1 use base qw(Encode::Encoding);
46 wakaba 1.6 __PACKAGE__->Define (qw!iso-2022 iso/iec2022 iso2022 2022 cp2022!);
47 wakaba 1.5 require Encode::Charset;
48     *CHARSET = \%Encode::Charset::CHARSET;
49     *CODING_SYSTEM = \%Encode::Charset::CODING_SYSTEM;
50 wakaba 1.1
51     ### --- Perl Encode module common functions
52    
53     sub encode ($$;$) {
54     my ($obj, $str, $chk) = @_;
55     $_[1] = '' if $chk;
56     $str = &internal_to_iso2022 ($str);
57     return $str;
58     }
59    
60     sub decode ($$;$) {
61     my ($obj, $str, $chk) = @_;
62     $_[1] = '' if $chk;
63     return &iso2022_to_internal ($str);
64     }
65    
66     ### --- Encode::ISO2022 unique functions
67 wakaba 1.6 *new_object = \&Encode::Charset::new_object;
68 wakaba 1.1
69 wakaba 1.10 sub iso2022_to_internal ($;%) {
70 wakaba 1.1 my ($s, $C) = @_;
71 wakaba 1.6 $C ||= &new_object;
72 wakaba 1.5 my $t = '';
73 wakaba 1.9 $s =~ s{^((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)}{
74 wakaba 1.5 my $i2 = $1;
75     $t = _iso2022_to_internal ($i2, $C);
76     '';
77 wakaba 1.9 }es;
78 wakaba 1.6 my $pad = '';
79     use re 'eval';
80 wakaba 1.5 $s =~ s{
81     ## ISO/IEC 2022
82 wakaba 1.6 (??{"$pad\x1B$pad\x25$pad\x40"})((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
83 wakaba 1.5 ## UTF-8
84 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad(?:\x47|\x2F$pad"."[\x47-\x49])"})
85     ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
86 wakaba 1.5 ## UCS-2, UTF-16
87 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x40\x43\x45\x4A-\x4C])
88     ((?:(?!\x00\x1B\x00\x25(?:\x00\x2F)?\x00[\x30-\x7E])..)*)
89 wakaba 1.5 ## UCS-4
90 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})[\x41\x44\x46]
91     ((?:(?!\x00\x00\x00\x1B\x00\x00\x00\x25(?:\x00\x00\x00\x2F)?
92     \x00\x00\x00[\x30-\x7E])....)*)
93 wakaba 1.5 ## with standard return
94 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad"})([\x30-\x7E])
95     ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
96 wakaba 1.5 ## without standard return
97 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x30-\x7E])(.*)
98 wakaba 1.5 }{
99     my ($i2,$u8,$Fu2,$u2,$u4,$Fsr,$sr,$Fnsr,$nsr) = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
100     my $r = '';
101     if (defined $i2) {
102 wakaba 1.6 $r = _iso2022_to_internal ($i2, $C); $pad = '';
103 wakaba 1.5 } elsif (defined $u8) {
104 wakaba 1.6 $r = Encode::decode ('utf8', $u8); $pad = '';
105 wakaba 1.5 } elsif ($Fu2) {
106     if (ord ($Fu2) > 0x49) {
107     $r = Encode::decode ('utf-16be', $u2);
108     } else {
109     $r = Encode::decode ('ucs-2be', $u2);
110     }
111 wakaba 1.6 $pad = "\x00";
112 wakaba 1.5 } elsif (defined $u4) {
113 wakaba 1.6 $r = Encode::decode ('ucs-4be', $u2); $pad = "\x00\x00\x00";
114     } elsif (defined $Fsr && $CODING_SYSTEM{$Fsr}->{perl_name}) {
115     $r = Encode::decode ($CODING_SYSTEM{$Fsr}->{perl_name}, $sr); $pad = '';
116     } elsif (defined $Fnsr && $CODING_SYSTEM{$Fnsr}->{perl_name}) {
117     $r = Encode::decode ($CODING_SYSTEM{$Fnsr}->{perl_name}, $nsr); $pad = '';
118 wakaba 1.5 } else { ## temporary
119 wakaba 1.6 $r = '?' x length ($sr.$nsr); $pad = '';
120 wakaba 1.5 }
121     $r;
122     }gesx;
123     $t . $s;
124     }
125    
126 wakaba 1.9 # this is very very trickey. my perl 5.8.0 does not process
127     # regex with eval except the first time (i think it's a bug
128     # of perl), so we redefine this function whenever being called!
129     # when this unexpected behavior is fixed or someone finds
130     # better way to avoid it, we will rewrite this code.
131     &_iso2022_to_internal (undef);
132 wakaba 1.10 sub _iso2022_to_internal ($;%) {
133     eval q{ sub __iso2022_to_internal ($;%) { 0 } };
134 wakaba 1.9 eval q{
135 wakaba 1.10 sub __iso2022_to_internal ($;%) {
136 wakaba 1.9 use re 'eval';
137 wakaba 1.5 my ($s, $C) = @_;
138 wakaba 1.1 my %_GB_to_GN = (
139     "\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3',
140     "\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3',
141     );
142 wakaba 1.9 my %_CHARS_to_RANGE = (
143     l94 => q/[\x21-\x7E]/, l96 => q/[\x20-\x7F]/,
144     l128 => q/[\x00-\x7F]/, l256 => q/[\x00-\xFF]/,
145     r94 => q/[\xA1-\xFE]/, r96 => q/[\xA0-\xFF]/,
146     r128 => q/[\x80-\xFF]/, r256 => q/[\x80-\xFF]/,
147     b94 => q/[\x21-\x7E\xA1-\xFE]/, b96 => q/[\x20-\x7F\xA0-\xFF]/,
148     b128 => q/[\x00-\xFF]/, b256 => q/[\x00-\xFF]/,
149     );
150 wakaba 1.1
151     $s =~ s{
152     ((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}}
153     . qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ }))
154 wakaba 1.6 |((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GR}}->{chars}}
155 wakaba 1.9 . qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ }))
156 wakaba 1.1 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)')
157     . ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ?
158     qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '')
159     . ($C->{$C->{CL}}->{r_SS2} ? qq/|$C->{$C->{CL}}->{r_SS2}/ : '') . q/)/
160     . ( $C->{$C->{CL}}->{r_LS0}
161     ||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
162     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
163     })
164 wakaba 1.6 ((??{ $_CHARS_to_RANGE{'b'.$C->{G2}->{chars}}
165 wakaba 1.9 . qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ }))
166 wakaba 1.1 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)')
167     . ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ?
168     qq/|$C->{$C->{ESC_Fe}}->{r_SS3_ESC}/ : '')
169     . ($C->{$C->{CL}}->{r_SS3} ? qq/|$C->{$C->{CL}}->{r_SS3}/ : '') . q/)/
170     . ( $C->{$C->{CL}}->{r_LS0}
171     || $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
172     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
173     })
174 wakaba 1.6 ((??{ $_CHARS_to_RANGE{'b'.$C->{G3}->{chars}}
175 wakaba 1.1 . qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ }))
176    
177 wakaba 1.3 ## Locking shift
178 wakaba 1.6 |( (??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })
179 wakaba 1.3 |(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })
180     )
181 wakaba 1.1
182     ## Control sequence
183     |(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)')
184     .($C->{$C->{ESC_Fe}}->{r_CSI_ESC} ?
185     qq/|$C->{$C->{ESC_Fe}}->{r_CSI_ESC}/: '')
186     .')'
187     })
188     ((??{ qq/[\x30-\x3F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xB0-\xBF]*/
189     .qq/[\x20-\x2F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xA0-\xAF]*/
190     }) [\x40-\x7E\xD0-\xFE])
191    
192     ## Other escape sequence
193     |(\x1B[\x20-\x2F]*[\x30-\x7E])
194    
195     ## Misc. sequence (SP, control, or broken data)
196     |([\x00-\xFF])
197     }{
198 wakaba 1.3 my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc)
199     = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
200 wakaba 1.1 $C->{_irr} = undef unless defined $esc;
201     ## GL graphic character
202     if (defined $gl) {
203     my $c = 0;
204     my $m = $C->{$C->{GL}}->{chars}==94?0x21:$C->{$C->{GL}}->{chars}==96?0x20:0;
205     for (split //, $gl) {
206     $c = $c * $C->{$C->{GL}}->{chars} + unpack ('C', $_) - $m;
207     }
208     chr ($C->{$C->{GL}}->{ucs} + $c);
209     ## GR graphic character
210     } elsif ($gr) {
211     my $c = 0;
212     my $m = $C->{$C->{GR}}->{chars}==94?0xA1:$C->{$C->{GR}}->{chars}==96?0xA0:0x80;
213     for (split //, $gr) {
214     $c = $c * $C->{$C->{GR}}->{chars} + unpack ('C', $_) - $m;
215     }
216     chr ($C->{$C->{GR}}->{ucs} + $c);
217 wakaba 1.10 ## Control, SP, or broken data
218     ## TODO: support control sets other than ISO/IEC 6429's
219     } elsif (defined $misc) {
220     $misc;
221 wakaba 1.1 ## Graphic character with SS2
222     } elsif ($ss2) {
223     $ss2 =~ tr/\x80-\xFF/\x00-\x7F/;
224     my $c = 0; my $m = $C->{G2}->{chars}==94?0x21:$C->{G2}->{chars}==96?0x20:0;
225     for (split //, $ss2) {
226     $c = $c * $C->{G2}->{chars} + unpack ('C', $_) - $m;
227     }
228     chr ($C->{G2}->{ucs} + $c);
229     ## Graphic character with SS3
230     } elsif ($ss3) {
231     $ss3 =~ tr/\x80-\xFF/\x00-\x7F/;
232     my $c = 0; my $m = $C->{G3}->{chars}==94?0x21:$C->{G3}->{chars}==96?0x20:0;
233     for (split //, $ss3) {
234     $c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m;
235     }
236     chr ($C->{G3}->{ucs} + $c);
237     ## Escape sequence
238     } elsif ($esc) {
239 wakaba 1.10 if ($esc =~ /\x1B\x26([\x40-\x7E])/) { ## 6F (IRR) = ESC 02/06 Ft
240 wakaba 1.1 $C->{_irr} = $1; $esc = '';
241     } else {
242     $esc =~ s{
243     \x1B([\x28-\x2B])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^1
244     |\x1B\x24([\x28-\x2B]?)(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^n
245    
246     |\x1B([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^1
247     |\x1B\x24([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^n
248    
249     |\x1B([\x40-\x5F]) ## ESC Fe
250    
251     |\x1B\x21([\x21-\x23]?[\x30-\x7E]) ## CL = C0
252     |\x1B\x22([\x21-\x23]?[\x30-\x7E]) ## CR & ESC Fe = C1
253    
254     |\x1B([\x60-\x7E]) ## Single control functions
255     |\x1B\x23([\x21-\x23]?)([\x30-\x7E])
256    
257     |\x1B\x20([\x40-\x7E]) ## Announcer
258     }{
259     my ($g94_g,$g94_f,$g94n_g,$g94n_f,$g96_g,$g96_f,$g96n_g,$g96n_f,$Fe,
260     $CZD, $C1D, $Fs, $sI, $sF,$ACS)
261     = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15);
262     my $rev = $C->{_irr} || '';
263 wakaba 1.10 my $f2s = $C->{option}->{final_to_set};
264     if ($g94_g) { ## ESC 02/08 [I] F
265     $C->{ $_GB_to_GN{ $g94_g } }
266     = $CHARSET{G94}->{ $f2s->{G94}->{$g94_f.$rev} || $g94_f.$rev }
267     || $CHARSET{G94}->{ $f2s->{G94}->{$g94_f} || $g94_f }
268     || $CHARSET{G94}->{ "\x7E" }; '';
269     } elsif (defined $g94n_f) { ## ESC 02/04 [02/08..11] [I] F
270     $C->{ $_GB_to_GN{ $g94n_g } || 'G0' }
271     = $CHARSET{G94n}->{ $f2s->{G94n}->{$g94n_f.$rev} || $g94n_f.$rev }
272     || $CHARSET{G94n}->{ $f2s->{G94n}->{$g94n_f} || $g94n_f }
273     || $CHARSET{G94n}->{ "\x7E" }; '';
274     } elsif ($g96_g) { ## ESC 02/12..15 [I] F
275     $C->{ $_GB_to_GN{ $g96_g } }
276     = $CHARSET{G96}->{ $f2s->{G96}->{$g96_f.$rev} || $g96_f.$rev }
277     || $CHARSET{G96}->{ $f2s->{G96}->{$g96_f} || $g96_f }
278     || $CHARSET{G96}->{ "\x7E" }; '';
279     } elsif ($g96n_g) { ## ESC 02/04 02/12..15 [I] F
280     $C->{ $_GB_to_GN{ $g96n_g } }
281     = $CHARSET{G96n}->{ $f2s->{G96n}->{$g96n_f.$rev} || $g96n_f.$rev }
282     || $CHARSET{G96n}->{ $f2s->{G96n}->{$g96n_f} || $g96n_f }
283     || $CHARSET{G96n}->{ "\x7E" }; '';
284     } elsif ($Fe) { ## ESC Fe = C1
285 wakaba 1.1 chr ($C->{ $C->{ESC_Fe} }->{ucs} + (ord ($Fe) - 0x40));
286 wakaba 1.10 } elsif (defined $Fs) { ## ESC Fs
287 wakaba 1.1 if ($Fs eq "\x6E") { ## LS2
288     $C->{GL} = 'G2'; '';
289     } elsif ($Fs eq "\x6F") { ## LS3
290     $C->{GL} = 'G3'; '';
291 wakaba 1.7 } elsif ($Fs eq "\x7E" || $Fs eq "\x6B") { ## LS1R
292 wakaba 1.1 $C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; '';
293 wakaba 1.7 } elsif ($Fs eq "\x7D" || $Fs eq "\x6C") { ## LS2R
294 wakaba 1.1 $C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; '';
295 wakaba 1.7 } elsif ($Fs eq "\x7C" || $Fs eq "\x6D") { ## LS3R
296 wakaba 1.1 $C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; '';
297     } else {
298     chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60));
299     }
300 wakaba 1.10 } elsif (defined $CZD) { ## 1F (CZD) = ESC 02/01 [I] F
301     $C->{C0} = $CHARSET{C0}->{ $f2s->{C0}->{$CZD.$rev} || $CZD.$rev }
302     || $CHARSET{C0}->{ $f2s->{C0}->{$CZD} || $CZD }
303     || $CHARSET{C0}->{ "\x7E" }; '';
304     } elsif (defined $C1D) { ## 2F (C1D) = ESC 02/02 [I] F
305     $C->{C1} = $CHARSET{C1}->{ $f2s->{C1}->{$C1D.$rev} || $C1D.$rev }
306     || $CHARSET{C1}->{ $f2s->{C1}->{$C1D} || $C1D }
307     || $CHARSET{C1}->{ "\x7E" }; '';
308     } elsif ($sI) { ## 3F = ESC 02/03 [I] F
309 wakaba 1.1 chr ($CHARSET{single_control}->{'3F'.$sI}->{ucs} + (ord ($sF) - 0x30));
310 wakaba 1.10 } elsif ($ACS) { ## 0F (Announcer) = ESC 02/00 F
311 wakaba 1.1 if ($ACS eq "\x4A") { $C->{bit} = 7 }
312     elsif ($ACS eq "\x4B") { $C->{bit} = 8 }
313     '';
314     }
315     }gex;
316     $C->{_irr} = undef;
317     }
318     $esc;
319 wakaba 1.10 } elsif ($ls) { ## Locking shifts = LS0 / LS1
320 wakaba 1.3 if ($ls eq $C->{$C->{CL}}->{LS0}) {
321     $C->{GL} = 'G0'; '';
322     } elsif ($ls eq $C->{$C->{CL}}->{LS1}) {
323     $C->{GL} = 'G1'; '';
324     }
325 wakaba 1.10 } elsif ($csi) { ## Control sequence = CSI [P..] [I] F
326 wakaba 1.1 $csi =~ tr/\xA0-\xFF/\x20-\x7F/d;
327     $csi =~ s/$C->{$C->{CL}}->{LS0}//g if $C->{$C->{CL}}->{LS0};
328     $csi =~ s/$C->{$C->{CL}}->{LS1}//g if $C->{$C->{CL}}->{LS1};
329     "\x9B".$csi;
330     }
331 wakaba 1.9 }gesx;
332 wakaba 1.1 $s;
333 wakaba 1.9 } # __iso2022_to_internal
334    
335     };
336     &__iso2022_to_internal (@_) if defined $_[0];
337    
338     } # _iso2022_to_internal
339 wakaba 1.1
340 wakaba 1.10 sub internal_to_iso2022 ($;%) {
341 wakaba 1.1 my ($s, $C) = @_;
342     $C ||= &new_object;
343    
344     my $r = '';
345     for my $c (split //, $s) {
346 wakaba 1.9 my $cc = ord $c; Encode::_utf8_off ($c);
347 wakaba 1.1 my $t;
348     if ($cc <= 0x1F) {
349     $t = _i2c ($c, $C, type => 'C0', charset => '@');
350     } elsif ($cc == 0x20 || $cc == 0x7F) {
351     $t = _back2ascii ($C) . $c;
352     } elsif ($cc < 0x7F) {
353     $t = _i2g ($c, $C, type => 'G94', charset => 'B');
354     } elsif ($cc <= 0x9F) {
355 wakaba 1.10 $t = _i2c (pack ('C', $cc), $C, type => 'C1', charset_id => '64291991C1',
356 wakaba 1.8 charset => $C->{option}->{private_set}->{XC1}->{'64291991C1'});
357 wakaba 1.1 } elsif ($cc <= 0xFF) {
358 wakaba 1.10 $t = _i2g (pack ('C', $cc-0x80), $C, type => 'G96', charset => 'A');
359 wakaba 1.1 } elsif ($cc <= 0x24FF) {
360     my $c = $cc - 0x100;
361 wakaba 1.8 my $final = $C->{option}->{private_set}->{U96n}->[0];
362 wakaba 1.1 if (length $final) {
363     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
364     type => 'G96n', charset => $final);
365     }
366     } elsif ($cc <= 0x33FF) {
367     my $c = $cc - 0x2500;
368 wakaba 1.8 my $final = $C->{option}->{private_set}->{U96n}->[1];
369 wakaba 1.1 if (length $final) {
370     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
371     type => 'G96n', charset => $final);
372     }
373     } elsif (0xE000 <= $cc && $cc <= 0xFFFF) {
374     my $c = $cc - 0xE000;
375 wakaba 1.8 my $final = $C->{option}->{private_set}->{U96n}->[2];
376 wakaba 1.1 if (length $final) {
377     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
378     type => 'G96n', charset => $final);
379     }
380    
381     } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
382     my $c = $cc - 0xE9F6C0;
383     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
384     type => 'G94n', charset => chr(($c / 8836)+0x30));
385     } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
386     my $c = $cc - 0xF49D7C;
387     $t = _i2g (chr(($c / 94)+0x21).chr(($c % 94)+0x21), $C,
388     type => 'G94n', charset => 'B', revision => '@');
389    
390     } elsif (0xF0000 <= $cc && $cc <= 0x10F7FF) {
391     my $c = $cc - 0xF0000;
392     $t = _i2g (chr((($c % 9216) / 96)+0x20).chr(($c % 96)+0x20), $C,
393     type => 'G96n', charset => "\x20".chr(($c / 9216)+0x40));
394     } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
395     my $c = $cc - 0xE90940;
396     $t = _i2g (chr(($c % 94)+0x21), $C,
397     type => 'G94', charset => chr(($c / 94)+0x30));
398     } elsif (0xE92642 <= $cc && $cc <= 0xE9269F) {
399     my $c = $cc - 0xE92642;
400     $t = _i2g (chr($c+0x21),$C,type => 'G94', charset => "\x64", revision => '@');
401     } elsif (0xE926A0 <= $cc && $cc <= 0xE9443F) {
402     my $c = $cc - 0xE926A0;
403     $t = _i2g (chr(($c % 96)+0x20), $C,
404     type => 'G96', charset => chr(($c / 96)+0x30));
405     } elsif (0xE944A0 <= $cc && $cc <= 0xE961A1) {
406     my $c = $cc - 0xE944A0;
407     $t = _i2g (chr(($c % 94)+0x21), $C,
408     type => 'G94', charset => '!'.chr(($c / 94)+0x30));
409     } elsif (0xE96200 <= $cc && $cc <= 0xE97F9F) {
410     my $c = $cc - 0xE96200;
411     $t = _i2g (chr(($c % 96)+0x20), $C,
412     type => 'G96', charset => '!'.chr(($c / 96)+0x30));
413     } elsif (0xE98000 <= $cc && $cc <= 0xE99D01) {
414     my $c = $cc - 0xE98000;
415     $t = _i2g (chr(($c % 94)+0x21), $C,
416     type => 'G94', charset => '"'.chr(($c / 94)+0x30));
417     } elsif (0xE99D60 <= $cc && $cc <= 0xE9BAFF) {
418     my $c = $cc - 0xE99D60;
419     $t = _i2g (chr(($c % 96)+0x20), $C,
420     type => 'G96', charset => '"'.chr(($c / 96)+0x30));
421     } elsif (0xE9BB60 <= $cc && $cc <= 0xE9D861) {
422     my $c = $cc - 0xE9BB60;
423     $t = _i2g (chr(($c % 94)+0x21), $C,
424     type => 'G94', charset => '#'.chr(($c / 94)+0x30));
425     } elsif (0xE9D8C0 <= $cc && $cc <= 0xE9F65F) {
426     my $c = $cc - 0xE9D8C0;
427     $t = _i2g (chr(($c % 96)+0x20), $C,
428     type => 'G96', charset => '#'.chr(($c / 96)+0x30));
429     } elsif (0x70090940 <= $cc && $cc <= 0x70092641) {
430     my $c = $cc - 0x70090940;
431     $t = _i2g (chr(($c % 94)+0x21), $C,
432     type => 'G94', charset => "\x20".chr(($c / 94)+0x30));
433     } elsif (0x700926A0 <= $cc && $cc <= 0x7009443F) {
434     my $c = $cc - 0x700926A0;
435     $t = _i2g (chr(($c % 96)+0x20), $C,
436     type => 'G96', charset => "\x20".chr(($c / 96)+0x30));
437     ## TODO: DRCS with I byte: U+700944A0-U+7009F6BF
438     } elsif (0x70400000 <= $cc && $cc <= 0x7040FFED) {
439     my $c = $cc - 0x70400000;
440     $t = _i2g (chr(($c % 94)+0x21), $C, charset_id => 'P'.int ($c / 94),
441 wakaba 1.8 type => 'G94', charset => $C->{option}->{private_set}->{G94}->[ $c / 94 ]);
442 wakaba 1.1 } elsif (0x70410000 <= $cc && $cc <= 0x7041FFBF) {
443     my $c = $cc - 0x70410000;
444     $t = _i2g (chr(($c % 96)+0x20), $C, charset_id => 'P'.int ($c / 96),
445 wakaba 1.8 type => 'G96', charset => $C->{option}->{private_set}->{G96}->[ $c / 96 ]);
446 wakaba 1.1 } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
447     my $c = $cc % 0x10000;
448     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
449     type => 'G94n',
450 wakaba 1.2 charset_id => 'P'.int(($cc / 0x10000) - 0x7042).'_'.int($c / 8836),
451 wakaba 1.8 charset => $C->{option}->{private_set}->{G94n}
452     ->[ ($cc / 0x10000) - 0x7042 ]->[ $c / 8836 ]);
453 wakaba 1.1 }
454     if (defined $t) {
455 wakaba 1.5 $t = _i2o ($t, $C, cs_F => "\x40")
456     if $C->{coding_system} ne $CODING_SYSTEM{"\x40"};
457     } else {
458     my $F; my @F = qw~G /G /H /I B /A /D /F~;
459     push @F, qw~/J /K /L~ if $cc <= 0x10FFFF;
460     push @F, qw~/@ /C /E~ if $cc <= 0xFFFF;
461     for (@F) {
462     if (defined $C->{option}->{designate_to}->{coding_system}->{$_}
463     && $C->{option}->{designate_to}->{coding_system}->{$_} > -1) {
464     $F = $_; last;
465     } elsif ($C->{option}->{designate_to}->{coding_system}->{default} > -1) {
466     $F = $_; last;
467     }
468     }
469     $t = _i2o ($c, $C, cs_F => $F) if $F;
470     }
471     if (defined $t) {
472 wakaba 1.1 $r .= $t;
473     } else {
474 wakaba 1.7 unless ($C->{option}->{undef_char}->[0] eq "\x20") {
475     $t = _i2g ($C->{option}->{undef_char}->[0], $C,
476     %{ $C->{option}->{undef_char}->[1] });
477     } else { ## SP
478     $t = _back2ascii ($C) . "\x20";
479     }
480 wakaba 1.5 $r .= $C->{coding_system} eq $CODING_SYSTEM{"\x40"} ?
481     $t : _i2o ($t, $C, cs_F => "\x40");
482 wakaba 1.1 }
483     }
484     $r . _back2ascii ($C);
485     }
486    
487     ## $O{charset} eq undef means that charset is same as the current designated one.
488     sub _i2c ($%%) {
489     my ($s, $C, %O) = @_;
490     my $r = '';
491     if ($O{type} eq 'C0') {
492     if (defined $O{charset}) {
493     if ( $C->{C0} ne $CHARSET{C0}->{$O{charset}}
494     && $C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
495     for ($C->{option}->{designate_to}->{C0}->{$O{charset}},
496     $C->{option}->{designate_to}->{C0}->{default}) {
497     if (defined $_) { return undef if $_ == -1; last }
498     }
499     $r .= "\x1B\x21".$O{charset};
500     $C->{C0} = $CHARSET{C0}->{$O{charset}};
501     }
502     } elsif (defined $O{charset_id}) {
503     if ($C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
504     return undef; ## Control set is not designated nor has F byte
505     }
506     }
507     $r .= _back2ascii ($C, reset_all => $C->{C0}->{reset_all}->{$s});
508     return $r . $s;
509     } elsif ($O{type} eq 'C1') {
510     if (defined $O{charset}) {
511     if ( $C->{C1} ne $CHARSET{C1}->{$O{charset}}
512     && $C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
513     for ($C->{option}->{designate_to}->{C1}->{$O{charset}},
514     $C->{option}->{designate_to}->{C1}->{default}) {
515     if (defined $_) { return undef if $_ == -1; last }
516     }
517     $r .= "\x1B\x22".$O{charset};
518     $C->{C1} = $CHARSET{C1}->{$O{charset}};
519     }
520     } elsif (defined $O{charset_id}) {
521     if ($C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
522     return undef; ## Control set is not designated nor has F byte
523     }
524     }
525     $r .= _back2ascii ($C, reset_all => $C->{C1}->{reset_all}->{$s});
526     unless ($C->{option}->{C1invoke_to_right}) { ## ESC Fe
527 wakaba 1.10 $s =~ s/([\x80-\x9F])/"\x1B" . pack ('C', ord ($1) - 0x40)/ge;
528 wakaba 1.1 }
529     return $r . $s;
530     }
531     }
532     sub _i2g ($%%) {
533     my ($s, $C, %O) = @_;
534     my $r = '';
535 wakaba 1.4 my $set = $CHARSET{$O{type}}->{$O{charset}.
536     ($O{revision}&&$C->{option}->{use_revision}?$O{revision}:'')};
537 wakaba 1.1 my $set0 = $CHARSET{$O{type}}->{$O{charset_id}};
538     ## -- designate character set
539     my $G = 0;
540     if ($C->{G0} eq $set || $C->{G0} eq $set0) { $G = 0 }
541     elsif ($C->{G1} eq $set || $C->{G1} eq $set0) { $G = 1 }
542     elsif ($C->{G2} eq $set || $C->{G2} eq $set0) { $G = 2 }
543     elsif ($C->{G3} eq $set || $C->{G3} eq $set0) { $G = 3 }
544     else {
545     return undef unless $set; ## charset does not have F byte
546     $G = 1 if $O{type} eq 'G96' || $O{type} eq 'G96n';
547     for ($C->{option}->{designate_to}->{$O{type}}->{$O{charset}},
548     $C->{option}->{designate_to}->{$O{type}}->{default}) {
549     if (defined $_) {
550     $G = $_; last;
551     }
552     }
553     if ($G == -1) {
554     return undef;
555     }
556     if ($O{type} eq 'G94') {
557     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
558     ."\x1B".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
559     } elsif ($O{type} eq 'G94n') {
560     if ($G == 0 && !$C->{option}->{G94n_designate_long}
561     && ($O{charset} eq '@' || $O{charset} eq 'A' || $O{charset} eq 'B')) {
562     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
563     ."\x1B\x24".$O{charset};
564     } else {
565     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
566     ."\x1B\x24".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
567     }
568     } elsif ($O{type} eq 'G96') {
569     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
570     ."\x1B".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
571     } elsif ($O{type} eq 'G96n') {
572     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
573     ."\x1B\x24".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
574     }
575     $C->{'G'.$G} = $CHARSET{$O{type}}->{$O{charset}};
576     }
577     ## -- invoke G buffer
578     my $left = $C->{option}->{Ginvoke_to_left}->[$G];
579     if ($C->{GL} eq 'G'.$G) {
580     $left = 1;
581     } elsif ($C->{GR} eq 'G'.$G) {
582     $left = 0;
583     } else {
584     if ($C->{option}->{Ginvoke_by_single_shift}->[$G]) {
585     if ($C->{C1}->{'C_SS'.$G}) {
586     $r .= _i2c ($C->{C1}->{'C_SS'.$G}, $C, type => 'C1') || return undef;
587     } elsif ($C->{C0}->{'C_SS'.$G}) {
588     $r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef;
589     } else { ## Both C0 and C1 set do not have SS2/3.
590 wakaba 1.4 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
591 wakaba 1.1 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
592     }
593     } else {
594     $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
595     $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
596     }
597     }
598     $s =~ tr/\x00-\x7F/\x80-\xFF/ unless $left;
599     $r . $s;
600     }
601     sub _back2ascii (%) {
602     my ($C, %O) = @_;
603     my $r = '';
604     if ($C->{option}->{reset}->{Gdesignation}) {
605     my $F = $C->{option}->{reset}->{Gdesignation}; # \x42
606     $r .= "\x1B\x28".$F unless $C->{G0} eq $CHARSET{G94}->{$F};
607     $C->{G0} = $CHARSET{G94}->{$F};
608     if ($O{reset_all}) {
609     $C->{G1} = $CHARSET{G94}->{"\x7E"};
610     $C->{G2} = $CHARSET{G94}->{"\x7E"};
611     $C->{G3} = $CHARSET{G94}->{"\x7E"};
612     }
613     }
614     if ($C->{option}->{reset}->{Ginvoke}) {
615     if ($C->{GL} ne 'G0') {
616     $r .= $C->{C0}->{C_LS0} || ($C->{C0} = $CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F");
617     $C->{GL} = 'G0';
618     }
619     $C->{GR} = undef if $O{reset_all};
620     }
621     $r;
622     }
623     ## __invoke (\%C, $G, $left_or_right)
624     sub __invoke (\%$$) {
625     my ($C, $G) = @_;
626     if ($_[2]) {
627     $C->{GL} = 'G'.$G;
628     return ($C->{C0}->{C_LS0}
629     || scalar ($C->{C0}=$CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F"),
630     $C->{C0}->{C_LS1}, "\x1B\x6E", "\x1B\x6F")[$G];
631     } else {
632     $C->{GR} = 'G'.$G;
633     return ("", "\x1B\x7E", "\x1B\x7D", "\x1B\x7C")[$G];
634     }
635     '';
636     }
637 wakaba 1.5 sub _i2o ($\%%) {
638     my ($s, $C, %O) = @_;
639     my $CS = $CODING_SYSTEM{ $O{cs_F} } || $CODING_SYSTEM{ $O{cs_id} } || return undef;
640     my $r = '';
641     if ($CS ne $C->{coding_system}) {
642     my $e = '';
643     $e .= "\x1B\x25";
644 wakaba 1.8 $e .= $O{cs_F} || $C->{option}->{private_set}->{coding_system}->{ $O{cs_id} }
645 wakaba 1.5 || return undef;
646     if ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x40"}
647     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x43"}
648     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x45"}
649     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4A"}
650     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4B"}
651     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4C"}) {
652     $e =~ s/(.)/\x00$1/go;
653     } elsif ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x41"}
654     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x44"}
655     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x46"}) {
656     $e =~ s/(.)/\x00\x00\x00$1/go;
657     }
658     $r .= $e;
659     $C->{coding_system} = $CS;
660     if ($CS->{reset_state}) {
661     $C->{GL} = undef; $C->{GR} = undef;
662     $C->{C0} = $CHARSET{C0}->{"\x7E"};
663     $C->{C1} = $CHARSET{C1}->{"\x7E"};
664     $C->{G0} = $CHARSET{G94}->{"\x7E"};
665     $C->{G1} = $CHARSET{G94}->{"\x7E"};
666     $C->{G2} = $CHARSET{G94}->{"\x7E"};
667     $C->{G3} = $CHARSET{G94}->{"\x7E"};
668     }
669     }
670     if ($CS eq $CODING_SYSTEM{"\x40"}) {
671     #
672     } elsif ($CS eq $CODING_SYSTEM{G} || $CS eq $CODING_SYSTEM{'/G'}
673     || $CS eq $CODING_SYSTEM{'/H'} || $CS eq $CODING_SYSTEM{'/I'}) {
674     Encode::_utf8_off ($s);
675     } elsif ($CS eq $CODING_SYSTEM{'/@'} || $CS eq $CODING_SYSTEM{'/C'}
676     || $CS eq $CODING_SYSTEM{'/E'}) {
677     $s = Encode::encode ('ucs-2be', $s);
678     } elsif ($CS eq $CODING_SYSTEM{'/A'} || $CS eq $CODING_SYSTEM{'/D'}
679     || $CS eq $CODING_SYSTEM{'/F'}) {
680     $s = Encode::encode ('ucs-4be', $s);
681     } elsif ($CS eq $CODING_SYSTEM{'/J'} || $CS eq $CODING_SYSTEM{'/K'}
682     || $CS eq $CODING_SYSTEM{'/L'}) {
683     $s = Encode::encode ('UTF-16BE', $s);
684     } elsif ($CS eq $CODING_SYSTEM{B}) {
685     $s = Encode::encode ('utf-1', $s);
686     } else {
687     return undef;
688     }
689     $r . $s;
690 wakaba 1.1 }
691    
692     1;
693     __END__
694    
695     =head1 SEE ALSO
696    
697     ISO/IEC 646:1991, "7-bit coded graphic character set for intormation interchange".
698    
699     ISO/IEC 2022:1994, "Character Code Structure and Extension Techniques".
700     (IDT with ECMA 35, JIS X 0202:1998)
701    
702     ISO/IEC 4873:1991, "8-Bit Coded Character Set Structure and Rules".
703     (IDT with ECMA 43)
704    
705     ISO/IEC 6429:1992, "Control Functions for Coded Character Sets".
706     (IDT with ECMA 48:1991, JIS X 0211:1998)
707    
708     ISO/IEC 8859, "8-Bit Single-Byte Coded Graphic Character Sets".
709    
710     Encode, perlunicode
711    
712 wakaba 1.4 =head1 TODO
713    
714     =over 4
715    
716     =item NCR (coding system other than ISO/IEC 2022) support
717    
718     =over 2
719    
720     =item ESC 02/05 02/15 03/x of X Compound Text
721    
722     =back
723    
724     =item Output of control character sets, single control functions
725    
726     =item Designation sequence of control character sets (input)
727    
728     =item Special graphic character sets such as G3 of EUC-TW
729    
730     =item SUPER SHIFT (SS) invoke function of old control character set
731    
732     =item Safe transparent of control string (ISO/IEC 6429)
733    
734     =item Output of unoutputable characters as alternative notation such as SGML-like entity
735    
736     =item C0 set invoked to CR area like ISIRI code
737    
738     Really need?
739    
740     =item special treatment of 0x20, 0x7E, 0xA0, 0xFF
741    
742     For example, GB mongolian sets use MSP (MONGOLIAN SPACE)
743     with these code positions.
744    
745     And, no less coding systems does not use (or does ban using) DEL.
746    
747     =item A lot of character sets don't have pseudo-UCS mapping.
748    
749     Most of 9m^n (n >= 3) sets, 9m^n sets with I byte, 9m^n
750     DRCSes do not have pseudo-UCS mapping area. It is
751     questionable to allocate lots of code positions to these
752     rarely-(or no-)used character sets.
753    
754     =item Even character sets that have pseudo-UCS mapping, some of them can't be outputed in ISO/IEC 2022.
755    
756     Because output of rarely-used character sets is
757     not implemented yet.
758    
759     =back
760    
761 wakaba 1.5 =head1 AUTHORS
762    
763     Nanashi-san
764    
765     Wakaba <w@suika.fam.cx>
766    
767 wakaba 1.1 =head1 LICENSE
768    
769 wakaba 1.5 Copyright 2002 AUTHORS
770 wakaba 1.1
771     This library is free software; you can redistribute it
772     and/or modify it under the same terms as Perl itself.
773    
774     =cut
775    
776 wakaba 1.11 # $Date: 2002/10/16 10:39:35 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24