/[suikacvs]/perl/lib/Encode/ISO2022.pm
Suika

Contents of /perl/lib/Encode/ISO2022.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.14 - (hide annotations) (download)
Wed Dec 18 10:21:09 2002 UTC (21 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.13: +3 -3 lines
*** empty log message ***

1 wakaba 1.1
2     =head1 NAME
3    
4     Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder
5    
6 wakaba 1.6 =head1 ENCODINGS
7    
8     =over 4
9    
10     =item iso2022
11    
12     ISO/IEC 2022:1994. Default status is:
13    
14     =over 2
15    
16     =item CL = C0 = ISO/IEC 6429:1991 C0 set
17    
18     =item CR = C1 = ISO/IEC 6429:1991 C1 set
19    
20     =item GL = G0 = ISO/IEC 646:1991 IRV GL(G0) set
21    
22     =item GR = G1 = empty set
23    
24     =item G2 = empty set
25    
26     =item G3 = empty set
27    
28     =back
29    
30     (Alias: iso/iec2022, iso-2022, 2022, cp2022)
31    
32     =back
33    
34     Note that ISO/IEC 2022 based encodings are found in
35     Encode::ISO2022::* modules. This module, Encode::ISO2022
36     only provides a general ISO/IEC 2022 encoder/decoder.
37    
38 wakaba 1.1 =cut
39    
40     require v5.7.3;
41     package Encode::ISO2022;
42     use strict;
43 wakaba 1.5 use vars qw(%CHARSET %CODING_SYSTEM $VERSION);
44 wakaba 1.14 $VERSION=do{my @r=(q$Revision: 1.13 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
45 wakaba 1.1 use base qw(Encode::Encoding);
46 wakaba 1.6 __PACKAGE__->Define (qw!iso-2022 iso/iec2022 iso2022 2022 cp2022!);
47 wakaba 1.5 require Encode::Charset;
48     *CHARSET = \%Encode::Charset::CHARSET;
49     *CODING_SYSTEM = \%Encode::Charset::CODING_SYSTEM;
50 wakaba 1.1
51     ### --- Perl Encode module common functions
52    
53     sub encode ($$;$) {
54     my ($obj, $str, $chk) = @_;
55     $_[1] = '' if $chk;
56     $str = &internal_to_iso2022 ($str);
57     return $str;
58     }
59    
60     sub decode ($$;$) {
61     my ($obj, $str, $chk) = @_;
62     $_[1] = '' if $chk;
63     return &iso2022_to_internal ($str);
64     }
65    
66     ### --- Encode::ISO2022 unique functions
67 wakaba 1.6 *new_object = \&Encode::Charset::new_object;
68 wakaba 1.1
69 wakaba 1.10 sub iso2022_to_internal ($;%) {
70 wakaba 1.1 my ($s, $C) = @_;
71 wakaba 1.6 $C ||= &new_object;
72 wakaba 1.5 my $t = '';
73 wakaba 1.9 $s =~ s{^((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)}{
74 wakaba 1.5 my $i2 = $1;
75     $t = _iso2022_to_internal ($i2, $C);
76     '';
77 wakaba 1.9 }es;
78 wakaba 1.6 my $pad = '';
79     use re 'eval';
80 wakaba 1.5 $s =~ s{
81     ## ISO/IEC 2022
82 wakaba 1.6 (??{"$pad\x1B$pad\x25$pad\x40"})((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
83 wakaba 1.5 ## UTF-8
84 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad(?:\x47|\x2F$pad"."[\x47-\x49])"})
85     ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
86 wakaba 1.5 ## UCS-2, UTF-16
87 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x40\x43\x45\x4A-\x4C])
88     ((?:(?!\x00\x1B\x00\x25(?:\x00\x2F)?\x00[\x30-\x7E])..)*)
89 wakaba 1.5 ## UCS-4
90 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})[\x41\x44\x46]
91     ((?:(?!\x00\x00\x00\x1B\x00\x00\x00\x25(?:\x00\x00\x00\x2F)?
92     \x00\x00\x00[\x30-\x7E])....)*)
93 wakaba 1.5 ## with standard return
94 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad"})([\x30-\x7E])
95     ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
96 wakaba 1.5 ## without standard return
97 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x30-\x7E])(.*)
98 wakaba 1.5 }{
99     my ($i2,$u8,$Fu2,$u2,$u4,$Fsr,$sr,$Fnsr,$nsr) = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
100     my $r = '';
101     if (defined $i2) {
102 wakaba 1.6 $r = _iso2022_to_internal ($i2, $C); $pad = '';
103 wakaba 1.5 } elsif (defined $u8) {
104 wakaba 1.6 $r = Encode::decode ('utf8', $u8); $pad = '';
105 wakaba 1.5 } elsif ($Fu2) {
106     if (ord ($Fu2) > 0x49) {
107     $r = Encode::decode ('utf-16be', $u2);
108     } else {
109     $r = Encode::decode ('ucs-2be', $u2);
110     }
111 wakaba 1.6 $pad = "\x00";
112 wakaba 1.5 } elsif (defined $u4) {
113 wakaba 1.6 $r = Encode::decode ('ucs-4be', $u2); $pad = "\x00\x00\x00";
114     } elsif (defined $Fsr && $CODING_SYSTEM{$Fsr}->{perl_name}) {
115     $r = Encode::decode ($CODING_SYSTEM{$Fsr}->{perl_name}, $sr); $pad = '';
116     } elsif (defined $Fnsr && $CODING_SYSTEM{$Fnsr}->{perl_name}) {
117     $r = Encode::decode ($CODING_SYSTEM{$Fnsr}->{perl_name}, $nsr); $pad = '';
118 wakaba 1.5 } else { ## temporary
119 wakaba 1.6 $r = '?' x length ($sr.$nsr); $pad = '';
120 wakaba 1.5 }
121     $r;
122     }gesx;
123     $t . $s;
124     }
125    
126 wakaba 1.9 # this is very very trickey. my perl 5.8.0 does not process
127     # regex with eval except the first time (i think it's a bug
128     # of perl), so we redefine this function whenever being called!
129     # when this unexpected behavior is fixed or someone finds
130     # better way to avoid it, we will rewrite this code.
131     &_iso2022_to_internal (undef);
132 wakaba 1.10 sub _iso2022_to_internal ($;%) {
133     eval q{ sub __iso2022_to_internal ($;%) { 0 } };
134 wakaba 1.9 eval q{
135 wakaba 1.10 sub __iso2022_to_internal ($;%) {
136 wakaba 1.9 use re 'eval';
137 wakaba 1.5 my ($s, $C) = @_;
138 wakaba 1.1 my %_GB_to_GN = (
139     "\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3',
140     "\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3',
141     );
142 wakaba 1.9 my %_CHARS_to_RANGE = (
143     l94 => q/[\x21-\x7E]/, l96 => q/[\x20-\x7F]/,
144     l128 => q/[\x00-\x7F]/, l256 => q/[\x00-\xFF]/,
145     r94 => q/[\xA1-\xFE]/, r96 => q/[\xA0-\xFF]/,
146     r128 => q/[\x80-\xFF]/, r256 => q/[\x80-\xFF]/,
147     b94 => q/[\x21-\x7E\xA1-\xFE]/, b96 => q/[\x20-\x7F\xA0-\xFF]/,
148     b128 => q/[\x00-\xFF]/, b256 => q/[\x00-\xFF]/,
149     );
150 wakaba 1.1
151     $s =~ s{
152     ((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}}
153     . qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ }))
154 wakaba 1.6 |((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GR}}->{chars}}
155 wakaba 1.9 . qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ }))
156 wakaba 1.1 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)')
157     . ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ?
158     qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '')
159     . ($C->{$C->{CL}}->{r_SS2} ? qq/|$C->{$C->{CL}}->{r_SS2}/ : '') . q/)/
160     . ( $C->{$C->{CL}}->{r_LS0}
161     ||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
162     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
163     })
164 wakaba 1.6 ((??{ $_CHARS_to_RANGE{'b'.$C->{G2}->{chars}}
165 wakaba 1.9 . qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ }))
166 wakaba 1.1 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)')
167     . ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ?
168     qq/|$C->{$C->{ESC_Fe}}->{r_SS3_ESC}/ : '')
169     . ($C->{$C->{CL}}->{r_SS3} ? qq/|$C->{$C->{CL}}->{r_SS3}/ : '') . q/)/
170     . ( $C->{$C->{CL}}->{r_LS0}
171     || $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
172     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
173     })
174 wakaba 1.6 ((??{ $_CHARS_to_RANGE{'b'.$C->{G3}->{chars}}
175 wakaba 1.1 . qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ }))
176    
177 wakaba 1.3 ## Locking shift
178 wakaba 1.6 |( (??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })
179 wakaba 1.3 |(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })
180     )
181 wakaba 1.1
182     ## Control sequence
183     |(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)')
184     .($C->{$C->{ESC_Fe}}->{r_CSI_ESC} ?
185     qq/|$C->{$C->{ESC_Fe}}->{r_CSI_ESC}/: '')
186     .')'
187     })
188     ((??{ qq/[\x30-\x3F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xB0-\xBF]*/
189     .qq/[\x20-\x2F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xA0-\xAF]*/
190     }) [\x40-\x7E\xD0-\xFE])
191    
192     ## Other escape sequence
193     |(\x1B[\x20-\x2F]*[\x30-\x7E])
194    
195     ## Misc. sequence (SP, control, or broken data)
196     |([\x00-\xFF])
197     }{
198 wakaba 1.3 my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc)
199     = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
200 wakaba 1.1 $C->{_irr} = undef unless defined $esc;
201     ## GL graphic character
202     if (defined $gl) {
203     my $c = 0;
204     my $m = $C->{$C->{GL}}->{chars}==94?0x21:$C->{$C->{GL}}->{chars}==96?0x20:0;
205     for (split //, $gl) {
206     $c = $c * $C->{$C->{GL}}->{chars} + unpack ('C', $_) - $m;
207     }
208     chr ($C->{$C->{GL}}->{ucs} + $c);
209     ## GR graphic character
210     } elsif ($gr) {
211     my $c = 0;
212     my $m = $C->{$C->{GR}}->{chars}==94?0xA1:$C->{$C->{GR}}->{chars}==96?0xA0:0x80;
213     for (split //, $gr) {
214     $c = $c * $C->{$C->{GR}}->{chars} + unpack ('C', $_) - $m;
215     }
216     chr ($C->{$C->{GR}}->{ucs} + $c);
217 wakaba 1.10 ## Control, SP, or broken data
218     ## TODO: support control sets other than ISO/IEC 6429's
219     } elsif (defined $misc) {
220     $misc;
221 wakaba 1.1 ## Graphic character with SS2
222     } elsif ($ss2) {
223     $ss2 =~ tr/\x80-\xFF/\x00-\x7F/;
224     my $c = 0; my $m = $C->{G2}->{chars}==94?0x21:$C->{G2}->{chars}==96?0x20:0;
225     for (split //, $ss2) {
226     $c = $c * $C->{G2}->{chars} + unpack ('C', $_) - $m;
227     }
228     chr ($C->{G2}->{ucs} + $c);
229     ## Graphic character with SS3
230     } elsif ($ss3) {
231     $ss3 =~ tr/\x80-\xFF/\x00-\x7F/;
232     my $c = 0; my $m = $C->{G3}->{chars}==94?0x21:$C->{G3}->{chars}==96?0x20:0;
233     for (split //, $ss3) {
234     $c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m;
235     }
236     chr ($C->{G3}->{ucs} + $c);
237     ## Escape sequence
238     } elsif ($esc) {
239 wakaba 1.10 if ($esc =~ /\x1B\x26([\x40-\x7E])/) { ## 6F (IRR) = ESC 02/06 Ft
240 wakaba 1.1 $C->{_irr} = $1; $esc = '';
241     } else {
242     $esc =~ s{
243     \x1B([\x28-\x2B])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^1
244     |\x1B\x24([\x28-\x2B]?)(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^n
245    
246     |\x1B([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^1
247     |\x1B\x24([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^n
248    
249     |\x1B([\x40-\x5F]) ## ESC Fe
250    
251     |\x1B\x21([\x21-\x23]?[\x30-\x7E]) ## CL = C0
252     |\x1B\x22([\x21-\x23]?[\x30-\x7E]) ## CR & ESC Fe = C1
253    
254     |\x1B([\x60-\x7E]) ## Single control functions
255     |\x1B\x23([\x21-\x23]?)([\x30-\x7E])
256    
257     |\x1B\x20([\x40-\x7E]) ## Announcer
258     }{
259     my ($g94_g,$g94_f,$g94n_g,$g94n_f,$g96_g,$g96_f,$g96n_g,$g96n_f,$Fe,
260     $CZD, $C1D, $Fs, $sI, $sF,$ACS)
261     = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15);
262     my $rev = $C->{_irr} || '';
263 wakaba 1.10 my $f2s = $C->{option}->{final_to_set};
264     if ($g94_g) { ## ESC 02/08 [I] F
265     $C->{ $_GB_to_GN{ $g94_g } }
266     = $CHARSET{G94}->{ $f2s->{G94}->{$g94_f.$rev} || $g94_f.$rev }
267     || $CHARSET{G94}->{ $f2s->{G94}->{$g94_f} || $g94_f }
268     || $CHARSET{G94}->{ "\x7E" }; '';
269     } elsif (defined $g94n_f) { ## ESC 02/04 [02/08..11] [I] F
270     $C->{ $_GB_to_GN{ $g94n_g } || 'G0' }
271     = $CHARSET{G94n}->{ $f2s->{G94n}->{$g94n_f.$rev} || $g94n_f.$rev }
272     || $CHARSET{G94n}->{ $f2s->{G94n}->{$g94n_f} || $g94n_f }
273     || $CHARSET{G94n}->{ "\x7E" }; '';
274     } elsif ($g96_g) { ## ESC 02/12..15 [I] F
275     $C->{ $_GB_to_GN{ $g96_g } }
276     = $CHARSET{G96}->{ $f2s->{G96}->{$g96_f.$rev} || $g96_f.$rev }
277     || $CHARSET{G96}->{ $f2s->{G96}->{$g96_f} || $g96_f }
278     || $CHARSET{G96}->{ "\x7E" }; '';
279     } elsif ($g96n_g) { ## ESC 02/04 02/12..15 [I] F
280     $C->{ $_GB_to_GN{ $g96n_g } }
281     = $CHARSET{G96n}->{ $f2s->{G96n}->{$g96n_f.$rev} || $g96n_f.$rev }
282     || $CHARSET{G96n}->{ $f2s->{G96n}->{$g96n_f} || $g96n_f }
283     || $CHARSET{G96n}->{ "\x7E" }; '';
284     } elsif ($Fe) { ## ESC Fe = C1
285 wakaba 1.1 chr ($C->{ $C->{ESC_Fe} }->{ucs} + (ord ($Fe) - 0x40));
286 wakaba 1.10 } elsif (defined $Fs) { ## ESC Fs
287 wakaba 1.1 if ($Fs eq "\x6E") { ## LS2
288     $C->{GL} = 'G2'; '';
289     } elsif ($Fs eq "\x6F") { ## LS3
290     $C->{GL} = 'G3'; '';
291 wakaba 1.7 } elsif ($Fs eq "\x7E" || $Fs eq "\x6B") { ## LS1R
292 wakaba 1.1 $C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; '';
293 wakaba 1.7 } elsif ($Fs eq "\x7D" || $Fs eq "\x6C") { ## LS2R
294 wakaba 1.1 $C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; '';
295 wakaba 1.7 } elsif ($Fs eq "\x7C" || $Fs eq "\x6D") { ## LS3R
296 wakaba 1.1 $C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; '';
297     } else {
298     chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60));
299     }
300 wakaba 1.10 } elsif (defined $CZD) { ## 1F (CZD) = ESC 02/01 [I] F
301     $C->{C0} = $CHARSET{C0}->{ $f2s->{C0}->{$CZD.$rev} || $CZD.$rev }
302     || $CHARSET{C0}->{ $f2s->{C0}->{$CZD} || $CZD }
303     || $CHARSET{C0}->{ "\x7E" }; '';
304     } elsif (defined $C1D) { ## 2F (C1D) = ESC 02/02 [I] F
305     $C->{C1} = $CHARSET{C1}->{ $f2s->{C1}->{$C1D.$rev} || $C1D.$rev }
306     || $CHARSET{C1}->{ $f2s->{C1}->{$C1D} || $C1D }
307     || $CHARSET{C1}->{ "\x7E" }; '';
308     } elsif ($sI) { ## 3F = ESC 02/03 [I] F
309 wakaba 1.1 chr ($CHARSET{single_control}->{'3F'.$sI}->{ucs} + (ord ($sF) - 0x30));
310 wakaba 1.10 } elsif ($ACS) { ## 0F (Announcer) = ESC 02/00 F
311 wakaba 1.1 if ($ACS eq "\x4A") { $C->{bit} = 7 }
312     elsif ($ACS eq "\x4B") { $C->{bit} = 8 }
313     '';
314     }
315     }gex;
316     $C->{_irr} = undef;
317     }
318     $esc;
319 wakaba 1.10 } elsif ($ls) { ## Locking shifts = LS0 / LS1
320 wakaba 1.3 if ($ls eq $C->{$C->{CL}}->{LS0}) {
321     $C->{GL} = 'G0'; '';
322     } elsif ($ls eq $C->{$C->{CL}}->{LS1}) {
323     $C->{GL} = 'G1'; '';
324     }
325 wakaba 1.10 } elsif ($csi) { ## Control sequence = CSI [P..] [I] F
326 wakaba 1.1 $csi =~ tr/\xA0-\xFF/\x20-\x7F/d;
327     $csi =~ s/$C->{$C->{CL}}->{LS0}//g if $C->{$C->{CL}}->{LS0};
328     $csi =~ s/$C->{$C->{CL}}->{LS1}//g if $C->{$C->{CL}}->{LS1};
329     "\x9B".$csi;
330     }
331 wakaba 1.9 }gesx;
332 wakaba 1.1 $s;
333 wakaba 1.9 } # __iso2022_to_internal
334    
335     };
336     &__iso2022_to_internal (@_) if defined $_[0];
337    
338     } # _iso2022_to_internal
339 wakaba 1.1
340 wakaba 1.10 sub internal_to_iso2022 ($;%) {
341 wakaba 1.1 my ($s, $C) = @_;
342     $C ||= &new_object;
343    
344     my $r = '';
345 wakaba 1.12 my @c = split //, $s;
346     for my $i (0..$#c) {
347     my $c = $c[$i]; my $cc = ord $c; Encode::_utf8_off ($c);
348 wakaba 1.1 my $t;
349     if ($cc <= 0x1F) {
350     $t = _i2c ($c, $C, type => 'C0', charset => '@');
351     } elsif ($cc == 0x20 || $cc == 0x7F) {
352     $t = _back2ascii ($C) . $c;
353     } elsif ($cc < 0x7F) {
354     $t = _i2g ($c, $C, type => 'G94', charset => 'B');
355     } elsif ($cc <= 0x9F) {
356 wakaba 1.10 $t = _i2c (pack ('C', $cc), $C, type => 'C1', charset_id => '64291991C1',
357 wakaba 1.8 charset => $C->{option}->{private_set}->{XC1}->{'64291991C1'});
358 wakaba 1.1 } elsif ($cc <= 0xFF) {
359 wakaba 1.10 $t = _i2g (pack ('C', $cc-0x80), $C, type => 'G96', charset => 'A');
360 wakaba 1.1 } elsif ($cc <= 0x24FF) {
361     my $c = $cc - 0x100;
362 wakaba 1.8 my $final = $C->{option}->{private_set}->{U96n}->[0];
363 wakaba 1.1 if (length $final) {
364     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
365     type => 'G96n', charset => $final);
366     }
367     } elsif ($cc <= 0x33FF) {
368     my $c = $cc - 0x2500;
369 wakaba 1.8 my $final = $C->{option}->{private_set}->{U96n}->[1];
370 wakaba 1.1 if (length $final) {
371     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
372     type => 'G96n', charset => $final);
373     }
374     } elsif (0xE000 <= $cc && $cc <= 0xFFFF) {
375     my $c = $cc - 0xE000;
376 wakaba 1.8 my $final = $C->{option}->{private_set}->{U96n}->[2];
377 wakaba 1.1 if (length $final) {
378     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
379     type => 'G96n', charset => $final);
380     }
381    
382     } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
383     my $c = $cc - 0xE9F6C0;
384     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
385     type => 'G94n', charset => chr(($c / 8836)+0x30));
386     } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
387     my $c = $cc - 0xF49D7C;
388     $t = _i2g (chr(($c / 94)+0x21).chr(($c % 94)+0x21), $C,
389     type => 'G94n', charset => 'B', revision => '@');
390    
391     } elsif (0xF0000 <= $cc && $cc <= 0x10F7FF) {
392     my $c = $cc - 0xF0000;
393     $t = _i2g (chr((($c % 9216) / 96)+0x20).chr(($c % 96)+0x20), $C,
394     type => 'G96n', charset => "\x20".chr(($c / 9216)+0x40));
395     } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
396     my $c = $cc - 0xE90940;
397     $t = _i2g (chr(($c % 94)+0x21), $C,
398     type => 'G94', charset => chr(($c / 94)+0x30));
399     } elsif (0xE92642 <= $cc && $cc <= 0xE9269F) {
400     my $c = $cc - 0xE92642;
401     $t = _i2g (chr($c+0x21),$C,type => 'G94', charset => "\x64", revision => '@');
402     } elsif (0xE926A0 <= $cc && $cc <= 0xE9443F) {
403     my $c = $cc - 0xE926A0;
404     $t = _i2g (chr(($c % 96)+0x20), $C,
405     type => 'G96', charset => chr(($c / 96)+0x30));
406     } elsif (0xE944A0 <= $cc && $cc <= 0xE961A1) {
407     my $c = $cc - 0xE944A0;
408     $t = _i2g (chr(($c % 94)+0x21), $C,
409     type => 'G94', charset => '!'.chr(($c / 94)+0x30));
410     } elsif (0xE96200 <= $cc && $cc <= 0xE97F9F) {
411     my $c = $cc - 0xE96200;
412     $t = _i2g (chr(($c % 96)+0x20), $C,
413     type => 'G96', charset => '!'.chr(($c / 96)+0x30));
414     } elsif (0xE98000 <= $cc && $cc <= 0xE99D01) {
415     my $c = $cc - 0xE98000;
416     $t = _i2g (chr(($c % 94)+0x21), $C,
417     type => 'G94', charset => '"'.chr(($c / 94)+0x30));
418     } elsif (0xE99D60 <= $cc && $cc <= 0xE9BAFF) {
419     my $c = $cc - 0xE99D60;
420     $t = _i2g (chr(($c % 96)+0x20), $C,
421     type => 'G96', charset => '"'.chr(($c / 96)+0x30));
422     } elsif (0xE9BB60 <= $cc && $cc <= 0xE9D861) {
423     my $c = $cc - 0xE9BB60;
424     $t = _i2g (chr(($c % 94)+0x21), $C,
425     type => 'G94', charset => '#'.chr(($c / 94)+0x30));
426     } elsif (0xE9D8C0 <= $cc && $cc <= 0xE9F65F) {
427     my $c = $cc - 0xE9D8C0;
428     $t = _i2g (chr(($c % 96)+0x20), $C,
429     type => 'G96', charset => '#'.chr(($c / 96)+0x30));
430     } elsif (0x70090940 <= $cc && $cc <= 0x70092641) {
431     my $c = $cc - 0x70090940;
432     $t = _i2g (chr(($c % 94)+0x21), $C,
433     type => 'G94', charset => "\x20".chr(($c / 94)+0x30));
434     } elsif (0x700926A0 <= $cc && $cc <= 0x7009443F) {
435     my $c = $cc - 0x700926A0;
436     $t = _i2g (chr(($c % 96)+0x20), $C,
437     type => 'G96', charset => "\x20".chr(($c / 96)+0x30));
438     ## TODO: DRCS with I byte: U+700944A0-U+7009F6BF
439     } elsif (0x70400000 <= $cc && $cc <= 0x7040FFED) {
440     my $c = $cc - 0x70400000;
441     $t = _i2g (chr(($c % 94)+0x21), $C, charset_id => 'P'.int ($c / 94),
442 wakaba 1.8 type => 'G94', charset => $C->{option}->{private_set}->{G94}->[ $c / 94 ]);
443 wakaba 1.1 } elsif (0x70410000 <= $cc && $cc <= 0x7041FFBF) {
444     my $c = $cc - 0x70410000;
445     $t = _i2g (chr(($c % 96)+0x20), $C, charset_id => 'P'.int ($c / 96),
446 wakaba 1.8 type => 'G96', charset => $C->{option}->{private_set}->{G96}->[ $c / 96 ]);
447 wakaba 1.1 } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
448     my $c = $cc % 0x10000;
449     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
450     type => 'G94n',
451 wakaba 1.2 charset_id => 'P'.int(($cc / 0x10000) - 0x7042).'_'.int($c / 8836),
452 wakaba 1.8 charset => $C->{option}->{private_set}->{G94n}
453     ->[ ($cc / 0x10000) - 0x7042 ]->[ $c / 8836 ]);
454 wakaba 1.1 }
455     if (defined $t) {
456 wakaba 1.12 ## Back to ISO/IEC 2022 if necessary
457 wakaba 1.5 $t = _i2o ($t, $C, cs_F => "\x40")
458     if $C->{coding_system} ne $CODING_SYSTEM{"\x40"};
459     } else {
460 wakaba 1.12 ## Output in UCS-n or UTF-n if character can't be represented in ISO/IEC 2022
461 wakaba 1.5 my $F; my @F = qw~G /G /H /I B /A /D /F~;
462     push @F, qw~/J /K /L~ if $cc <= 0x10FFFF;
463     push @F, qw~/@ /C /E~ if $cc <= 0xFFFF;
464     for (@F) {
465     if (defined $C->{option}->{designate_to}->{coding_system}->{$_}
466     && $C->{option}->{designate_to}->{coding_system}->{$_} > -1) {
467     $F = $_; last;
468     } elsif ($C->{option}->{designate_to}->{coding_system}->{default} > -1) {
469     $F = $_; last;
470     }
471     }
472     $t = _i2o ($c, $C, cs_F => $F) if $F;
473     }
474 wakaba 1.13 if (defined $t) { ## Output the character itself
475 wakaba 1.1 $r .= $t;
476 wakaba 1.12 } elsif ($C->{option}->{fallback_from_ucs} =~ /quiet/) {
477     $r .= _back2ascii ($C) if $C->{option}->{fallback_from_ucs} =~ /back/;
478     return ($r, halfway => 1, converted_length => $i,
479     warn => $C->{option}->{fallback_from_ucs} =~ /warn/ ? 1 : 0,
480     reason => sprintf (q(U+%04X: There is no character mapped to), $cc));
481     } elsif ($C->{option}->{fallback_from_ucs} eq 'croak') {
482     return ($r, halfway => 1, die => 1,
483     reason => sprintf (q(U+%04X: There is no character mapped to), $cc));
484 wakaba 1.1 } else {
485 wakaba 1.12 ## Try to output with fallback escape sequence (if specified)
486 wakaba 1.14 my $t = Encode::Charset->fallback_escape ($C, $c);
487 wakaba 1.12 if (defined $t) {
488     my %D = (fallback => $C->{option}->{fallback_from_ucs}, reset => $C->{option}->{reset});
489     $C->{option}->{fallback_from_ucs} = 'croak';
490     $C->{option}->{reset} = {Gdesignation => 0, Ginvoke => 0};
491     eval q{$t = $C->{_encoder}->_encode_internal ($t, $C)} or undef $t;
492     $C->{option}->{fallback_from_ucs} = $D{fallback};
493     $C->{option}->{reset} = $D{reset};
494     }
495     if (defined $t) {
496     $r .= $t;
497     } else { ## Replacement character specified in charset definition
498     unless ($C->{option}->{undef_char}->[0] eq "\x20") { ## A graphic character
499     $t = _i2g ($C->{option}->{undef_char}->[0], $C,
500     %{ $C->{option}->{undef_char}->[1] });
501     } else { ## SPACE
502     $t = _back2ascii ($C) . "\x20";
503     }
504     $r .= $C->{coding_system} eq $CODING_SYSTEM{"\x40"} ?
505     $t : _i2o ($t, $C, cs_F => "\x40");
506 wakaba 1.7 }
507 wakaba 1.1 }
508     }
509 wakaba 1.12 ($r . _back2ascii ($C)); ## Back to ASCII at the end of document if specified
510 wakaba 1.1 }
511    
512     ## $O{charset} eq undef means that charset is same as the current designated one.
513     sub _i2c ($%%) {
514     my ($s, $C, %O) = @_;
515     my $r = '';
516     if ($O{type} eq 'C0') {
517     if (defined $O{charset}) {
518     if ( $C->{C0} ne $CHARSET{C0}->{$O{charset}}
519     && $C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
520     for ($C->{option}->{designate_to}->{C0}->{$O{charset}},
521     $C->{option}->{designate_to}->{C0}->{default}) {
522     if (defined $_) { return undef if $_ == -1; last }
523     }
524     $r .= "\x1B\x21".$O{charset};
525     $C->{C0} = $CHARSET{C0}->{$O{charset}};
526     }
527     } elsif (defined $O{charset_id}) {
528     if ($C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
529     return undef; ## Control set is not designated nor has F byte
530     }
531     }
532     $r .= _back2ascii ($C, reset_all => $C->{C0}->{reset_all}->{$s});
533     return $r . $s;
534     } elsif ($O{type} eq 'C1') {
535     if (defined $O{charset}) {
536     if ( $C->{C1} ne $CHARSET{C1}->{$O{charset}}
537     && $C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
538     for ($C->{option}->{designate_to}->{C1}->{$O{charset}},
539     $C->{option}->{designate_to}->{C1}->{default}) {
540     if (defined $_) { return undef if $_ == -1; last }
541     }
542     $r .= "\x1B\x22".$O{charset};
543     $C->{C1} = $CHARSET{C1}->{$O{charset}};
544     }
545     } elsif (defined $O{charset_id}) {
546     if ($C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
547     return undef; ## Control set is not designated nor has F byte
548     }
549     }
550     $r .= _back2ascii ($C, reset_all => $C->{C1}->{reset_all}->{$s});
551     unless ($C->{option}->{C1invoke_to_right}) { ## ESC Fe
552 wakaba 1.10 $s =~ s/([\x80-\x9F])/"\x1B" . pack ('C', ord ($1) - 0x40)/ge;
553 wakaba 1.1 }
554     return $r . $s;
555     }
556     }
557     sub _i2g ($%%) {
558     my ($s, $C, %O) = @_;
559     my $r = '';
560 wakaba 1.4 my $set = $CHARSET{$O{type}}->{$O{charset}.
561     ($O{revision}&&$C->{option}->{use_revision}?$O{revision}:'')};
562 wakaba 1.1 my $set0 = $CHARSET{$O{type}}->{$O{charset_id}};
563     ## -- designate character set
564     my $G = 0;
565     if ($C->{G0} eq $set || $C->{G0} eq $set0) { $G = 0 }
566     elsif ($C->{G1} eq $set || $C->{G1} eq $set0) { $G = 1 }
567     elsif ($C->{G2} eq $set || $C->{G2} eq $set0) { $G = 2 }
568     elsif ($C->{G3} eq $set || $C->{G3} eq $set0) { $G = 3 }
569     else {
570     return undef unless $set; ## charset does not have F byte
571     $G = 1 if $O{type} eq 'G96' || $O{type} eq 'G96n';
572     for ($C->{option}->{designate_to}->{$O{type}}->{$O{charset}},
573     $C->{option}->{designate_to}->{$O{type}}->{default}) {
574     if (defined $_) {
575     $G = $_; last;
576     }
577     }
578     if ($G == -1) {
579     return undef;
580     }
581     if ($O{type} eq 'G94') {
582     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
583     ."\x1B".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
584     } elsif ($O{type} eq 'G94n') {
585     if ($G == 0 && !$C->{option}->{G94n_designate_long}
586     && ($O{charset} eq '@' || $O{charset} eq 'A' || $O{charset} eq 'B')) {
587     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
588     ."\x1B\x24".$O{charset};
589     } else {
590     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
591     ."\x1B\x24".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
592     }
593     } elsif ($O{type} eq 'G96') {
594     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
595     ."\x1B".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
596     } elsif ($O{type} eq 'G96n') {
597     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
598     ."\x1B\x24".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
599     }
600     $C->{'G'.$G} = $CHARSET{$O{type}}->{$O{charset}};
601     }
602     ## -- invoke G buffer
603     my $left = $C->{option}->{Ginvoke_to_left}->[$G];
604     if ($C->{GL} eq 'G'.$G) {
605     $left = 1;
606     } elsif ($C->{GR} eq 'G'.$G) {
607     $left = 0;
608     } else {
609     if ($C->{option}->{Ginvoke_by_single_shift}->[$G]) {
610     if ($C->{C1}->{'C_SS'.$G}) {
611     $r .= _i2c ($C->{C1}->{'C_SS'.$G}, $C, type => 'C1') || return undef;
612     } elsif ($C->{C0}->{'C_SS'.$G}) {
613     $r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef;
614     } else { ## Both C0 and C1 set do not have SS2/3.
615 wakaba 1.4 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
616 wakaba 1.1 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
617     }
618     } else {
619     $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
620     $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
621     }
622     }
623     $s =~ tr/\x00-\x7F/\x80-\xFF/ unless $left;
624     $r . $s;
625     }
626     sub _back2ascii (%) {
627     my ($C, %O) = @_;
628     my $r = '';
629     if ($C->{option}->{reset}->{Gdesignation}) {
630     my $F = $C->{option}->{reset}->{Gdesignation}; # \x42
631     $r .= "\x1B\x28".$F unless $C->{G0} eq $CHARSET{G94}->{$F};
632     $C->{G0} = $CHARSET{G94}->{$F};
633     if ($O{reset_all}) {
634     $C->{G1} = $CHARSET{G94}->{"\x7E"};
635     $C->{G2} = $CHARSET{G94}->{"\x7E"};
636     $C->{G3} = $CHARSET{G94}->{"\x7E"};
637     }
638     }
639     if ($C->{option}->{reset}->{Ginvoke}) {
640     if ($C->{GL} ne 'G0') {
641     $r .= $C->{C0}->{C_LS0} || ($C->{C0} = $CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F");
642     $C->{GL} = 'G0';
643     }
644     $C->{GR} = undef if $O{reset_all};
645     }
646     $r;
647     }
648     ## __invoke (\%C, $G, $left_or_right)
649     sub __invoke (\%$$) {
650     my ($C, $G) = @_;
651     if ($_[2]) {
652     $C->{GL} = 'G'.$G;
653     return ($C->{C0}->{C_LS0}
654     || scalar ($C->{C0}=$CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F"),
655     $C->{C0}->{C_LS1}, "\x1B\x6E", "\x1B\x6F")[$G];
656     } else {
657     $C->{GR} = 'G'.$G;
658     return ("", "\x1B\x7E", "\x1B\x7D", "\x1B\x7C")[$G];
659     }
660     '';
661     }
662 wakaba 1.5 sub _i2o ($\%%) {
663     my ($s, $C, %O) = @_;
664     my $CS = $CODING_SYSTEM{ $O{cs_F} } || $CODING_SYSTEM{ $O{cs_id} } || return undef;
665     my $r = '';
666     if ($CS ne $C->{coding_system}) {
667     my $e = '';
668     $e .= "\x1B\x25";
669 wakaba 1.8 $e .= $O{cs_F} || $C->{option}->{private_set}->{coding_system}->{ $O{cs_id} }
670 wakaba 1.5 || return undef;
671     if ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x40"}
672     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x43"}
673     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x45"}
674     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4A"}
675     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4B"}
676     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4C"}) {
677     $e =~ s/(.)/\x00$1/go;
678     } elsif ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x41"}
679     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x44"}
680     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x46"}) {
681     $e =~ s/(.)/\x00\x00\x00$1/go;
682     }
683     $r .= $e;
684     $C->{coding_system} = $CS;
685     if ($CS->{reset_state}) {
686     $C->{GL} = undef; $C->{GR} = undef;
687     $C->{C0} = $CHARSET{C0}->{"\x7E"};
688     $C->{C1} = $CHARSET{C1}->{"\x7E"};
689     $C->{G0} = $CHARSET{G94}->{"\x7E"};
690     $C->{G1} = $CHARSET{G94}->{"\x7E"};
691     $C->{G2} = $CHARSET{G94}->{"\x7E"};
692     $C->{G3} = $CHARSET{G94}->{"\x7E"};
693     }
694     }
695     if ($CS eq $CODING_SYSTEM{"\x40"}) {
696     #
697     } elsif ($CS eq $CODING_SYSTEM{G} || $CS eq $CODING_SYSTEM{'/G'}
698     || $CS eq $CODING_SYSTEM{'/H'} || $CS eq $CODING_SYSTEM{'/I'}) {
699     Encode::_utf8_off ($s);
700     } elsif ($CS eq $CODING_SYSTEM{'/@'} || $CS eq $CODING_SYSTEM{'/C'}
701     || $CS eq $CODING_SYSTEM{'/E'}) {
702     $s = Encode::encode ('ucs-2be', $s);
703     } elsif ($CS eq $CODING_SYSTEM{'/A'} || $CS eq $CODING_SYSTEM{'/D'}
704     || $CS eq $CODING_SYSTEM{'/F'}) {
705     $s = Encode::encode ('ucs-4be', $s);
706     } elsif ($CS eq $CODING_SYSTEM{'/J'} || $CS eq $CODING_SYSTEM{'/K'}
707     || $CS eq $CODING_SYSTEM{'/L'}) {
708     $s = Encode::encode ('UTF-16BE', $s);
709     } elsif ($CS eq $CODING_SYSTEM{B}) {
710     $s = Encode::encode ('utf-1', $s);
711     } else {
712     return undef;
713     }
714     $r . $s;
715 wakaba 1.1 }
716    
717     =head1 SEE ALSO
718    
719     ISO/IEC 646:1991, "7-bit coded graphic character set for intormation interchange".
720    
721     ISO/IEC 2022:1994, "Character Code Structure and Extension Techniques".
722     (IDT with ECMA 35, JIS X 0202:1998)
723    
724     ISO/IEC 4873:1991, "8-Bit Coded Character Set Structure and Rules".
725     (IDT with ECMA 43)
726    
727     ISO/IEC 6429:1992, "Control Functions for Coded Character Sets".
728     (IDT with ECMA 48:1991, JIS X 0211:1998)
729    
730     ISO/IEC 8859, "8-Bit Single-Byte Coded Graphic Character Sets".
731    
732 wakaba 1.12 L<Encode>, perlunicode
733 wakaba 1.1
734 wakaba 1.4 =head1 TODO
735    
736     =over 4
737    
738     =item NCR (coding system other than ISO/IEC 2022) support
739    
740     =over 2
741    
742     =item ESC 02/05 02/15 03/x of X Compound Text
743    
744     =back
745    
746     =item Output of control character sets, single control functions
747    
748     =item Designation sequence of control character sets (input)
749    
750     =item Special graphic character sets such as G3 of EUC-TW
751    
752     =item SUPER SHIFT (SS) invoke function of old control character set
753    
754     =item Safe transparent of control string (ISO/IEC 6429)
755    
756     =item Output of unoutputable characters as alternative notation such as SGML-like entity
757    
758     =item C0 set invoked to CR area like ISIRI code
759    
760     Really need?
761    
762     =item special treatment of 0x20, 0x7E, 0xA0, 0xFF
763    
764     For example, GB mongolian sets use MSP (MONGOLIAN SPACE)
765     with these code positions.
766    
767     And, no less coding systems does not use (or does ban using) DEL.
768    
769     =item A lot of character sets don't have pseudo-UCS mapping.
770    
771     Most of 9m^n (n >= 3) sets, 9m^n sets with I byte, 9m^n
772     DRCSes do not have pseudo-UCS mapping area. It is
773     questionable to allocate lots of code positions to these
774     rarely-(or no-)used character sets.
775    
776     =item Even character sets that have pseudo-UCS mapping, some of them can't be outputed in ISO/IEC 2022.
777    
778     Because output of rarely-used character sets is
779     not implemented yet.
780    
781     =back
782    
783 wakaba 1.5 =head1 AUTHORS
784    
785 wakaba 1.12 Nanashi-san <nanashi.san@nanashi.invalid>
786 wakaba 1.5
787     Wakaba <w@suika.fam.cx>
788    
789 wakaba 1.1 =head1 LICENSE
790    
791 wakaba 1.12 Copyright 2002 AUTHORS, all rights reserved.
792 wakaba 1.1
793     This library is free software; you can redistribute it
794     and/or modify it under the same terms as Perl itself.
795    
796     =cut
797    
798 wakaba 1.14 1; # $Date: 2002/12/16 10:25:01 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24