/[suikacvs]/perl/lib/Encode/ISO2022.pm
Suika

Contents of /perl/lib/Encode/ISO2022.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.15 - (hide annotations) (download)
Wed Dec 18 12:57:40 2002 UTC (21 years, 10 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.14: +11 -9 lines
Pictogram charsets support

1 wakaba 1.1
2     =head1 NAME
3    
4     Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder
5    
6 wakaba 1.6 =head1 ENCODINGS
7    
8     =over 4
9    
10     =item iso2022
11    
12     ISO/IEC 2022:1994. Default status is:
13    
14     =over 2
15    
16     =item CL = C0 = ISO/IEC 6429:1991 C0 set
17    
18     =item CR = C1 = ISO/IEC 6429:1991 C1 set
19    
20     =item GL = G0 = ISO/IEC 646:1991 IRV GL(G0) set
21    
22     =item GR = G1 = empty set
23    
24     =item G2 = empty set
25    
26     =item G3 = empty set
27    
28     =back
29    
30     (Alias: iso/iec2022, iso-2022, 2022, cp2022)
31    
32     =back
33    
34     Note that ISO/IEC 2022 based encodings are found in
35     Encode::ISO2022::* modules. This module, Encode::ISO2022
36     only provides a general ISO/IEC 2022 encoder/decoder.
37    
38 wakaba 1.1 =cut
39    
40     require v5.7.3;
41     package Encode::ISO2022;
42     use strict;
43 wakaba 1.5 use vars qw(%CHARSET %CODING_SYSTEM $VERSION);
44 wakaba 1.15 $VERSION=do{my @r=(q$Revision: 1.14 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
45 wakaba 1.1 use base qw(Encode::Encoding);
46 wakaba 1.6 __PACKAGE__->Define (qw!iso-2022 iso/iec2022 iso2022 2022 cp2022!);
47 wakaba 1.5 require Encode::Charset;
48     *CHARSET = \%Encode::Charset::CHARSET;
49     *CODING_SYSTEM = \%Encode::Charset::CODING_SYSTEM;
50 wakaba 1.1
51     ### --- Perl Encode module common functions
52    
53     sub encode ($$;$) {
54     my ($obj, $str, $chk) = @_;
55     $_[1] = '' if $chk;
56     $str = &internal_to_iso2022 ($str);
57     return $str;
58     }
59    
60     sub decode ($$;$) {
61     my ($obj, $str, $chk) = @_;
62     $_[1] = '' if $chk;
63     return &iso2022_to_internal ($str);
64     }
65    
66     ### --- Encode::ISO2022 unique functions
67 wakaba 1.6 *new_object = \&Encode::Charset::new_object;
68 wakaba 1.1
69 wakaba 1.10 sub iso2022_to_internal ($;%) {
70 wakaba 1.1 my ($s, $C) = @_;
71 wakaba 1.6 $C ||= &new_object;
72 wakaba 1.5 my $t = '';
73 wakaba 1.9 $s =~ s{^((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)}{
74 wakaba 1.5 my $i2 = $1;
75     $t = _iso2022_to_internal ($i2, $C);
76     '';
77 wakaba 1.9 }es;
78 wakaba 1.6 my $pad = '';
79     use re 'eval';
80 wakaba 1.5 $s =~ s{
81     ## ISO/IEC 2022
82 wakaba 1.6 (??{"$pad\x1B$pad\x25$pad\x40"})((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
83 wakaba 1.5 ## UTF-8
84 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad(?:\x47|\x2F$pad"."[\x47-\x49])"})
85     ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
86 wakaba 1.5 ## UCS-2, UTF-16
87 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x40\x43\x45\x4A-\x4C])
88     ((?:(?!\x00\x1B\x00\x25(?:\x00\x2F)?\x00[\x30-\x7E])..)*)
89 wakaba 1.5 ## UCS-4
90 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})[\x41\x44\x46]
91     ((?:(?!\x00\x00\x00\x1B\x00\x00\x00\x25(?:\x00\x00\x00\x2F)?
92     \x00\x00\x00[\x30-\x7E])....)*)
93 wakaba 1.5 ## with standard return
94 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad"})([\x30-\x7E])
95     ((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*)
96 wakaba 1.5 ## without standard return
97 wakaba 1.6 |(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x30-\x7E])(.*)
98 wakaba 1.5 }{
99     my ($i2,$u8,$Fu2,$u2,$u4,$Fsr,$sr,$Fnsr,$nsr) = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
100     my $r = '';
101     if (defined $i2) {
102 wakaba 1.6 $r = _iso2022_to_internal ($i2, $C); $pad = '';
103 wakaba 1.5 } elsif (defined $u8) {
104 wakaba 1.6 $r = Encode::decode ('utf8', $u8); $pad = '';
105 wakaba 1.5 } elsif ($Fu2) {
106     if (ord ($Fu2) > 0x49) {
107     $r = Encode::decode ('utf-16be', $u2);
108     } else {
109     $r = Encode::decode ('ucs-2be', $u2);
110     }
111 wakaba 1.6 $pad = "\x00";
112 wakaba 1.5 } elsif (defined $u4) {
113 wakaba 1.6 $r = Encode::decode ('ucs-4be', $u2); $pad = "\x00\x00\x00";
114     } elsif (defined $Fsr && $CODING_SYSTEM{$Fsr}->{perl_name}) {
115     $r = Encode::decode ($CODING_SYSTEM{$Fsr}->{perl_name}, $sr); $pad = '';
116     } elsif (defined $Fnsr && $CODING_SYSTEM{$Fnsr}->{perl_name}) {
117     $r = Encode::decode ($CODING_SYSTEM{$Fnsr}->{perl_name}, $nsr); $pad = '';
118 wakaba 1.5 } else { ## temporary
119 wakaba 1.6 $r = '?' x length ($sr.$nsr); $pad = '';
120 wakaba 1.5 }
121     $r;
122     }gesx;
123     $t . $s;
124     }
125    
126 wakaba 1.9 # this is very very trickey. my perl 5.8.0 does not process
127     # regex with eval except the first time (i think it's a bug
128     # of perl), so we redefine this function whenever being called!
129     # when this unexpected behavior is fixed or someone finds
130     # better way to avoid it, we will rewrite this code.
131     &_iso2022_to_internal (undef);
132 wakaba 1.10 sub _iso2022_to_internal ($;%) {
133     eval q{ sub __iso2022_to_internal ($;%) { 0 } };
134 wakaba 1.9 eval q{
135 wakaba 1.10 sub __iso2022_to_internal ($;%) {
136 wakaba 1.9 use re 'eval';
137 wakaba 1.5 my ($s, $C) = @_;
138 wakaba 1.1 my %_GB_to_GN = (
139     "\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3',
140     "\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3',
141     );
142 wakaba 1.9 my %_CHARS_to_RANGE = (
143     l94 => q/[\x21-\x7E]/, l96 => q/[\x20-\x7F]/,
144     l128 => q/[\x00-\x7F]/, l256 => q/[\x00-\xFF]/,
145     r94 => q/[\xA1-\xFE]/, r96 => q/[\xA0-\xFF]/,
146     r128 => q/[\x80-\xFF]/, r256 => q/[\x80-\xFF]/,
147     b94 => q/[\x21-\x7E\xA1-\xFE]/, b96 => q/[\x20-\x7F\xA0-\xFF]/,
148     b128 => q/[\x00-\xFF]/, b256 => q/[\x00-\xFF]/,
149     );
150 wakaba 1.1
151     $s =~ s{
152     ((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}}
153     . qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ }))
154 wakaba 1.6 |((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GR}}->{chars}}
155 wakaba 1.9 . qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ }))
156 wakaba 1.1 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)')
157     . ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ?
158     qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '')
159     . ($C->{$C->{CL}}->{r_SS2} ? qq/|$C->{$C->{CL}}->{r_SS2}/ : '') . q/)/
160     . ( $C->{$C->{CL}}->{r_LS0}
161     ||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
162     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
163     })
164 wakaba 1.6 ((??{ $_CHARS_to_RANGE{'b'.$C->{G2}->{chars}}
165 wakaba 1.9 . qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ }))
166 wakaba 1.1 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)')
167     . ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ?
168     qq/|$C->{$C->{ESC_Fe}}->{r_SS3_ESC}/ : '')
169     . ($C->{$C->{CL}}->{r_SS3} ? qq/|$C->{$C->{CL}}->{r_SS3}/ : '') . q/)/
170     . ( $C->{$C->{CL}}->{r_LS0}
171     || $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
172     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
173     })
174 wakaba 1.6 ((??{ $_CHARS_to_RANGE{'b'.$C->{G3}->{chars}}
175 wakaba 1.1 . qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ }))
176    
177 wakaba 1.3 ## Locking shift
178 wakaba 1.6 |( (??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })
179 wakaba 1.3 |(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })
180     )
181 wakaba 1.1
182     ## Control sequence
183     |(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)')
184     .($C->{$C->{ESC_Fe}}->{r_CSI_ESC} ?
185     qq/|$C->{$C->{ESC_Fe}}->{r_CSI_ESC}/: '')
186     .')'
187     })
188     ((??{ qq/[\x30-\x3F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xB0-\xBF]*/
189     .qq/[\x20-\x2F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xA0-\xAF]*/
190     }) [\x40-\x7E\xD0-\xFE])
191    
192     ## Other escape sequence
193     |(\x1B[\x20-\x2F]*[\x30-\x7E])
194    
195     ## Misc. sequence (SP, control, or broken data)
196     |([\x00-\xFF])
197     }{
198 wakaba 1.3 my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc)
199     = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
200 wakaba 1.1 $C->{_irr} = undef unless defined $esc;
201     ## GL graphic character
202     if (defined $gl) {
203     my $c = 0;
204     my $m = $C->{$C->{GL}}->{chars}==94?0x21:$C->{$C->{GL}}->{chars}==96?0x20:0;
205     for (split //, $gl) {
206     $c = $c * $C->{$C->{GL}}->{chars} + unpack ('C', $_) - $m;
207     }
208     chr ($C->{$C->{GL}}->{ucs} + $c);
209     ## GR graphic character
210     } elsif ($gr) {
211     my $c = 0;
212     my $m = $C->{$C->{GR}}->{chars}==94?0xA1:$C->{$C->{GR}}->{chars}==96?0xA0:0x80;
213     for (split //, $gr) {
214     $c = $c * $C->{$C->{GR}}->{chars} + unpack ('C', $_) - $m;
215     }
216     chr ($C->{$C->{GR}}->{ucs} + $c);
217 wakaba 1.10 ## Control, SP, or broken data
218     ## TODO: support control sets other than ISO/IEC 6429's
219     } elsif (defined $misc) {
220     $misc;
221 wakaba 1.1 ## Graphic character with SS2
222     } elsif ($ss2) {
223     $ss2 =~ tr/\x80-\xFF/\x00-\x7F/;
224     my $c = 0; my $m = $C->{G2}->{chars}==94?0x21:$C->{G2}->{chars}==96?0x20:0;
225     for (split //, $ss2) {
226     $c = $c * $C->{G2}->{chars} + unpack ('C', $_) - $m;
227     }
228     chr ($C->{G2}->{ucs} + $c);
229     ## Graphic character with SS3
230     } elsif ($ss3) {
231     $ss3 =~ tr/\x80-\xFF/\x00-\x7F/;
232     my $c = 0; my $m = $C->{G3}->{chars}==94?0x21:$C->{G3}->{chars}==96?0x20:0;
233     for (split //, $ss3) {
234     $c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m;
235     }
236     chr ($C->{G3}->{ucs} + $c);
237     ## Escape sequence
238     } elsif ($esc) {
239 wakaba 1.10 if ($esc =~ /\x1B\x26([\x40-\x7E])/) { ## 6F (IRR) = ESC 02/06 Ft
240 wakaba 1.1 $C->{_irr} = $1; $esc = '';
241     } else {
242     $esc =~ s{
243     \x1B([\x28-\x2B])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^1
244     |\x1B\x24([\x28-\x2B]?)(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^n
245    
246     |\x1B([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^1
247     |\x1B\x24([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^n
248    
249     |\x1B([\x40-\x5F]) ## ESC Fe
250    
251     |\x1B\x21([\x21-\x23]?[\x30-\x7E]) ## CL = C0
252     |\x1B\x22([\x21-\x23]?[\x30-\x7E]) ## CR & ESC Fe = C1
253    
254     |\x1B([\x60-\x7E]) ## Single control functions
255     |\x1B\x23([\x21-\x23]?)([\x30-\x7E])
256    
257     |\x1B\x20([\x40-\x7E]) ## Announcer
258     }{
259     my ($g94_g,$g94_f,$g94n_g,$g94n_f,$g96_g,$g96_f,$g96n_g,$g96n_f,$Fe,
260     $CZD, $C1D, $Fs, $sI, $sF,$ACS)
261     = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15);
262     my $rev = $C->{_irr} || '';
263 wakaba 1.10 my $f2s = $C->{option}->{final_to_set};
264     if ($g94_g) { ## ESC 02/08 [I] F
265     $C->{ $_GB_to_GN{ $g94_g } }
266     = $CHARSET{G94}->{ $f2s->{G94}->{$g94_f.$rev} || $g94_f.$rev }
267     || $CHARSET{G94}->{ $f2s->{G94}->{$g94_f} || $g94_f }
268     || $CHARSET{G94}->{ "\x7E" }; '';
269     } elsif (defined $g94n_f) { ## ESC 02/04 [02/08..11] [I] F
270     $C->{ $_GB_to_GN{ $g94n_g } || 'G0' }
271     = $CHARSET{G94n}->{ $f2s->{G94n}->{$g94n_f.$rev} || $g94n_f.$rev }
272     || $CHARSET{G94n}->{ $f2s->{G94n}->{$g94n_f} || $g94n_f }
273     || $CHARSET{G94n}->{ "\x7E" }; '';
274     } elsif ($g96_g) { ## ESC 02/12..15 [I] F
275     $C->{ $_GB_to_GN{ $g96_g } }
276     = $CHARSET{G96}->{ $f2s->{G96}->{$g96_f.$rev} || $g96_f.$rev }
277     || $CHARSET{G96}->{ $f2s->{G96}->{$g96_f} || $g96_f }
278     || $CHARSET{G96}->{ "\x7E" }; '';
279     } elsif ($g96n_g) { ## ESC 02/04 02/12..15 [I] F
280     $C->{ $_GB_to_GN{ $g96n_g } }
281     = $CHARSET{G96n}->{ $f2s->{G96n}->{$g96n_f.$rev} || $g96n_f.$rev }
282     || $CHARSET{G96n}->{ $f2s->{G96n}->{$g96n_f} || $g96n_f }
283     || $CHARSET{G96n}->{ "\x7E" }; '';
284     } elsif ($Fe) { ## ESC Fe = C1
285 wakaba 1.1 chr ($C->{ $C->{ESC_Fe} }->{ucs} + (ord ($Fe) - 0x40));
286 wakaba 1.10 } elsif (defined $Fs) { ## ESC Fs
287 wakaba 1.1 if ($Fs eq "\x6E") { ## LS2
288     $C->{GL} = 'G2'; '';
289     } elsif ($Fs eq "\x6F") { ## LS3
290     $C->{GL} = 'G3'; '';
291 wakaba 1.7 } elsif ($Fs eq "\x7E" || $Fs eq "\x6B") { ## LS1R
292 wakaba 1.1 $C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; '';
293 wakaba 1.7 } elsif ($Fs eq "\x7D" || $Fs eq "\x6C") { ## LS2R
294 wakaba 1.1 $C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; '';
295 wakaba 1.7 } elsif ($Fs eq "\x7C" || $Fs eq "\x6D") { ## LS3R
296 wakaba 1.1 $C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; '';
297     } else {
298     chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60));
299     }
300 wakaba 1.10 } elsif (defined $CZD) { ## 1F (CZD) = ESC 02/01 [I] F
301     $C->{C0} = $CHARSET{C0}->{ $f2s->{C0}->{$CZD.$rev} || $CZD.$rev }
302     || $CHARSET{C0}->{ $f2s->{C0}->{$CZD} || $CZD }
303     || $CHARSET{C0}->{ "\x7E" }; '';
304     } elsif (defined $C1D) { ## 2F (C1D) = ESC 02/02 [I] F
305     $C->{C1} = $CHARSET{C1}->{ $f2s->{C1}->{$C1D.$rev} || $C1D.$rev }
306     || $CHARSET{C1}->{ $f2s->{C1}->{$C1D} || $C1D }
307     || $CHARSET{C1}->{ "\x7E" }; '';
308     } elsif ($sI) { ## 3F = ESC 02/03 [I] F
309 wakaba 1.1 chr ($CHARSET{single_control}->{'3F'.$sI}->{ucs} + (ord ($sF) - 0x30));
310 wakaba 1.10 } elsif ($ACS) { ## 0F (Announcer) = ESC 02/00 F
311 wakaba 1.1 if ($ACS eq "\x4A") { $C->{bit} = 7 }
312     elsif ($ACS eq "\x4B") { $C->{bit} = 8 }
313     '';
314     }
315     }gex;
316     $C->{_irr} = undef;
317     }
318     $esc;
319 wakaba 1.10 } elsif ($ls) { ## Locking shifts = LS0 / LS1
320 wakaba 1.3 if ($ls eq $C->{$C->{CL}}->{LS0}) {
321     $C->{GL} = 'G0'; '';
322     } elsif ($ls eq $C->{$C->{CL}}->{LS1}) {
323     $C->{GL} = 'G1'; '';
324     }
325 wakaba 1.10 } elsif ($csi) { ## Control sequence = CSI [P..] [I] F
326 wakaba 1.1 $csi =~ tr/\xA0-\xFF/\x20-\x7F/d;
327     $csi =~ s/$C->{$C->{CL}}->{LS0}//g if $C->{$C->{CL}}->{LS0};
328     $csi =~ s/$C->{$C->{CL}}->{LS1}//g if $C->{$C->{CL}}->{LS1};
329     "\x9B".$csi;
330     }
331 wakaba 1.9 }gesx;
332 wakaba 1.1 $s;
333 wakaba 1.9 } # __iso2022_to_internal
334    
335     };
336     &__iso2022_to_internal (@_) if defined $_[0];
337    
338     } # _iso2022_to_internal
339 wakaba 1.1
340 wakaba 1.10 sub internal_to_iso2022 ($;%) {
341 wakaba 1.1 my ($s, $C) = @_;
342     $C ||= &new_object;
343    
344     my $r = '';
345 wakaba 1.12 my @c = split //, $s;
346     for my $i (0..$#c) {
347     my $c = $c[$i]; my $cc = ord $c; Encode::_utf8_off ($c);
348 wakaba 1.1 my $t;
349     if ($cc <= 0x1F) {
350     $t = _i2c ($c, $C, type => 'C0', charset => '@');
351     } elsif ($cc == 0x20 || $cc == 0x7F) {
352     $t = _back2ascii ($C) . $c;
353     } elsif ($cc < 0x7F) {
354     $t = _i2g ($c, $C, type => 'G94', charset => 'B');
355     } elsif ($cc <= 0x9F) {
356 wakaba 1.10 $t = _i2c (pack ('C', $cc), $C, type => 'C1', charset_id => '64291991C1',
357 wakaba 1.8 charset => $C->{option}->{private_set}->{XC1}->{'64291991C1'});
358 wakaba 1.1 } elsif ($cc <= 0xFF) {
359 wakaba 1.10 $t = _i2g (pack ('C', $cc-0x80), $C, type => 'G96', charset => 'A');
360 wakaba 1.1 } elsif ($cc <= 0x24FF) {
361     my $c = $cc - 0x100;
362 wakaba 1.8 my $final = $C->{option}->{private_set}->{U96n}->[0];
363 wakaba 1.1 if (length $final) {
364     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
365     type => 'G96n', charset => $final);
366     }
367     } elsif ($cc <= 0x33FF) {
368     my $c = $cc - 0x2500;
369 wakaba 1.8 my $final = $C->{option}->{private_set}->{U96n}->[1];
370 wakaba 1.1 if (length $final) {
371     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
372     type => 'G96n', charset => $final);
373     }
374     } elsif (0xE000 <= $cc && $cc <= 0xFFFF) {
375     my $c = $cc - 0xE000;
376 wakaba 1.8 my $final = $C->{option}->{private_set}->{U96n}->[2];
377 wakaba 1.1 if (length $final) {
378     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
379     type => 'G96n', charset => $final);
380     }
381    
382     } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
383     my $c = $cc - 0xE9F6C0;
384     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
385     type => 'G94n', charset => chr(($c / 8836)+0x30));
386     } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
387     my $c = $cc - 0xF49D7C;
388     $t = _i2g (chr(($c / 94)+0x21).chr(($c % 94)+0x21), $C,
389     type => 'G94n', charset => 'B', revision => '@');
390    
391     } elsif (0xF0000 <= $cc && $cc <= 0x10F7FF) {
392     my $c = $cc - 0xF0000;
393     $t = _i2g (chr((($c % 9216) / 96)+0x20).chr(($c % 96)+0x20), $C,
394     type => 'G96n', charset => "\x20".chr(($c / 9216)+0x40));
395     } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
396     my $c = $cc - 0xE90940;
397     $t = _i2g (chr(($c % 94)+0x21), $C,
398     type => 'G94', charset => chr(($c / 94)+0x30));
399     } elsif (0xE92642 <= $cc && $cc <= 0xE9269F) {
400     my $c = $cc - 0xE92642;
401     $t = _i2g (chr($c+0x21),$C,type => 'G94', charset => "\x64", revision => '@');
402     } elsif (0xE926A0 <= $cc && $cc <= 0xE9443F) {
403     my $c = $cc - 0xE926A0;
404     $t = _i2g (chr(($c % 96)+0x20), $C,
405     type => 'G96', charset => chr(($c / 96)+0x30));
406     } elsif (0xE944A0 <= $cc && $cc <= 0xE961A1) {
407     my $c = $cc - 0xE944A0;
408     $t = _i2g (chr(($c % 94)+0x21), $C,
409     type => 'G94', charset => '!'.chr(($c / 94)+0x30));
410     } elsif (0xE96200 <= $cc && $cc <= 0xE97F9F) {
411     my $c = $cc - 0xE96200;
412     $t = _i2g (chr(($c % 96)+0x20), $C,
413     type => 'G96', charset => '!'.chr(($c / 96)+0x30));
414     } elsif (0xE98000 <= $cc && $cc <= 0xE99D01) {
415     my $c = $cc - 0xE98000;
416     $t = _i2g (chr(($c % 94)+0x21), $C,
417     type => 'G94', charset => '"'.chr(($c / 94)+0x30));
418     } elsif (0xE99D60 <= $cc && $cc <= 0xE9BAFF) {
419     my $c = $cc - 0xE99D60;
420     $t = _i2g (chr(($c % 96)+0x20), $C,
421     type => 'G96', charset => '"'.chr(($c / 96)+0x30));
422     } elsif (0xE9BB60 <= $cc && $cc <= 0xE9D861) {
423     my $c = $cc - 0xE9BB60;
424     $t = _i2g (chr(($c % 94)+0x21), $C,
425     type => 'G94', charset => '#'.chr(($c / 94)+0x30));
426     } elsif (0xE9D8C0 <= $cc && $cc <= 0xE9F65F) {
427     my $c = $cc - 0xE9D8C0;
428     $t = _i2g (chr(($c % 96)+0x20), $C,
429     type => 'G96', charset => '#'.chr(($c / 96)+0x30));
430     } elsif (0x70090940 <= $cc && $cc <= 0x70092641) {
431     my $c = $cc - 0x70090940;
432     $t = _i2g (chr(($c % 94)+0x21), $C,
433     type => 'G94', charset => "\x20".chr(($c / 94)+0x30));
434     } elsif (0x700926A0 <= $cc && $cc <= 0x7009443F) {
435     my $c = $cc - 0x700926A0;
436     $t = _i2g (chr(($c % 96)+0x20), $C,
437     type => 'G96', charset => "\x20".chr(($c / 96)+0x30));
438     ## TODO: DRCS with I byte: U+700944A0-U+7009F6BF
439     } elsif (0x70400000 <= $cc && $cc <= 0x7040FFED) {
440     my $c = $cc - 0x70400000;
441     $t = _i2g (chr(($c % 94)+0x21), $C, charset_id => 'P'.int ($c / 94),
442 wakaba 1.8 type => 'G94', charset => $C->{option}->{private_set}->{G94}->[ $c / 94 ]);
443 wakaba 1.1 } elsif (0x70410000 <= $cc && $cc <= 0x7041FFBF) {
444     my $c = $cc - 0x70410000;
445     $t = _i2g (chr(($c % 96)+0x20), $C, charset_id => 'P'.int ($c / 96),
446 wakaba 1.8 type => 'G96', charset => $C->{option}->{private_set}->{G96}->[ $c / 96 ]);
447 wakaba 1.1 } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
448     my $c = $cc % 0x10000;
449     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
450     type => 'G94n',
451 wakaba 1.2 charset_id => 'P'.int(($cc / 0x10000) - 0x7042).'_'.int($c / 8836),
452 wakaba 1.8 charset => $C->{option}->{private_set}->{G94n}
453     ->[ ($cc / 0x10000) - 0x7042 ]->[ $c / 8836 ]);
454 wakaba 1.1 }
455     if (defined $t) {
456 wakaba 1.12 ## Back to ISO/IEC 2022 if necessary
457 wakaba 1.5 $t = _i2o ($t, $C, cs_F => "\x40")
458     if $C->{coding_system} ne $CODING_SYSTEM{"\x40"};
459     } else {
460 wakaba 1.12 ## Output in UCS-n or UTF-n if character can't be represented in ISO/IEC 2022
461 wakaba 1.5 my $F; my @F = qw~G /G /H /I B /A /D /F~;
462     push @F, qw~/J /K /L~ if $cc <= 0x10FFFF;
463     push @F, qw~/@ /C /E~ if $cc <= 0xFFFF;
464     for (@F) {
465     if (defined $C->{option}->{designate_to}->{coding_system}->{$_}
466     && $C->{option}->{designate_to}->{coding_system}->{$_} > -1) {
467     $F = $_; last;
468     } elsif ($C->{option}->{designate_to}->{coding_system}->{default} > -1) {
469     $F = $_; last;
470     }
471     }
472     $t = _i2o ($c, $C, cs_F => $F) if $F;
473     }
474 wakaba 1.13 if (defined $t) { ## Output the character itself
475 wakaba 1.1 $r .= $t;
476 wakaba 1.12 } elsif ($C->{option}->{fallback_from_ucs} =~ /quiet/) {
477     $r .= _back2ascii ($C) if $C->{option}->{fallback_from_ucs} =~ /back/;
478     return ($r, halfway => 1, converted_length => $i,
479     warn => $C->{option}->{fallback_from_ucs} =~ /warn/ ? 1 : 0,
480     reason => sprintf (q(U+%04X: There is no character mapped to), $cc));
481     } elsif ($C->{option}->{fallback_from_ucs} eq 'croak') {
482     return ($r, halfway => 1, die => 1,
483     reason => sprintf (q(U+%04X: There is no character mapped to), $cc));
484 wakaba 1.1 } else {
485 wakaba 1.12 ## Try to output with fallback escape sequence (if specified)
486 wakaba 1.14 my $t = Encode::Charset->fallback_escape ($C, $c);
487 wakaba 1.12 if (defined $t) {
488     my %D = (fallback => $C->{option}->{fallback_from_ucs}, reset => $C->{option}->{reset});
489     $C->{option}->{fallback_from_ucs} = 'croak';
490 wakaba 1.15 $C->{option}->{reset_at_end} = {Gdesignation => 0, Ginvoke => 0};
491 wakaba 1.12 eval q{$t = $C->{_encoder}->_encode_internal ($t, $C)} or undef $t;
492     $C->{option}->{fallback_from_ucs} = $D{fallback};
493 wakaba 1.15 $C->{option}->{reset_at_end} = $D{reset_at_end};
494 wakaba 1.12 }
495     if (defined $t) {
496     $r .= $t;
497     } else { ## Replacement character specified in charset definition
498     unless ($C->{option}->{undef_char}->[0] eq "\x20") { ## A graphic character
499     $t = _i2g ($C->{option}->{undef_char}->[0], $C,
500     %{ $C->{option}->{undef_char}->[1] });
501     } else { ## SPACE
502     $t = _back2ascii ($C) . "\x20";
503     }
504     $r .= $C->{coding_system} eq $CODING_SYSTEM{"\x40"} ?
505     $t : _i2o ($t, $C, cs_F => "\x40");
506 wakaba 1.7 }
507 wakaba 1.1 }
508     }
509 wakaba 1.15 ($r . _back2ascii ($C, at => 'reset_at_end'));
510 wakaba 1.1 }
511    
512     ## $O{charset} eq undef means that charset is same as the current designated one.
513     sub _i2c ($%%) {
514     my ($s, $C, %O) = @_;
515     my $r = '';
516     if ($O{type} eq 'C0') {
517     if (defined $O{charset}) {
518     if ( $C->{C0} ne $CHARSET{C0}->{$O{charset}}
519     && $C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
520     for ($C->{option}->{designate_to}->{C0}->{$O{charset}},
521     $C->{option}->{designate_to}->{C0}->{default}) {
522     if (defined $_) { return undef if $_ == -1; last }
523     }
524     $r .= "\x1B\x21".$O{charset};
525     $C->{C0} = $CHARSET{C0}->{$O{charset}};
526     }
527     } elsif (defined $O{charset_id}) {
528     if ($C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
529     return undef; ## Control set is not designated nor has F byte
530     }
531     }
532     $r .= _back2ascii ($C, reset_all => $C->{C0}->{reset_all}->{$s});
533     return $r . $s;
534     } elsif ($O{type} eq 'C1') {
535     if (defined $O{charset}) {
536     if ( $C->{C1} ne $CHARSET{C1}->{$O{charset}}
537     && $C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
538     for ($C->{option}->{designate_to}->{C1}->{$O{charset}},
539     $C->{option}->{designate_to}->{C1}->{default}) {
540     if (defined $_) { return undef if $_ == -1; last }
541     }
542     $r .= "\x1B\x22".$O{charset};
543     $C->{C1} = $CHARSET{C1}->{$O{charset}};
544     }
545     } elsif (defined $O{charset_id}) {
546     if ($C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
547     return undef; ## Control set is not designated nor has F byte
548     }
549     }
550     $r .= _back2ascii ($C, reset_all => $C->{C1}->{reset_all}->{$s});
551     unless ($C->{option}->{C1invoke_to_right}) { ## ESC Fe
552 wakaba 1.10 $s =~ s/([\x80-\x9F])/"\x1B" . pack ('C', ord ($1) - 0x40)/ge;
553 wakaba 1.1 }
554     return $r . $s;
555     }
556     }
557     sub _i2g ($%%) {
558     my ($s, $C, %O) = @_;
559     my $r = '';
560 wakaba 1.4 my $set = $CHARSET{$O{type}}->{$O{charset}.
561     ($O{revision}&&$C->{option}->{use_revision}?$O{revision}:'')};
562 wakaba 1.1 my $set0 = $CHARSET{$O{type}}->{$O{charset_id}};
563     ## -- designate character set
564     my $G = 0;
565     if ($C->{G0} eq $set || $C->{G0} eq $set0) { $G = 0 }
566     elsif ($C->{G1} eq $set || $C->{G1} eq $set0) { $G = 1 }
567     elsif ($C->{G2} eq $set || $C->{G2} eq $set0) { $G = 2 }
568     elsif ($C->{G3} eq $set || $C->{G3} eq $set0) { $G = 3 }
569     else {
570     return undef unless $set; ## charset does not have F byte
571     $G = 1 if $O{type} eq 'G96' || $O{type} eq 'G96n';
572     for ($C->{option}->{designate_to}->{$O{type}}->{$O{charset}},
573     $C->{option}->{designate_to}->{$O{type}}->{default}) {
574     if (defined $_) {
575     $G = $_; last;
576     }
577     }
578     if ($G == -1) {
579     return undef;
580     }
581     if ($O{type} eq 'G94') {
582     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
583     ."\x1B".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
584     } elsif ($O{type} eq 'G94n') {
585     if ($G == 0 && !$C->{option}->{G94n_designate_long}
586     && ($O{charset} eq '@' || $O{charset} eq 'A' || $O{charset} eq 'B')) {
587     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
588     ."\x1B\x24".$O{charset};
589     } else {
590     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
591     ."\x1B\x24".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
592     }
593     } elsif ($O{type} eq 'G96') {
594     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
595     ."\x1B".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
596     } elsif ($O{type} eq 'G96n') {
597     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
598     ."\x1B\x24".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
599     }
600     $C->{'G'.$G} = $CHARSET{$O{type}}->{$O{charset}};
601     }
602     ## -- invoke G buffer
603     my $left = $C->{option}->{Ginvoke_to_left}->[$G];
604     if ($C->{GL} eq 'G'.$G) {
605     $left = 1;
606     } elsif ($C->{GR} eq 'G'.$G) {
607     $left = 0;
608     } else {
609     if ($C->{option}->{Ginvoke_by_single_shift}->[$G]) {
610     if ($C->{C1}->{'C_SS'.$G}) {
611     $r .= _i2c ($C->{C1}->{'C_SS'.$G}, $C, type => 'C1') || return undef;
612     } elsif ($C->{C0}->{'C_SS'.$G}) {
613     $r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef;
614     } else { ## Both C0 and C1 set do not have SS2/3.
615 wakaba 1.4 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
616 wakaba 1.1 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
617     }
618     } else {
619     $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
620     $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
621     }
622     }
623     $s =~ tr/\x00-\x7F/\x80-\xFF/ unless $left;
624     $r . $s;
625     }
626 wakaba 1.15 sub _back2ascii ($;%) {
627 wakaba 1.1 my ($C, %O) = @_;
628     my $r = '';
629 wakaba 1.15 $O{at} ||= 'reset';
630     if ($C->{option}->{$O{at}}->{Gdesignation}||$C->{option}->{reset}->{Gdesignation}) {
631     my $F = $C->{option}->{$O{at}}->{Gdesignation}
632     || $C->{option}->{reset}->{Gdesignation}; # \x42
633 wakaba 1.1 $r .= "\x1B\x28".$F unless $C->{G0} eq $CHARSET{G94}->{$F};
634     $C->{G0} = $CHARSET{G94}->{$F};
635     if ($O{reset_all}) {
636     $C->{G1} = $CHARSET{G94}->{"\x7E"};
637     $C->{G2} = $CHARSET{G94}->{"\x7E"};
638     $C->{G3} = $CHARSET{G94}->{"\x7E"};
639     }
640     }
641 wakaba 1.15 if ($C->{option}->{$O{at}}->{Ginvoke}||$C->{option}->{reset}->{Ginvoke}) {
642 wakaba 1.1 if ($C->{GL} ne 'G0') {
643     $r .= $C->{C0}->{C_LS0} || ($C->{C0} = $CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F");
644     $C->{GL} = 'G0';
645     }
646     $C->{GR} = undef if $O{reset_all};
647     }
648     $r;
649     }
650     ## __invoke (\%C, $G, $left_or_right)
651     sub __invoke (\%$$) {
652     my ($C, $G) = @_;
653     if ($_[2]) {
654     $C->{GL} = 'G'.$G;
655     return ($C->{C0}->{C_LS0}
656     || scalar ($C->{C0}=$CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F"),
657     $C->{C0}->{C_LS1}, "\x1B\x6E", "\x1B\x6F")[$G];
658     } else {
659     $C->{GR} = 'G'.$G;
660     return ("", "\x1B\x7E", "\x1B\x7D", "\x1B\x7C")[$G];
661     }
662     '';
663     }
664 wakaba 1.5 sub _i2o ($\%%) {
665     my ($s, $C, %O) = @_;
666     my $CS = $CODING_SYSTEM{ $O{cs_F} } || $CODING_SYSTEM{ $O{cs_id} } || return undef;
667     my $r = '';
668     if ($CS ne $C->{coding_system}) {
669     my $e = '';
670     $e .= "\x1B\x25";
671 wakaba 1.8 $e .= $O{cs_F} || $C->{option}->{private_set}->{coding_system}->{ $O{cs_id} }
672 wakaba 1.5 || return undef;
673     if ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x40"}
674     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x43"}
675     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x45"}
676     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4A"}
677     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4B"}
678     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4C"}) {
679     $e =~ s/(.)/\x00$1/go;
680     } elsif ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x41"}
681     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x44"}
682     || $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x46"}) {
683     $e =~ s/(.)/\x00\x00\x00$1/go;
684     }
685     $r .= $e;
686     $C->{coding_system} = $CS;
687     if ($CS->{reset_state}) {
688     $C->{GL} = undef; $C->{GR} = undef;
689     $C->{C0} = $CHARSET{C0}->{"\x7E"};
690     $C->{C1} = $CHARSET{C1}->{"\x7E"};
691     $C->{G0} = $CHARSET{G94}->{"\x7E"};
692     $C->{G1} = $CHARSET{G94}->{"\x7E"};
693     $C->{G2} = $CHARSET{G94}->{"\x7E"};
694     $C->{G3} = $CHARSET{G94}->{"\x7E"};
695     }
696     }
697     if ($CS eq $CODING_SYSTEM{"\x40"}) {
698     #
699     } elsif ($CS eq $CODING_SYSTEM{G} || $CS eq $CODING_SYSTEM{'/G'}
700     || $CS eq $CODING_SYSTEM{'/H'} || $CS eq $CODING_SYSTEM{'/I'}) {
701     Encode::_utf8_off ($s);
702     } elsif ($CS eq $CODING_SYSTEM{'/@'} || $CS eq $CODING_SYSTEM{'/C'}
703     || $CS eq $CODING_SYSTEM{'/E'}) {
704     $s = Encode::encode ('ucs-2be', $s);
705     } elsif ($CS eq $CODING_SYSTEM{'/A'} || $CS eq $CODING_SYSTEM{'/D'}
706     || $CS eq $CODING_SYSTEM{'/F'}) {
707     $s = Encode::encode ('ucs-4be', $s);
708     } elsif ($CS eq $CODING_SYSTEM{'/J'} || $CS eq $CODING_SYSTEM{'/K'}
709     || $CS eq $CODING_SYSTEM{'/L'}) {
710     $s = Encode::encode ('UTF-16BE', $s);
711     } elsif ($CS eq $CODING_SYSTEM{B}) {
712     $s = Encode::encode ('utf-1', $s);
713     } else {
714     return undef;
715     }
716     $r . $s;
717 wakaba 1.1 }
718    
719     =head1 SEE ALSO
720    
721     ISO/IEC 646:1991, "7-bit coded graphic character set for intormation interchange".
722    
723     ISO/IEC 2022:1994, "Character Code Structure and Extension Techniques".
724     (IDT with ECMA 35, JIS X 0202:1998)
725    
726     ISO/IEC 4873:1991, "8-Bit Coded Character Set Structure and Rules".
727     (IDT with ECMA 43)
728    
729     ISO/IEC 6429:1992, "Control Functions for Coded Character Sets".
730     (IDT with ECMA 48:1991, JIS X 0211:1998)
731    
732     ISO/IEC 8859, "8-Bit Single-Byte Coded Graphic Character Sets".
733    
734 wakaba 1.12 L<Encode>, perlunicode
735 wakaba 1.1
736 wakaba 1.4 =head1 TODO
737    
738     =over 4
739    
740     =item NCR (coding system other than ISO/IEC 2022) support
741    
742     =over 2
743    
744     =item ESC 02/05 02/15 03/x of X Compound Text
745    
746     =back
747    
748     =item Output of control character sets, single control functions
749    
750     =item Designation sequence of control character sets (input)
751    
752     =item Special graphic character sets such as G3 of EUC-TW
753    
754     =item SUPER SHIFT (SS) invoke function of old control character set
755    
756     =item Safe transparent of control string (ISO/IEC 6429)
757    
758     =item Output of unoutputable characters as alternative notation such as SGML-like entity
759    
760     =item C0 set invoked to CR area like ISIRI code
761    
762     Really need?
763    
764     =item special treatment of 0x20, 0x7E, 0xA0, 0xFF
765    
766     For example, GB mongolian sets use MSP (MONGOLIAN SPACE)
767     with these code positions.
768    
769     And, no less coding systems does not use (or does ban using) DEL.
770    
771     =item A lot of character sets don't have pseudo-UCS mapping.
772    
773     Most of 9m^n (n >= 3) sets, 9m^n sets with I byte, 9m^n
774     DRCSes do not have pseudo-UCS mapping area. It is
775     questionable to allocate lots of code positions to these
776     rarely-(or no-)used character sets.
777    
778     =item Even character sets that have pseudo-UCS mapping, some of them can't be outputed in ISO/IEC 2022.
779    
780     Because output of rarely-used character sets is
781     not implemented yet.
782    
783     =back
784    
785 wakaba 1.5 =head1 AUTHORS
786    
787 wakaba 1.12 Nanashi-san <nanashi.san@nanashi.invalid>
788 wakaba 1.5
789     Wakaba <w@suika.fam.cx>
790    
791 wakaba 1.1 =head1 LICENSE
792    
793 wakaba 1.12 Copyright 2002 AUTHORS, all rights reserved.
794 wakaba 1.1
795     This library is free software; you can redistribute it
796     and/or modify it under the same terms as Perl itself.
797    
798     =cut
799    
800 wakaba 1.15 1; # $Date: 2002/12/18 10:21:09 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24