/[suikacvs]/perl/lib/Encode/ISO2022.pm
Suika

Contents of /perl/lib/Encode/ISO2022.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (hide annotations) (download)
Mon Sep 16 06:35:16 2002 UTC (22 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.3: +57 -4 lines
2002-09-16  Wakaba <w@suika.fam.cx>

	* ISO2022.pm:
	- (iso2022_to_internal): Invoke G1,G2,G3 by locking
	shifts of ESC Fs style.
	- (make_initial_charset): Create charset definition
	of 94^2 DRCSes.
	- (undef_char): New option.
	- (pod:TODO): New section.
	* HZ.pm:
	- (__hz_encoding_name): New function.
	- (Encode::HZ): Added new alias names.
	- (Encode::HZ::HZ165): New package.
	- (pod:ENCODINGS): New section.

1 wakaba 1.1
2     =head1 NAME
3    
4     Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder
5    
6     =cut
7    
8     require v5.7.3;
9     package Encode::ISO2022;
10     use strict;
11     use vars qw(%CHARSET $VERSION);
12 wakaba 1.4 $VERSION=do{my @r=(q$Revision: 1.3 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
13 wakaba 1.1 use base qw(Encode::Encoding);
14 wakaba 1.2 __PACKAGE__->Define (qw/iso-2022 iso2022 2022 cp2022/);
15 wakaba 1.1
16     ### --- Intialization
17    
18     my %_CHARS_to_RANGE = (
19     l94 => q/[\x21-\x7E]/,
20     l96 => q/[\x20-\x7F]/,
21     l128 => q/[\x00-\x7F]/,
22     l256 => q/[\x00-\xFF]/,
23     r94 => q/[\xA1-\xFE]/,
24     r96 => q/[\xA0-\xFF]/,
25     r128 => q/[\x80-\xFF]/,
26     r256 => q/[\x80-\xFF]/,
27     b94 => q/[\x21-\x7E\xA1-\xFE]/,
28     b96 => q/[\x20-\x7F\xA0-\xFF]/,
29     b128 => q/[\x00-\xFF]/,
30     b256 => q/[\x00-\xFF]/,
31     );
32    
33     ## --- Make initial charset definitions
34     &_make_initial_charsets;
35     sub _make_initial_charsets () {
36     for my $f (0x30..0x7E) {
37     my $F = pack 'C', $f;
38     for ('', '!', '"', '#') {
39     $CHARSET{G94}->{ $_.$F }->{dimension} = 1;
40     $CHARSET{G94}->{ $_.$F }->{chars} = 94;
41     $CHARSET{G94}->{ $_.$F }->{ucs} =
42     {'' => 0xE90940, '!' => 0xE944A0, '"' => 0xE98000, '#' => 0xE9BB60}->{ $_ }
43     + 94 * ($f-0x30);
44    
45     $CHARSET{G96}->{ $_.$F }->{dimension} = 1;
46     $CHARSET{G96}->{ $_.$F }->{chars} = 96;
47     $CHARSET{G96}->{ $_.$F }->{ucs} =
48     {'' => 0xE926A0, '!' => 0xE96200, '"' => 0xE99D60, '#' => 0xE9D8C0}->{ $_ }
49     + 96 * ($f-0x30);
50    
51     $CHARSET{C0}->{ $_.$F }->{dimension} = 1;
52     $CHARSET{C0}->{ $_.$F }->{chars} = 32;
53     $CHARSET{C0}->{ $_.$F }->{ucs} =
54     {'' => 0x70000000, '!' => 0x70001400,
55     '"' => 0x70002800, '#' => 0x70003C00}->{ $_ } + 32 * ($f-0x30);
56    
57     $CHARSET{C1}->{ $_.$F }->{dimension} = 1;
58     $CHARSET{C1}->{ $_.$F }->{chars} = 32;
59     $CHARSET{C1}->{ $_.$F }->{ucs} =
60     {'' => 0x70000A00, '!' => 0x70001E00,
61     '"' => 0x70003200, '#' => 0x70004600}->{ $_ } + 32 * ($f-0x30);
62    
63     $CHARSET{G94}->{ ' '.$_.$F }->{dimension} = 1; ## DRCS
64     $CHARSET{G94}->{ ' '.$_.$F }->{chars} = 94;
65     $CHARSET{G94}->{ ' '.$_.$F }->{ucs} =
66     {'' => 0x70090940, '!' => 0x700944A0,
67     '"' => 0x70098000, '#' => 0x7009BB60}->{ $_ } + 94 * ($f-0x30);
68    
69     $CHARSET{G96}->{ ' '.$_.$F }->{dimension} = 1; ## DRCS
70     $CHARSET{G96}->{ ' '.$_.$F }->{chars} = 96;
71     $CHARSET{G96}->{ ' '.$_.$F }->{ucs} =
72     {'' => 0x700926A0, '!' => 0x70096200,
73     '"' => 0x70099D60, '#' => 0x7009D8C0}->{ $_ } + 96 * ($f-0x30);
74     }
75     }
76     for my $f (0x30..0x5F, 0x7E) {
77     my $F = pack 'C', $f;
78 wakaba 1.3 for ('', '!', '"', '#', ' ') {
79 wakaba 1.1 $CHARSET{G94n}->{ $_.$F }->{dimension} = 2;
80     $CHARSET{G94n}->{ $_.$F }->{chars} = 94;
81     $CHARSET{G94n}->{ $_.$F }->{ucs} =
82     ({'' => 0xE9F6C0}->{ $_ }||0) + 94*94 * ($f-0x30);
83 wakaba 1.3 ## BUG: 94^n sets with I byte have no mapping area
84 wakaba 1.1
85     $CHARSET{G96n}->{ $_.$F }->{dimension} = 2;
86     $CHARSET{G96n}->{ $_.$F }->{chars} = 96;
87     $CHARSET{G96n}->{ $_.$F }->{ucs} =
88     ({'' => 0xF4C000}->{ $_ }||0) + 96*96 * ($f-0x30);
89     ## BUG: 94^n DRCSes with I byte have no mapping area
90     }
91     }
92     for (0x60..0x6F) {
93     my $F = pack 'C', $_;
94     ## BUG: 9x^3 sets have no mapping area
95     for ('', '!', '"', '#', ' ') {
96     $CHARSET{G94n}->{ $_.$F }->{dimension} = 3;
97     $CHARSET{G94n}->{ $_.$F }->{chars} = 94;
98    
99     $CHARSET{G96n}->{ $_.$F }->{dimension} = 3;
100     $CHARSET{G96n}->{ $_.$F }->{chars} = 96;
101     }
102     }
103     for (0x70..0x7D) {
104     my $F = pack 'C', $_;
105     ## BUG: 9x^4 sets have no mapping area
106     for ('', '!', '"', '#', ' ') {
107     $CHARSET{G94n}->{ $_.$F }->{dimension} = 4;
108     $CHARSET{G94n}->{ $_.$F }->{chars} = 94;
109    
110     $CHARSET{G96n}->{ $_.$F }->{dimension} = 4;
111     $CHARSET{G96n}->{ $_.$F }->{chars} = 96;
112     }
113     }
114     for my $f (0x40..0x4E) {
115     my $F = pack 'C', $f;
116     $CHARSET{G96n}->{ ' '.$F }->{dimension} = 2;
117     $CHARSET{G96n}->{ ' '.$F }->{chars} = 96;
118     $CHARSET{G96n}->{ ' '.$F }->{ucs} = 0xF0000 + 96*96*($f-0x40);
119     ## U+F0000-U+10F7FF (private) -> ESC 02/04 02/00 <I> (04/00-04/14) (DRCS)
120     }
121    
122     $CHARSET{G94}->{B}->{ucs} = 0x21; ## ASCII
123     $CHARSET{G96}->{A}->{ucs} = 0xA0; ## ISO 8859-1
124    
125     $CHARSET{G94n}->{'B@'}->{dimension} = 2; ## JIS X 0208-1990
126     $CHARSET{G94n}->{'B@'}->{chars} = 94;
127     $CHARSET{G94n}->{'B@'}->{ucs} = 0xE9F6C0 + 94*94*79;
128    
129     ## -- Control character sets
130     $CHARSET{C0}->{'@'}->{ucs} = 0x00; ## ISO/IEC 6429 C0
131     for ("\x40", "\x43", "\x44", "\x45", "\x46", "\x49", "\x4A", "\x4B", "\x4C") {
132     $CHARSET{C0}->{$_}->{C_LS0} = "\x0F";
133     $CHARSET{C0}->{$_}->{C_LS1} = "\x0E";
134     $CHARSET{C0}->{$_}->{r_LS0} = '\x0F';
135     $CHARSET{C0}->{$_}->{r_LS1} = '\x0E';
136     }
137     for ("\x40", "\x44", "\x45", "\x46", "\x48", "\x4C") {
138     $CHARSET{C0}->{$_}->{reset_all} = {"\x0A" => 1, "\x0B" => 1,
139     "\x0C" => 1, "\x0D" => 1};
140     }
141     $CHARSET{C0}->{"\x43"}->{reset_all} = {"\x0A" => 1};
142     $CHARSET{C0}->{"\x44"}->{C_SS2} = "\x1C";
143     $CHARSET{C0}->{"\x44"}->{r_SS2} = '\x1C';
144     for ("\x45", "\x49", "\x4A", "\x4B") {
145     $CHARSET{C0}->{$_}->{C_SS2} = "\x19";
146     $CHARSET{C0}->{$_}->{C_SS3} = "\x1D";
147     $CHARSET{C0}->{$_}->{r_SS2} = '\x19';
148     $CHARSET{C0}->{$_}->{r_SS3} = '\x1D';
149     }
150     $CHARSET{C0}->{"\x4C"}->{C_SS2} = "\x19";
151     $CHARSET{C0}->{"\x4C"}->{r_SS2} = '\x19';
152    
153     $CHARSET{C1}->{'64291991C1'}->{dimension} = 1; ## ISO/IEC 6429:1991 C1
154     $CHARSET{C1}->{'64291991C1'}->{chars} = 32;
155     $CHARSET{C1}->{'64291991C1'}->{ucs} = 0x80;
156     for ("\x43", "\x45", "\x47", '64291991C1') {
157     $CHARSET{C1}->{$_}->{C_SS2} = "\x8E";
158     $CHARSET{C1}->{$_}->{C_SS3} = "\x8F";
159     $CHARSET{C1}->{$_}->{r_SS2} = '\x8E';
160     $CHARSET{C1}->{$_}->{r_SS3} = '\x8F';
161     $CHARSET{C1}->{$_}->{r_SS2_ESC} = '\x1B\x4E';
162     $CHARSET{C1}->{$_}->{r_SS3_ESC} = '\x1B\x4F';
163     }
164     for ("\x43", '64291991C1') {
165     $CHARSET{C1}->{$_}->{r_CSI} = '\x9B';
166     $CHARSET{C1}->{$_}->{r_CSI_ESC} = '\x1B\x5B';
167     $CHARSET{C1}->{$_}->{r_DCS} = '\x90';
168     $CHARSET{C1}->{$_}->{r_ST} = '\x9C';
169     $CHARSET{C1}->{$_}->{r_OSC} = '\x9D';
170     $CHARSET{C1}->{$_}->{r_PM} = '\x9E';
171     $CHARSET{C1}->{$_}->{r_APC} = '\x9F';
172     $CHARSET{C1}->{$_}->{reset_all} = {"\x85"=>1, "\x90"=>1,
173     "\x9C"=>1, "\x9D"=>1, "\x9E"=>1, "\x9F"=>1};
174     }
175     $CHARSET{C1}->{'64291991C1'}->{r_SCI} = '\x9A';
176    
177     $CHARSET{single_control}->{Fs} ={ucs => 0x70005000, chars => 32, dimension => 1};
178     $CHARSET{single_control}->{'3F'} ={ucs => 0x70005020, chars => 80, dimension => 1};
179     $CHARSET{single_control}->{'3F!'}={ucs => 0x70005070, chars => 80, dimension => 1};
180     $CHARSET{single_control}->{'3F"'}={ucs => 0x700050C0, chars => 80, dimension => 1};
181     $CHARSET{single_control}->{'3F#'}={ucs => 0x70005110, chars => 80, dimension => 1};
182     }
183    
184    
185     ### --- Perl Encode module common functions
186    
187     sub encode ($$;$) {
188     my ($obj, $str, $chk) = @_;
189     $_[1] = '' if $chk;
190     $str = &internal_to_iso2022 ($str);
191     return $str;
192     }
193    
194     sub decode ($$;$) {
195     my ($obj, $str, $chk) = @_;
196     $_[1] = '' if $chk;
197     return &iso2022_to_internal ($str);
198     }
199    
200     ### --- Encode::ISO2022 unique functions
201    
202     ## Make a new ISO/IEC 2022-buffers object with default status
203     sub new_object {
204     my %C;
205     $C{bit} = 8;
206     $C{CL} = 'C0'; $C{CR} = 'C1'; $C{ESC_Fe} = 'C1';
207     $C{C0} = $CHARSET{C0}->{"\x40"}; ## ISO/IEC 6429:1991 C0
208     $C{C1} = $CHARSET{C1}->{'64291991C1'}; ## ISO/IEC 6429:1991 C1
209     $C{GL} = 'G0'; $C{GR} = 'G1';
210     $C{G0} = $CHARSET{G94}->{"\x42"}; ## ISO/IEC 646:1991 IRV
211     #$C{G1} = $CHARSET{G96}->{"\x41"}; ## ISO/IEC 8859-1 GR
212     $C{G1} = $CHARSET{G94}->{"\x7E"}; ## empty set
213     $C{G2} = $CHARSET{G94}->{"\x7E"}; ## empty set
214     $C{G3} = $CHARSET{G94}->{"\x7E"}; ## empty set
215     $C{option} = {
216     C1invoke_to_right => 0, ## C1 invoked to: (0: ESC Fe, 1: CR)
217     G94n_designate_long => 0, ## (1: ESC 02/04 02/08 04/00..02)
218     designate_to => { ## Designated G buffer (-1: not be outputed)
219     C0 => {
220     default => 0,
221     },
222     C1 => {
223     default => 1,
224     },
225     G94 => {
226     "\x42" => 0,
227     default => 0,
228     },
229     G96 => {
230     default => 1,
231     },
232     G94n => {
233     default => 0,
234     },
235     G96n => {
236     default => 1,
237     },
238     },
239     Ginvoke_by_single_shift => [0,0,0,0], ## Invoked by SS
240     Ginvoke_to_left => [1,1,1,1], ## Which invoked to? (1: L, 0: R)
241     private_set => { ## Private set vs Final byte
242     C0 => [],
243     C1 => [],
244     G94 => [],
245     G94n => [[],[],[],[],[]],
246     G96 => [],
247     #G96n => [], ## (not implemented)
248     U96n => [], ## mule-unicode sets
249     XC1 => {
250     '64291991C1' => undef, ## ISO/IEC 6429:1991 C1
251     },
252     },
253     reset => { ## Reset status at top of line
254     Gdesignation => "\x42", ## F of designation or 0
255     Ginvoke => 1,
256     },
257 wakaba 1.4 undef_char => ["\x3F", {type => 'G94', charset => 'B'}],
258 wakaba 1.1 use_revision => 1, ## Output IRR
259     };
260     \%C;
261     }
262    
263     sub iso2022_to_internal ($;\%) {
264     my ($s, $C) = @_;
265     my %_GB_to_GN = (
266     "\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3',
267     "\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3',
268     );
269     $C ||= &new_object;
270    
271     use re 'eval';
272     $s =~ s{
273     ((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}}
274     . qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ }))
275     |((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GL}}->{chars}}
276     . qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ }))
277    
278     | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)')
279     . ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ?
280     qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '')
281     . ($C->{$C->{CL}}->{r_SS2} ? qq/|$C->{$C->{CL}}->{r_SS2}/ : '') . q/)/
282     . ( $C->{$C->{CL}}->{r_LS0}
283     ||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
284     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
285     })
286     ((??{ $_CHARS_to_RANGE{'b'.$C->{$C->{GL}}->{chars}}
287     . qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ }))
288     | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)')
289     . ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ?
290     qq/|$C->{$C->{ESC_Fe}}->{r_SS3_ESC}/ : '')
291     . ($C->{$C->{CL}}->{r_SS3} ? qq/|$C->{$C->{CL}}->{r_SS3}/ : '') . q/)/
292     . ( $C->{$C->{CL}}->{r_LS0}
293     || $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
294     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
295     })
296     ((??{ $_CHARS_to_RANGE{'b'.$C->{$C->{GL}}->{chars}}
297     . qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ }))
298    
299 wakaba 1.3 ## Locking shift
300     |( \x1B[\x6E\x6F\x7C-\x7E]
301     |(??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })
302     |(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })
303     )
304 wakaba 1.1
305     ## Control sequence
306     |(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)')
307     .($C->{$C->{ESC_Fe}}->{r_CSI_ESC} ?
308     qq/|$C->{$C->{ESC_Fe}}->{r_CSI_ESC}/: '')
309     .')'
310     })
311     ((??{ qq/[\x30-\x3F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xB0-\xBF]*/
312     .qq/[\x20-\x2F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xA0-\xAF]*/
313     }) [\x40-\x7E\xD0-\xFE])
314    
315     ## Other escape sequence
316     |(\x1B[\x20-\x2F]*[\x30-\x7E])
317    
318     ## Misc. sequence (SP, control, or broken data)
319     |([\x00-\xFF])
320     }{
321 wakaba 1.3 my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc)
322     = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
323 wakaba 1.1 $C->{_irr} = undef unless defined $esc;
324     ## GL graphic character
325     if (defined $gl) {
326     my $c = 0;
327     my $m = $C->{$C->{GL}}->{chars}==94?0x21:$C->{$C->{GL}}->{chars}==96?0x20:0;
328     for (split //, $gl) {
329     $c = $c * $C->{$C->{GL}}->{chars} + unpack ('C', $_) - $m;
330     }
331     chr ($C->{$C->{GL}}->{ucs} + $c);
332     ## Control, SP, or broken data
333     ## TODO: support control sets other than ISO/IEC 6429's
334     } elsif (defined $misc) {
335     $misc;
336     ## GR graphic character
337     } elsif ($gr) {
338     my $c = 0;
339     my $m = $C->{$C->{GR}}->{chars}==94?0xA1:$C->{$C->{GR}}->{chars}==96?0xA0:0x80;
340     for (split //, $gr) {
341     $c = $c * $C->{$C->{GR}}->{chars} + unpack ('C', $_) - $m;
342     }
343     chr ($C->{$C->{GR}}->{ucs} + $c);
344     ## Graphic character with SS2
345     } elsif ($ss2) {
346     $ss2 =~ tr/\x80-\xFF/\x00-\x7F/;
347     my $c = 0; my $m = $C->{G2}->{chars}==94?0x21:$C->{G2}->{chars}==96?0x20:0;
348     for (split //, $ss2) {
349     $c = $c * $C->{G2}->{chars} + unpack ('C', $_) - $m;
350     }
351     chr ($C->{G2}->{ucs} + $c);
352     ## Graphic character with SS3
353     } elsif ($ss3) {
354     $ss3 =~ tr/\x80-\xFF/\x00-\x7F/;
355     my $c = 0; my $m = $C->{G3}->{chars}==94?0x21:$C->{G3}->{chars}==96?0x20:0;
356     for (split //, $ss3) {
357     $c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m;
358     }
359     chr ($C->{G3}->{ucs} + $c);
360     ## Escape sequence
361     } elsif ($esc) {
362     ## IRR (revision number)
363     if ($esc =~ /\x1B\x26([\x40-\x7E])/) {
364     $C->{_irr} = $1; $esc = '';
365     } else {
366     $esc =~ s{
367     \x1B([\x28-\x2B])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^1
368     |\x1B\x24([\x28-\x2B]?)(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^n
369    
370     |\x1B([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^1
371     |\x1B\x24([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^n
372    
373     |\x1B([\x40-\x5F]) ## ESC Fe
374    
375     |\x1B\x21([\x21-\x23]?[\x30-\x7E]) ## CL = C0
376     |\x1B\x22([\x21-\x23]?[\x30-\x7E]) ## CR & ESC Fe = C1
377    
378     |\x1B([\x60-\x7E]) ## Single control functions
379     |\x1B\x23([\x21-\x23]?)([\x30-\x7E])
380    
381     |\x1B\x20([\x40-\x7E]) ## Announcer
382     }{
383     my ($g94_g,$g94_f,$g94n_g,$g94n_f,$g96_g,$g96_f,$g96n_g,$g96n_f,$Fe,
384     $CZD, $C1D, $Fs, $sI, $sF,$ACS)
385     = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15);
386     my $rev = $C->{_irr} || '';
387     if ($g94_g) {
388     $C->{ $_GB_to_GN{ $g94_g } } = $CHARSET{G94}->{ $g94_f.$rev }
389     || $CHARSET{G94}->{ $g94_f }
390     || $CHARSET{G94}->{ "\x7E" }; '';
391     } elsif (defined $g94n_f) {
392     $C->{ $_GB_to_GN{ $g94n_g } || 'G0' } = $CHARSET{G94n}->{ $g94n_f.$rev }
393     || $CHARSET{G94n}->{ $g94n_f }
394     || $CHARSET{G94n}->{ "\x7E" }; '';
395     } elsif ($g96_g) {
396     $C->{ $_GB_to_GN{ $g96_g } } = $CHARSET{G96}->{ $g96_f.$rev }
397     || $CHARSET{G96}->{ $g96_f }
398     || $CHARSET{G96}->{ "\x7E" }; '';
399     } elsif (defined $g96n_f) {
400     $C->{ $_GB_to_GN{ $g96n_g } } = $CHARSET{G96n}->{ $g96n_f.$rev }
401     || $CHARSET{G96n}->{ $g96n_f }
402     || $CHARSET{G96n}->{ "\x7E" }; '';
403     } elsif ($Fe) { ## ESC Fe => C1
404     chr ($C->{ $C->{ESC_Fe} }->{ucs} + (ord ($Fe) - 0x40));
405     } elsif ($CZD) {
406     $C->{C0} = $CHARSET{C0}->{ $CZD.$rev }
407     || $CHARSET{C0}->{ $CZD } || $CHARSET{C0}->{ "\x7E" }; '';
408     } elsif ($C1D) {
409     $C->{C1} = $CHARSET{C1}->{ $C1D.$rev }
410     || $CHARSET{C1}->{ $C1D } || $CHARSET{C1}->{ "\x7E" }; '';
411     } elsif ($Fs) {
412     if ($Fs eq "\x6E") { ## LS2
413     $C->{GL} = 'G2'; '';
414     } elsif ($Fs eq "\x6F") { ## LS3
415     $C->{GL} = 'G3'; '';
416     } elsif ($Fs eq "\x7E") { ## LS1R
417     $C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; '';
418     } elsif ($Fs eq "\x7D") { ## LS2R
419     $C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; '';
420     } elsif ($Fs eq "\x7C") { ## LS3R
421     $C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; '';
422     } else {
423     chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60));
424     }
425     } elsif ($sI) {
426     chr ($CHARSET{single_control}->{'3F'.$sI}->{ucs} + (ord ($sF) - 0x30));
427     } elsif ($ACS) { ## Announcer
428     if ($ACS eq "\x4A") { $C->{bit} = 7 }
429     elsif ($ACS eq "\x4B") { $C->{bit} = 8 }
430     '';
431     }
432     }gex;
433     $C->{_irr} = undef;
434     }
435     $esc;
436 wakaba 1.3 ## Locking shifts
437     } elsif ($ls) {
438     if ($ls eq $C->{$C->{CL}}->{LS0}) {
439     $C->{GL} = 'G0'; '';
440     } elsif ($ls eq $C->{$C->{CL}}->{LS1}) {
441     $C->{GL} = 'G1'; '';
442     } elsif ($ls =~ /\x1B([\x6E\x6F])/) {
443     $C->{GL} = {"\x6E"=>2, "\x6F"=>3}->{$1}; '';
444     } elsif ($ls =~ /\x1B([\x7C-\x7E])/) {
445     $C->{GR} = {"\x7E"=>1, "\x7D"=>2, "\x7C"=>3}->{$1}; '';
446     }
447 wakaba 1.1 ## Control sequence
448     } elsif ($csi) {
449     $csi =~ tr/\xA0-\xFF/\x20-\x7F/d;
450     $csi =~ s/$C->{$C->{CL}}->{LS0}//g if $C->{$C->{CL}}->{LS0};
451     $csi =~ s/$C->{$C->{CL}}->{LS1}//g if $C->{$C->{CL}}->{LS1};
452     "\x9B".$csi;
453     }
454     }gex;
455     $s;
456     }
457    
458     sub internal_to_iso2022 ($\%) {
459     my ($s, $C) = @_;
460     $C ||= &new_object;
461    
462     my $r = '';
463     for my $c (split //, $s) {
464     my $cc = ord $c;
465     my $t;
466     if ($cc <= 0x1F) {
467     $t = _i2c ($c, $C, type => 'C0', charset => '@');
468     } elsif ($cc == 0x20 || $cc == 0x7F) {
469     $t = _back2ascii ($C) . $c;
470     } elsif ($cc < 0x7F) {
471     $t = _i2g ($c, $C, type => 'G94', charset => 'B');
472     } elsif ($cc <= 0x9F) {
473     $t = _i2c ($c, $C, type => 'C1', charset_id => '64291991C1',
474     charset => $C->{private_set}->{XC1}->{'64291991C1'});
475     } elsif ($cc <= 0xFF) {
476     $t = _i2g (chr($cc-0x80), $C, type => 'G96', charset => 'A');
477     } elsif ($cc <= 0x24FF) {
478     my $c = $cc - 0x100;
479     my $final = $C->{private_set}->{U96n}->[0];
480     if (length $final) {
481     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
482     type => 'G96n', charset => $final);
483     }
484     } elsif ($cc <= 0x33FF) {
485     my $c = $cc - 0x2500;
486     my $final = $C->{private_set}->{U96n}->[1];
487     if (length $final) {
488     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
489     type => 'G96n', charset => $final);
490     }
491     } elsif (0xE000 <= $cc && $cc <= 0xFFFF) {
492     my $c = $cc - 0xE000;
493     my $final = $C->{private_set}->{U96n}->[2];
494     if (length $final) {
495     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
496     type => 'G96n', charset => $final);
497     }
498    
499     } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
500     my $c = $cc - 0xE9F6C0;
501     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
502     type => 'G94n', charset => chr(($c / 8836)+0x30));
503     } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
504     my $c = $cc - 0xF49D7C;
505     $t = _i2g (chr(($c / 94)+0x21).chr(($c % 94)+0x21), $C,
506     type => 'G94n', charset => 'B', revision => '@');
507    
508     } elsif (0xF0000 <= $cc && $cc <= 0x10F7FF) {
509     my $c = $cc - 0xF0000;
510     $t = _i2g (chr((($c % 9216) / 96)+0x20).chr(($c % 96)+0x20), $C,
511     type => 'G96n', charset => "\x20".chr(($c / 9216)+0x40));
512     } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
513     my $c = $cc - 0xE90940;
514     $t = _i2g (chr(($c % 94)+0x21), $C,
515     type => 'G94', charset => chr(($c / 94)+0x30));
516     } elsif (0xE92642 <= $cc && $cc <= 0xE9269F) {
517     my $c = $cc - 0xE92642;
518     $t = _i2g (chr($c+0x21),$C,type => 'G94', charset => "\x64", revision => '@');
519     } elsif (0xE926A0 <= $cc && $cc <= 0xE9443F) {
520     my $c = $cc - 0xE926A0;
521     $t = _i2g (chr(($c % 96)+0x20), $C,
522     type => 'G96', charset => chr(($c / 96)+0x30));
523     } elsif (0xE944A0 <= $cc && $cc <= 0xE961A1) {
524     my $c = $cc - 0xE944A0;
525     $t = _i2g (chr(($c % 94)+0x21), $C,
526     type => 'G94', charset => '!'.chr(($c / 94)+0x30));
527     } elsif (0xE96200 <= $cc && $cc <= 0xE97F9F) {
528     my $c = $cc - 0xE96200;
529     $t = _i2g (chr(($c % 96)+0x20), $C,
530     type => 'G96', charset => '!'.chr(($c / 96)+0x30));
531     } elsif (0xE98000 <= $cc && $cc <= 0xE99D01) {
532     my $c = $cc - 0xE98000;
533     $t = _i2g (chr(($c % 94)+0x21), $C,
534     type => 'G94', charset => '"'.chr(($c / 94)+0x30));
535     } elsif (0xE99D60 <= $cc && $cc <= 0xE9BAFF) {
536     my $c = $cc - 0xE99D60;
537     $t = _i2g (chr(($c % 96)+0x20), $C,
538     type => 'G96', charset => '"'.chr(($c / 96)+0x30));
539     } elsif (0xE9BB60 <= $cc && $cc <= 0xE9D861) {
540     my $c = $cc - 0xE9BB60;
541     $t = _i2g (chr(($c % 94)+0x21), $C,
542     type => 'G94', charset => '#'.chr(($c / 94)+0x30));
543     } elsif (0xE9D8C0 <= $cc && $cc <= 0xE9F65F) {
544     my $c = $cc - 0xE9D8C0;
545     $t = _i2g (chr(($c % 96)+0x20), $C,
546     type => 'G96', charset => '#'.chr(($c / 96)+0x30));
547     } elsif (0x70090940 <= $cc && $cc <= 0x70092641) {
548     my $c = $cc - 0x70090940;
549     $t = _i2g (chr(($c % 94)+0x21), $C,
550     type => 'G94', charset => "\x20".chr(($c / 94)+0x30));
551     } elsif (0x700926A0 <= $cc && $cc <= 0x7009443F) {
552     my $c = $cc - 0x700926A0;
553     $t = _i2g (chr(($c % 96)+0x20), $C,
554     type => 'G96', charset => "\x20".chr(($c / 96)+0x30));
555     ## TODO: DRCS with I byte: U+700944A0-U+7009F6BF
556     } elsif (0x70400000 <= $cc && $cc <= 0x7040FFED) {
557     my $c = $cc - 0x70400000;
558     $t = _i2g (chr(($c % 94)+0x21), $C, charset_id => 'P'.int ($c / 94),
559     type => 'G94', charset => $C->{private_set}->{G94}->[ $c / 94 ]);
560     } elsif (0x70410000 <= $cc && $cc <= 0x7041FFBF) {
561     my $c = $cc - 0x70410000;
562     $t = _i2g (chr(($c % 96)+0x20), $C, charset_id => 'P'.int ($c / 96),
563     type => 'G96', charset => $C->{private_set}->{G96}->[ $c / 96 ]);
564     } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
565     my $c = $cc % 0x10000;
566     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
567     type => 'G94n',
568 wakaba 1.2 charset_id => 'P'.int(($cc / 0x10000) - 0x7042).'_'.int($c / 8836),
569 wakaba 1.1 charset => $C->{private_set}->{G94n}->[ ($cc / 0x10000) - 0x7042 ]
570     ->[ $c / 8836 ]);
571     }
572     if (defined $t) {
573     $r .= $t;
574     } else {
575 wakaba 1.4 $r .= _i2g ($C->{option}->{undef_char}->[0], $C,
576     %{ $C->{option}->{undef_char}->[1] });
577 wakaba 1.1 }
578     }
579     $r . _back2ascii ($C);
580     }
581    
582     ## $O{charset} eq undef means that charset is same as the current designated one.
583     sub _i2c ($%%) {
584     my ($s, $C, %O) = @_;
585     my $r = '';
586     if ($O{type} eq 'C0') {
587     if (defined $O{charset}) {
588     if ( $C->{C0} ne $CHARSET{C0}->{$O{charset}}
589     && $C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
590     for ($C->{option}->{designate_to}->{C0}->{$O{charset}},
591     $C->{option}->{designate_to}->{C0}->{default}) {
592     if (defined $_) { return undef if $_ == -1; last }
593     }
594     $r .= "\x1B\x21".$O{charset};
595     $C->{C0} = $CHARSET{C0}->{$O{charset}};
596     }
597     } elsif (defined $O{charset_id}) {
598     if ($C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
599     return undef; ## Control set is not designated nor has F byte
600     }
601     }
602     $r .= _back2ascii ($C, reset_all => $C->{C0}->{reset_all}->{$s});
603     return $r . $s;
604     } elsif ($O{type} eq 'C1') {
605     if (defined $O{charset}) {
606     if ( $C->{C1} ne $CHARSET{C1}->{$O{charset}}
607     && $C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
608     for ($C->{option}->{designate_to}->{C1}->{$O{charset}},
609     $C->{option}->{designate_to}->{C1}->{default}) {
610     if (defined $_) { return undef if $_ == -1; last }
611     }
612     $r .= "\x1B\x22".$O{charset};
613     $C->{C1} = $CHARSET{C1}->{$O{charset}};
614     }
615     } elsif (defined $O{charset_id}) {
616     if ($C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
617     return undef; ## Control set is not designated nor has F byte
618     }
619     }
620     $r .= _back2ascii ($C, reset_all => $C->{C1}->{reset_all}->{$s});
621     unless ($C->{option}->{C1invoke_to_right}) { ## ESC Fe
622     $s =~ s/([\x80-\x9F])/"\x1B" . chr (ord ($1) - 0x40)/ge;
623     }
624     return $r . $s;
625     }
626     }
627     sub _i2g ($%%) {
628     my ($s, $C, %O) = @_;
629     my $r = '';
630 wakaba 1.4 my $set = $CHARSET{$O{type}}->{$O{charset}.
631     ($O{revision}&&$C->{option}->{use_revision}?$O{revision}:'')};
632 wakaba 1.1 my $set0 = $CHARSET{$O{type}}->{$O{charset_id}};
633     ## -- designate character set
634     my $G = 0;
635     if ($C->{G0} eq $set || $C->{G0} eq $set0) { $G = 0 }
636     elsif ($C->{G1} eq $set || $C->{G1} eq $set0) { $G = 1 }
637     elsif ($C->{G2} eq $set || $C->{G2} eq $set0) { $G = 2 }
638     elsif ($C->{G3} eq $set || $C->{G3} eq $set0) { $G = 3 }
639     else {
640     return undef unless $set; ## charset does not have F byte
641     $G = 1 if $O{type} eq 'G96' || $O{type} eq 'G96n';
642     for ($C->{option}->{designate_to}->{$O{type}}->{$O{charset}},
643     $C->{option}->{designate_to}->{$O{type}}->{default}) {
644     if (defined $_) {
645     $G = $_; last;
646     }
647     }
648     if ($G == -1) {
649     return undef;
650     }
651     if ($O{type} eq 'G94') {
652     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
653     ."\x1B".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
654     } elsif ($O{type} eq 'G94n') {
655     if ($G == 0 && !$C->{option}->{G94n_designate_long}
656     && ($O{charset} eq '@' || $O{charset} eq 'A' || $O{charset} eq 'B')) {
657     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
658     ."\x1B\x24".$O{charset};
659     } else {
660     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
661     ."\x1B\x24".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
662     }
663     } elsif ($O{type} eq 'G96') {
664     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
665     ."\x1B".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
666     } elsif ($O{type} eq 'G96n') {
667     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
668     ."\x1B\x24".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
669     }
670     $C->{'G'.$G} = $CHARSET{$O{type}}->{$O{charset}};
671     }
672     ## -- invoke G buffer
673     my $left = $C->{option}->{Ginvoke_to_left}->[$G];
674     if ($C->{GL} eq 'G'.$G) {
675     $left = 1;
676     } elsif ($C->{GR} eq 'G'.$G) {
677     $left = 0;
678     } else {
679     if ($C->{option}->{Ginvoke_by_single_shift}->[$G]) {
680     if ($C->{C1}->{'C_SS'.$G}) {
681     $r .= _i2c ($C->{C1}->{'C_SS'.$G}, $C, type => 'C1') || return undef;
682     } elsif ($C->{C0}->{'C_SS'.$G}) {
683     $r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef;
684     } else { ## Both C0 and C1 set do not have SS2/3.
685 wakaba 1.4 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
686 wakaba 1.1 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
687     }
688     } else {
689     $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
690     $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
691     }
692     }
693     $s =~ tr/\x00-\x7F/\x80-\xFF/ unless $left;
694     $r . $s;
695     }
696     sub _back2ascii (%) {
697     my ($C, %O) = @_;
698     my $r = '';
699     if ($C->{option}->{reset}->{Gdesignation}) {
700     my $F = $C->{option}->{reset}->{Gdesignation}; # \x42
701     $r .= "\x1B\x28".$F unless $C->{G0} eq $CHARSET{G94}->{$F};
702     $C->{G0} = $CHARSET{G94}->{$F};
703     if ($O{reset_all}) {
704     $C->{G1} = $CHARSET{G94}->{"\x7E"};
705     $C->{G2} = $CHARSET{G94}->{"\x7E"};
706     $C->{G3} = $CHARSET{G94}->{"\x7E"};
707     }
708     }
709     if ($C->{option}->{reset}->{Ginvoke}) {
710     if ($C->{GL} ne 'G0') {
711     $r .= $C->{C0}->{C_LS0} || ($C->{C0} = $CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F");
712     $C->{GL} = 'G0';
713     }
714     $C->{GR} = undef if $O{reset_all};
715     }
716     $r;
717     }
718     ## __invoke (\%C, $G, $left_or_right)
719     sub __invoke (\%$$) {
720     my ($C, $G) = @_;
721     if ($_[2]) {
722     $C->{GL} = 'G'.$G;
723     return ($C->{C0}->{C_LS0}
724     || scalar ($C->{C0}=$CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F"),
725     $C->{C0}->{C_LS1}, "\x1B\x6E", "\x1B\x6F")[$G];
726     } else {
727     $C->{GR} = 'G'.$G;
728     return ("", "\x1B\x7E", "\x1B\x7D", "\x1B\x7C")[$G];
729     }
730     '';
731     }
732    
733     sub make_charset (%) {
734     ## TODO: support private charset ID such as 'X0'
735     my %set = @_;
736     my $setid = qq($set{I}$set{F}$set{revision});
737     my $settype = $set{type} || 'G94';
738     delete $set{type}, $set{I}, $set{F}, $set{revision};
739     $CHARSET{ $settype }->{ $setid } = \%set;
740     }
741    
742     1;
743     __END__
744    
745     =head1 SEE ALSO
746    
747     ISO/IEC 646:1991, "7-bit coded graphic character set for intormation interchange".
748    
749     ISO/IEC 2022:1994, "Character Code Structure and Extension Techniques".
750     (IDT with ECMA 35, JIS X 0202:1998)
751    
752     ISO/IEC 4873:1991, "8-Bit Coded Character Set Structure and Rules".
753     (IDT with ECMA 43)
754    
755     ISO/IEC 6429:1992, "Control Functions for Coded Character Sets".
756     (IDT with ECMA 48:1991, JIS X 0211:1998)
757    
758     ISO/IEC 8859, "8-Bit Single-Byte Coded Graphic Character Sets".
759    
760     Encode, perlunicode
761    
762 wakaba 1.4 =head1 TODO
763    
764     =over 4
765    
766     =item NCR (coding system other than ISO/IEC 2022) support
767    
768     =over 2
769    
770     =item ESC 02/05 02/15 03/x of X Compound Text
771    
772     =back
773    
774     =item Output of control character sets, single control functions
775    
776     =item Designation sequence of control character sets (input)
777    
778     =item Special graphic character sets such as G3 of EUC-TW
779    
780     =item SUPER SHIFT (SS) invoke function of old control character set
781    
782     =item Safe transparent of control string (ISO/IEC 6429)
783    
784     =item Output of unoutputable characters as alternative notation such as SGML-like entity
785    
786     =item C0 set invoked to CR area like ISIRI code
787    
788     Really need?
789    
790     =item special treatment of 0x20, 0x7E, 0xA0, 0xFF
791    
792     For example, GB mongolian sets use MSP (MONGOLIAN SPACE)
793     with these code positions.
794    
795     And, no less coding systems does not use (or does ban using) DEL.
796    
797     =item A lot of character sets don't have pseudo-UCS mapping.
798    
799     Most of 9m^n (n >= 3) sets, 9m^n sets with I byte, 9m^n
800     DRCSes do not have pseudo-UCS mapping area. It is
801     questionable to allocate lots of code positions to these
802     rarely-(or no-)used character sets.
803    
804     =item Even character sets that have pseudo-UCS mapping, some of them can't be outputed in ISO/IEC 2022.
805    
806     Because output of rarely-used character sets is
807     not implemented yet.
808    
809     =back
810    
811 wakaba 1.1 =head1 LICENSE
812    
813     Copyright 2002 wakaba <w@suika.fam.cx>
814    
815     This library is free software; you can redistribute it
816     and/or modify it under the same terms as Perl itself.
817    
818     =cut
819    
820 wakaba 1.4 # $Date: 2002/09/16 02:20:18 $
821 wakaba 1.1 ### ISO2022.pm ends here

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24