/[suikacvs]/perl/lib/Encode/ISO2022.pm
Suika

Contents of /perl/lib/Encode/ISO2022.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (hide annotations) (download)
Mon Sep 16 02:20:18 2002 UTC (22 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.2: +22 -13 lines
2002-09-16  Wakaba <w@suika.fam.cx>

	* ISO2022.pm:
	- (iso2022_to_internal): Invoke G1,G2,G3 by locking
	shifts of ESC Fs style.
	- (make_initial_charset): Create charset definition
	of 94^2 DRCSes.

1 wakaba 1.1
2     =head1 NAME
3    
4     Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder
5    
6     =cut
7    
8     require v5.7.3;
9     package Encode::ISO2022;
10     use strict;
11     use vars qw(%CHARSET $VERSION);
12 wakaba 1.3 $VERSION=do{my @r=(q$Revision: 1.2 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
13 wakaba 1.1 use base qw(Encode::Encoding);
14 wakaba 1.2 __PACKAGE__->Define (qw/iso-2022 iso2022 2022 cp2022/);
15 wakaba 1.1
16     ### --- Intialization
17    
18     my %_CHARS_to_RANGE = (
19     l94 => q/[\x21-\x7E]/,
20     l96 => q/[\x20-\x7F]/,
21     l128 => q/[\x00-\x7F]/,
22     l256 => q/[\x00-\xFF]/,
23     r94 => q/[\xA1-\xFE]/,
24     r96 => q/[\xA0-\xFF]/,
25     r128 => q/[\x80-\xFF]/,
26     r256 => q/[\x80-\xFF]/,
27     b94 => q/[\x21-\x7E\xA1-\xFE]/,
28     b96 => q/[\x20-\x7F\xA0-\xFF]/,
29     b128 => q/[\x00-\xFF]/,
30     b256 => q/[\x00-\xFF]/,
31     );
32    
33     ## --- Make initial charset definitions
34     &_make_initial_charsets;
35     sub _make_initial_charsets () {
36     for my $f (0x30..0x7E) {
37     my $F = pack 'C', $f;
38     for ('', '!', '"', '#') {
39     $CHARSET{G94}->{ $_.$F }->{dimension} = 1;
40     $CHARSET{G94}->{ $_.$F }->{chars} = 94;
41     $CHARSET{G94}->{ $_.$F }->{ucs} =
42     {'' => 0xE90940, '!' => 0xE944A0, '"' => 0xE98000, '#' => 0xE9BB60}->{ $_ }
43     + 94 * ($f-0x30);
44    
45     $CHARSET{G96}->{ $_.$F }->{dimension} = 1;
46     $CHARSET{G96}->{ $_.$F }->{chars} = 96;
47     $CHARSET{G96}->{ $_.$F }->{ucs} =
48     {'' => 0xE926A0, '!' => 0xE96200, '"' => 0xE99D60, '#' => 0xE9D8C0}->{ $_ }
49     + 96 * ($f-0x30);
50    
51     $CHARSET{C0}->{ $_.$F }->{dimension} = 1;
52     $CHARSET{C0}->{ $_.$F }->{chars} = 32;
53     $CHARSET{C0}->{ $_.$F }->{ucs} =
54     {'' => 0x70000000, '!' => 0x70001400,
55     '"' => 0x70002800, '#' => 0x70003C00}->{ $_ } + 32 * ($f-0x30);
56    
57     $CHARSET{C1}->{ $_.$F }->{dimension} = 1;
58     $CHARSET{C1}->{ $_.$F }->{chars} = 32;
59     $CHARSET{C1}->{ $_.$F }->{ucs} =
60     {'' => 0x70000A00, '!' => 0x70001E00,
61     '"' => 0x70003200, '#' => 0x70004600}->{ $_ } + 32 * ($f-0x30);
62    
63     $CHARSET{G94}->{ ' '.$_.$F }->{dimension} = 1; ## DRCS
64     $CHARSET{G94}->{ ' '.$_.$F }->{chars} = 94;
65     $CHARSET{G94}->{ ' '.$_.$F }->{ucs} =
66     {'' => 0x70090940, '!' => 0x700944A0,
67     '"' => 0x70098000, '#' => 0x7009BB60}->{ $_ } + 94 * ($f-0x30);
68    
69     $CHARSET{G96}->{ ' '.$_.$F }->{dimension} = 1; ## DRCS
70     $CHARSET{G96}->{ ' '.$_.$F }->{chars} = 96;
71     $CHARSET{G96}->{ ' '.$_.$F }->{ucs} =
72     {'' => 0x700926A0, '!' => 0x70096200,
73     '"' => 0x70099D60, '#' => 0x7009D8C0}->{ $_ } + 96 * ($f-0x30);
74     }
75     }
76     for my $f (0x30..0x5F, 0x7E) {
77     my $F = pack 'C', $f;
78 wakaba 1.3 for ('', '!', '"', '#', ' ') {
79 wakaba 1.1 $CHARSET{G94n}->{ $_.$F }->{dimension} = 2;
80     $CHARSET{G94n}->{ $_.$F }->{chars} = 94;
81     $CHARSET{G94n}->{ $_.$F }->{ucs} =
82     ({'' => 0xE9F6C0}->{ $_ }||0) + 94*94 * ($f-0x30);
83 wakaba 1.3 ## BUG: 94^n sets with I byte have no mapping area
84 wakaba 1.1
85     $CHARSET{G96n}->{ $_.$F }->{dimension} = 2;
86     $CHARSET{G96n}->{ $_.$F }->{chars} = 96;
87     $CHARSET{G96n}->{ $_.$F }->{ucs} =
88     ({'' => 0xF4C000}->{ $_ }||0) + 96*96 * ($f-0x30);
89     ## BUG: 94^n DRCSes with I byte have no mapping area
90     }
91     }
92     for (0x60..0x6F) {
93     my $F = pack 'C', $_;
94     ## BUG: 9x^3 sets have no mapping area
95     for ('', '!', '"', '#', ' ') {
96     $CHARSET{G94n}->{ $_.$F }->{dimension} = 3;
97     $CHARSET{G94n}->{ $_.$F }->{chars} = 94;
98    
99     $CHARSET{G96n}->{ $_.$F }->{dimension} = 3;
100     $CHARSET{G96n}->{ $_.$F }->{chars} = 96;
101     }
102     }
103     for (0x70..0x7D) {
104     my $F = pack 'C', $_;
105     ## BUG: 9x^4 sets have no mapping area
106     for ('', '!', '"', '#', ' ') {
107     $CHARSET{G94n}->{ $_.$F }->{dimension} = 4;
108     $CHARSET{G94n}->{ $_.$F }->{chars} = 94;
109    
110     $CHARSET{G96n}->{ $_.$F }->{dimension} = 4;
111     $CHARSET{G96n}->{ $_.$F }->{chars} = 96;
112     }
113     }
114     for my $f (0x40..0x4E) {
115     my $F = pack 'C', $f;
116     $CHARSET{G96n}->{ ' '.$F }->{dimension} = 2;
117     $CHARSET{G96n}->{ ' '.$F }->{chars} = 96;
118     $CHARSET{G96n}->{ ' '.$F }->{ucs} = 0xF0000 + 96*96*($f-0x40);
119     ## U+F0000-U+10F7FF (private) -> ESC 02/04 02/00 <I> (04/00-04/14) (DRCS)
120     }
121    
122     $CHARSET{G94}->{B}->{ucs} = 0x21; ## ASCII
123     $CHARSET{G96}->{A}->{ucs} = 0xA0; ## ISO 8859-1
124    
125     $CHARSET{G94n}->{'B@'}->{dimension} = 2; ## JIS X 0208-1990
126     $CHARSET{G94n}->{'B@'}->{chars} = 94;
127     $CHARSET{G94n}->{'B@'}->{ucs} = 0xE9F6C0 + 94*94*79;
128    
129     ## -- Control character sets
130     $CHARSET{C0}->{'@'}->{ucs} = 0x00; ## ISO/IEC 6429 C0
131     for ("\x40", "\x43", "\x44", "\x45", "\x46", "\x49", "\x4A", "\x4B", "\x4C") {
132     $CHARSET{C0}->{$_}->{C_LS0} = "\x0F";
133     $CHARSET{C0}->{$_}->{C_LS1} = "\x0E";
134     $CHARSET{C0}->{$_}->{r_LS0} = '\x0F';
135     $CHARSET{C0}->{$_}->{r_LS1} = '\x0E';
136     }
137     for ("\x40", "\x44", "\x45", "\x46", "\x48", "\x4C") {
138     $CHARSET{C0}->{$_}->{reset_all} = {"\x0A" => 1, "\x0B" => 1,
139     "\x0C" => 1, "\x0D" => 1};
140     }
141     $CHARSET{C0}->{"\x43"}->{reset_all} = {"\x0A" => 1};
142     $CHARSET{C0}->{"\x44"}->{C_SS2} = "\x1C";
143     $CHARSET{C0}->{"\x44"}->{r_SS2} = '\x1C';
144     for ("\x45", "\x49", "\x4A", "\x4B") {
145     $CHARSET{C0}->{$_}->{C_SS2} = "\x19";
146     $CHARSET{C0}->{$_}->{C_SS3} = "\x1D";
147     $CHARSET{C0}->{$_}->{r_SS2} = '\x19';
148     $CHARSET{C0}->{$_}->{r_SS3} = '\x1D';
149     }
150     $CHARSET{C0}->{"\x4C"}->{C_SS2} = "\x19";
151     $CHARSET{C0}->{"\x4C"}->{r_SS2} = '\x19';
152    
153     $CHARSET{C1}->{'64291991C1'}->{dimension} = 1; ## ISO/IEC 6429:1991 C1
154     $CHARSET{C1}->{'64291991C1'}->{chars} = 32;
155     $CHARSET{C1}->{'64291991C1'}->{ucs} = 0x80;
156     for ("\x43", "\x45", "\x47", '64291991C1') {
157     $CHARSET{C1}->{$_}->{C_SS2} = "\x8E";
158     $CHARSET{C1}->{$_}->{C_SS3} = "\x8F";
159     $CHARSET{C1}->{$_}->{r_SS2} = '\x8E';
160     $CHARSET{C1}->{$_}->{r_SS3} = '\x8F';
161     $CHARSET{C1}->{$_}->{r_SS2_ESC} = '\x1B\x4E';
162     $CHARSET{C1}->{$_}->{r_SS3_ESC} = '\x1B\x4F';
163     }
164     for ("\x43", '64291991C1') {
165     $CHARSET{C1}->{$_}->{r_CSI} = '\x9B';
166     $CHARSET{C1}->{$_}->{r_CSI_ESC} = '\x1B\x5B';
167     $CHARSET{C1}->{$_}->{r_DCS} = '\x90';
168     $CHARSET{C1}->{$_}->{r_ST} = '\x9C';
169     $CHARSET{C1}->{$_}->{r_OSC} = '\x9D';
170     $CHARSET{C1}->{$_}->{r_PM} = '\x9E';
171     $CHARSET{C1}->{$_}->{r_APC} = '\x9F';
172     $CHARSET{C1}->{$_}->{reset_all} = {"\x85"=>1, "\x90"=>1,
173     "\x9C"=>1, "\x9D"=>1, "\x9E"=>1, "\x9F"=>1};
174     }
175     $CHARSET{C1}->{'64291991C1'}->{r_SCI} = '\x9A';
176    
177     $CHARSET{single_control}->{Fs} ={ucs => 0x70005000, chars => 32, dimension => 1};
178     $CHARSET{single_control}->{'3F'} ={ucs => 0x70005020, chars => 80, dimension => 1};
179     $CHARSET{single_control}->{'3F!'}={ucs => 0x70005070, chars => 80, dimension => 1};
180     $CHARSET{single_control}->{'3F"'}={ucs => 0x700050C0, chars => 80, dimension => 1};
181     $CHARSET{single_control}->{'3F#'}={ucs => 0x70005110, chars => 80, dimension => 1};
182     }
183    
184    
185     ### --- Perl Encode module common functions
186    
187     sub encode ($$;$) {
188     my ($obj, $str, $chk) = @_;
189     $_[1] = '' if $chk;
190     $str = &internal_to_iso2022 ($str);
191     return $str;
192     }
193    
194     sub decode ($$;$) {
195     my ($obj, $str, $chk) = @_;
196     $_[1] = '' if $chk;
197     return &iso2022_to_internal ($str);
198     }
199    
200     ### --- Encode::ISO2022 unique functions
201    
202     ## Make a new ISO/IEC 2022-buffers object with default status
203     sub new_object {
204     my %C;
205     $C{bit} = 8;
206     $C{CL} = 'C0'; $C{CR} = 'C1'; $C{ESC_Fe} = 'C1';
207     $C{C0} = $CHARSET{C0}->{"\x40"}; ## ISO/IEC 6429:1991 C0
208     $C{C1} = $CHARSET{C1}->{'64291991C1'}; ## ISO/IEC 6429:1991 C1
209     $C{GL} = 'G0'; $C{GR} = 'G1';
210     $C{G0} = $CHARSET{G94}->{"\x42"}; ## ISO/IEC 646:1991 IRV
211     #$C{G1} = $CHARSET{G96}->{"\x41"}; ## ISO/IEC 8859-1 GR
212     $C{G1} = $CHARSET{G94}->{"\x7E"}; ## empty set
213     $C{G2} = $CHARSET{G94}->{"\x7E"}; ## empty set
214     $C{G3} = $CHARSET{G94}->{"\x7E"}; ## empty set
215     $C{option} = {
216     C1invoke_to_right => 0, ## C1 invoked to: (0: ESC Fe, 1: CR)
217     G94n_designate_long => 0, ## (1: ESC 02/04 02/08 04/00..02)
218     designate_to => { ## Designated G buffer (-1: not be outputed)
219     C0 => {
220     default => 0,
221     },
222     C1 => {
223     default => 1,
224     },
225     G94 => {
226     "\x42" => 0,
227     default => 0,
228     },
229     G96 => {
230     default => 1,
231     },
232     G94n => {
233     default => 0,
234     },
235     G96n => {
236     default => 1,
237     },
238     },
239     Ginvoke_by_single_shift => [0,0,0,0], ## Invoked by SS
240     Ginvoke_to_left => [1,1,1,1], ## Which invoked to? (1: L, 0: R)
241     private_set => { ## Private set vs Final byte
242     C0 => [],
243     C1 => [],
244     G94 => [],
245     G94n => [[],[],[],[],[]],
246     G96 => [],
247     #G96n => [], ## (not implemented)
248     U96n => [], ## mule-unicode sets
249     XC1 => {
250     '64291991C1' => undef, ## ISO/IEC 6429:1991 C1
251     },
252     },
253     reset => { ## Reset status at top of line
254     Gdesignation => "\x42", ## F of designation or 0
255     Ginvoke => 1,
256     },
257     use_revision => 1, ## Output IRR
258     };
259     \%C;
260     }
261    
262     sub iso2022_to_internal ($;\%) {
263     my ($s, $C) = @_;
264     my %_GB_to_GN = (
265     "\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3',
266     "\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3',
267     );
268     $C ||= &new_object;
269    
270     use re 'eval';
271     $s =~ s{
272     ((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}}
273     . qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ }))
274     |((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GL}}->{chars}}
275     . qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ }))
276    
277     | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)')
278     . ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ?
279     qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '')
280     . ($C->{$C->{CL}}->{r_SS2} ? qq/|$C->{$C->{CL}}->{r_SS2}/ : '') . q/)/
281     . ( $C->{$C->{CL}}->{r_LS0}
282     ||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
283     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
284     })
285     ((??{ $_CHARS_to_RANGE{'b'.$C->{$C->{GL}}->{chars}}
286     . qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ }))
287     | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)')
288     . ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ?
289     qq/|$C->{$C->{ESC_Fe}}->{r_SS3_ESC}/ : '')
290     . ($C->{$C->{CL}}->{r_SS3} ? qq/|$C->{$C->{CL}}->{r_SS3}/ : '') . q/)/
291     . ( $C->{$C->{CL}}->{r_LS0}
292     || $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
293     qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
294     })
295     ((??{ $_CHARS_to_RANGE{'b'.$C->{$C->{GL}}->{chars}}
296     . qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ }))
297    
298 wakaba 1.3 ## Locking shift
299     |( \x1B[\x6E\x6F\x7C-\x7E]
300     |(??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })
301     |(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })
302     )
303 wakaba 1.1
304     ## Control sequence
305     |(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)')
306     .($C->{$C->{ESC_Fe}}->{r_CSI_ESC} ?
307     qq/|$C->{$C->{ESC_Fe}}->{r_CSI_ESC}/: '')
308     .')'
309     })
310     ((??{ qq/[\x30-\x3F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xB0-\xBF]*/
311     .qq/[\x20-\x2F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xA0-\xAF]*/
312     }) [\x40-\x7E\xD0-\xFE])
313    
314     ## Other escape sequence
315     |(\x1B[\x20-\x2F]*[\x30-\x7E])
316    
317     ## Misc. sequence (SP, control, or broken data)
318     |([\x00-\xFF])
319     }{
320 wakaba 1.3 my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc)
321     = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
322 wakaba 1.1 $C->{_irr} = undef unless defined $esc;
323     ## GL graphic character
324     if (defined $gl) {
325     my $c = 0;
326     my $m = $C->{$C->{GL}}->{chars}==94?0x21:$C->{$C->{GL}}->{chars}==96?0x20:0;
327     for (split //, $gl) {
328     $c = $c * $C->{$C->{GL}}->{chars} + unpack ('C', $_) - $m;
329     }
330     chr ($C->{$C->{GL}}->{ucs} + $c);
331     ## Control, SP, or broken data
332     ## TODO: support control sets other than ISO/IEC 6429's
333     } elsif (defined $misc) {
334     $misc;
335     ## GR graphic character
336     } elsif ($gr) {
337     my $c = 0;
338     my $m = $C->{$C->{GR}}->{chars}==94?0xA1:$C->{$C->{GR}}->{chars}==96?0xA0:0x80;
339     for (split //, $gr) {
340     $c = $c * $C->{$C->{GR}}->{chars} + unpack ('C', $_) - $m;
341     }
342     chr ($C->{$C->{GR}}->{ucs} + $c);
343     ## Graphic character with SS2
344     } elsif ($ss2) {
345     $ss2 =~ tr/\x80-\xFF/\x00-\x7F/;
346     my $c = 0; my $m = $C->{G2}->{chars}==94?0x21:$C->{G2}->{chars}==96?0x20:0;
347     for (split //, $ss2) {
348     $c = $c * $C->{G2}->{chars} + unpack ('C', $_) - $m;
349     }
350     chr ($C->{G2}->{ucs} + $c);
351     ## Graphic character with SS3
352     } elsif ($ss3) {
353     $ss3 =~ tr/\x80-\xFF/\x00-\x7F/;
354     my $c = 0; my $m = $C->{G3}->{chars}==94?0x21:$C->{G3}->{chars}==96?0x20:0;
355     for (split //, $ss3) {
356     $c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m;
357     }
358     chr ($C->{G3}->{ucs} + $c);
359     ## Escape sequence
360     } elsif ($esc) {
361     ## IRR (revision number)
362     if ($esc =~ /\x1B\x26([\x40-\x7E])/) {
363     $C->{_irr} = $1; $esc = '';
364     } else {
365     $esc =~ s{
366     \x1B([\x28-\x2B])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^1
367     |\x1B\x24([\x28-\x2B]?)(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^n
368    
369     |\x1B([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^1
370     |\x1B\x24([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^n
371    
372     |\x1B([\x40-\x5F]) ## ESC Fe
373    
374     |\x1B\x21([\x21-\x23]?[\x30-\x7E]) ## CL = C0
375     |\x1B\x22([\x21-\x23]?[\x30-\x7E]) ## CR & ESC Fe = C1
376    
377     |\x1B([\x60-\x7E]) ## Single control functions
378     |\x1B\x23([\x21-\x23]?)([\x30-\x7E])
379    
380     |\x1B\x20([\x40-\x7E]) ## Announcer
381     }{
382     my ($g94_g,$g94_f,$g94n_g,$g94n_f,$g96_g,$g96_f,$g96n_g,$g96n_f,$Fe,
383     $CZD, $C1D, $Fs, $sI, $sF,$ACS)
384     = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15);
385     my $rev = $C->{_irr} || '';
386     if ($g94_g) {
387     $C->{ $_GB_to_GN{ $g94_g } } = $CHARSET{G94}->{ $g94_f.$rev }
388     || $CHARSET{G94}->{ $g94_f }
389     || $CHARSET{G94}->{ "\x7E" }; '';
390     } elsif (defined $g94n_f) {
391     $C->{ $_GB_to_GN{ $g94n_g } || 'G0' } = $CHARSET{G94n}->{ $g94n_f.$rev }
392     || $CHARSET{G94n}->{ $g94n_f }
393     || $CHARSET{G94n}->{ "\x7E" }; '';
394     } elsif ($g96_g) {
395     $C->{ $_GB_to_GN{ $g96_g } } = $CHARSET{G96}->{ $g96_f.$rev }
396     || $CHARSET{G96}->{ $g96_f }
397     || $CHARSET{G96}->{ "\x7E" }; '';
398     } elsif (defined $g96n_f) {
399     $C->{ $_GB_to_GN{ $g96n_g } } = $CHARSET{G96n}->{ $g96n_f.$rev }
400     || $CHARSET{G96n}->{ $g96n_f }
401     || $CHARSET{G96n}->{ "\x7E" }; '';
402     } elsif ($Fe) { ## ESC Fe => C1
403     chr ($C->{ $C->{ESC_Fe} }->{ucs} + (ord ($Fe) - 0x40));
404     } elsif ($CZD) {
405     $C->{C0} = $CHARSET{C0}->{ $CZD.$rev }
406     || $CHARSET{C0}->{ $CZD } || $CHARSET{C0}->{ "\x7E" }; '';
407     } elsif ($C1D) {
408     $C->{C1} = $CHARSET{C1}->{ $C1D.$rev }
409     || $CHARSET{C1}->{ $C1D } || $CHARSET{C1}->{ "\x7E" }; '';
410     } elsif ($Fs) {
411     if ($Fs eq "\x6E") { ## LS2
412     $C->{GL} = 'G2'; '';
413     } elsif ($Fs eq "\x6F") { ## LS3
414     $C->{GL} = 'G3'; '';
415     } elsif ($Fs eq "\x7E") { ## LS1R
416     $C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; '';
417     } elsif ($Fs eq "\x7D") { ## LS2R
418     $C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; '';
419     } elsif ($Fs eq "\x7C") { ## LS3R
420     $C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; '';
421     } else {
422     chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60));
423     }
424     } elsif ($sI) {
425     chr ($CHARSET{single_control}->{'3F'.$sI}->{ucs} + (ord ($sF) - 0x30));
426     } elsif ($ACS) { ## Announcer
427     if ($ACS eq "\x4A") { $C->{bit} = 7 }
428     elsif ($ACS eq "\x4B") { $C->{bit} = 8 }
429     '';
430     }
431     }gex;
432     $C->{_irr} = undef;
433     }
434     $esc;
435 wakaba 1.3 ## Locking shifts
436     } elsif ($ls) {
437     if ($ls eq $C->{$C->{CL}}->{LS0}) {
438     $C->{GL} = 'G0'; '';
439     } elsif ($ls eq $C->{$C->{CL}}->{LS1}) {
440     $C->{GL} = 'G1'; '';
441     } elsif ($ls =~ /\x1B([\x6E\x6F])/) {
442     $C->{GL} = {"\x6E"=>2, "\x6F"=>3}->{$1}; '';
443     } elsif ($ls =~ /\x1B([\x7C-\x7E])/) {
444     $C->{GR} = {"\x7E"=>1, "\x7D"=>2, "\x7C"=>3}->{$1}; '';
445     }
446 wakaba 1.1 ## Control sequence
447     } elsif ($csi) {
448     $csi =~ tr/\xA0-\xFF/\x20-\x7F/d;
449     $csi =~ s/$C->{$C->{CL}}->{LS0}//g if $C->{$C->{CL}}->{LS0};
450     $csi =~ s/$C->{$C->{CL}}->{LS1}//g if $C->{$C->{CL}}->{LS1};
451     "\x9B".$csi;
452     }
453     }gex;
454     $s;
455     }
456    
457     sub internal_to_iso2022 ($\%) {
458     my ($s, $C) = @_;
459     $C ||= &new_object;
460    
461     my $r = '';
462     for my $c (split //, $s) {
463     my $cc = ord $c;
464     my $t;
465     if ($cc <= 0x1F) {
466     $t = _i2c ($c, $C, type => 'C0', charset => '@');
467     } elsif ($cc == 0x20 || $cc == 0x7F) {
468     $t = _back2ascii ($C) . $c;
469     } elsif ($cc < 0x7F) {
470     $t = _i2g ($c, $C, type => 'G94', charset => 'B');
471     } elsif ($cc <= 0x9F) {
472     $t = _i2c ($c, $C, type => 'C1', charset_id => '64291991C1',
473     charset => $C->{private_set}->{XC1}->{'64291991C1'});
474     } elsif ($cc <= 0xFF) {
475     $t = _i2g (chr($cc-0x80), $C, type => 'G96', charset => 'A');
476     } elsif ($cc <= 0x24FF) {
477     my $c = $cc - 0x100;
478     my $final = $C->{private_set}->{U96n}->[0];
479     if (length $final) {
480     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
481     type => 'G96n', charset => $final);
482     }
483     } elsif ($cc <= 0x33FF) {
484     my $c = $cc - 0x2500;
485     my $final = $C->{private_set}->{U96n}->[1];
486     if (length $final) {
487     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
488     type => 'G96n', charset => $final);
489     }
490     } elsif (0xE000 <= $cc && $cc <= 0xFFFF) {
491     my $c = $cc - 0xE000;
492     my $final = $C->{private_set}->{U96n}->[2];
493     if (length $final) {
494     $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
495     type => 'G96n', charset => $final);
496     }
497    
498     } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
499     my $c = $cc - 0xE9F6C0;
500     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
501     type => 'G94n', charset => chr(($c / 8836)+0x30));
502     } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
503     my $c = $cc - 0xF49D7C;
504     $t = _i2g (chr(($c / 94)+0x21).chr(($c % 94)+0x21), $C,
505     type => 'G94n', charset => 'B', revision => '@');
506    
507     } elsif (0xF0000 <= $cc && $cc <= 0x10F7FF) {
508     my $c = $cc - 0xF0000;
509     $t = _i2g (chr((($c % 9216) / 96)+0x20).chr(($c % 96)+0x20), $C,
510     type => 'G96n', charset => "\x20".chr(($c / 9216)+0x40));
511     } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
512     my $c = $cc - 0xE90940;
513     $t = _i2g (chr(($c % 94)+0x21), $C,
514     type => 'G94', charset => chr(($c / 94)+0x30));
515     } elsif (0xE92642 <= $cc && $cc <= 0xE9269F) {
516     my $c = $cc - 0xE92642;
517     $t = _i2g (chr($c+0x21),$C,type => 'G94', charset => "\x64", revision => '@');
518     } elsif (0xE926A0 <= $cc && $cc <= 0xE9443F) {
519     my $c = $cc - 0xE926A0;
520     $t = _i2g (chr(($c % 96)+0x20), $C,
521     type => 'G96', charset => chr(($c / 96)+0x30));
522     } elsif (0xE944A0 <= $cc && $cc <= 0xE961A1) {
523     my $c = $cc - 0xE944A0;
524     $t = _i2g (chr(($c % 94)+0x21), $C,
525     type => 'G94', charset => '!'.chr(($c / 94)+0x30));
526     } elsif (0xE96200 <= $cc && $cc <= 0xE97F9F) {
527     my $c = $cc - 0xE96200;
528     $t = _i2g (chr(($c % 96)+0x20), $C,
529     type => 'G96', charset => '!'.chr(($c / 96)+0x30));
530     } elsif (0xE98000 <= $cc && $cc <= 0xE99D01) {
531     my $c = $cc - 0xE98000;
532     $t = _i2g (chr(($c % 94)+0x21), $C,
533     type => 'G94', charset => '"'.chr(($c / 94)+0x30));
534     } elsif (0xE99D60 <= $cc && $cc <= 0xE9BAFF) {
535     my $c = $cc - 0xE99D60;
536     $t = _i2g (chr(($c % 96)+0x20), $C,
537     type => 'G96', charset => '"'.chr(($c / 96)+0x30));
538     } elsif (0xE9BB60 <= $cc && $cc <= 0xE9D861) {
539     my $c = $cc - 0xE9BB60;
540     $t = _i2g (chr(($c % 94)+0x21), $C,
541     type => 'G94', charset => '#'.chr(($c / 94)+0x30));
542     } elsif (0xE9D8C0 <= $cc && $cc <= 0xE9F65F) {
543     my $c = $cc - 0xE9D8C0;
544     $t = _i2g (chr(($c % 96)+0x20), $C,
545     type => 'G96', charset => '#'.chr(($c / 96)+0x30));
546     } elsif (0x70090940 <= $cc && $cc <= 0x70092641) {
547     my $c = $cc - 0x70090940;
548     $t = _i2g (chr(($c % 94)+0x21), $C,
549     type => 'G94', charset => "\x20".chr(($c / 94)+0x30));
550     } elsif (0x700926A0 <= $cc && $cc <= 0x7009443F) {
551     my $c = $cc - 0x700926A0;
552     $t = _i2g (chr(($c % 96)+0x20), $C,
553     type => 'G96', charset => "\x20".chr(($c / 96)+0x30));
554     ## TODO: DRCS with I byte: U+700944A0-U+7009F6BF
555     } elsif (0x70400000 <= $cc && $cc <= 0x7040FFED) {
556     my $c = $cc - 0x70400000;
557     $t = _i2g (chr(($c % 94)+0x21), $C, charset_id => 'P'.int ($c / 94),
558     type => 'G94', charset => $C->{private_set}->{G94}->[ $c / 94 ]);
559     } elsif (0x70410000 <= $cc && $cc <= 0x7041FFBF) {
560     my $c = $cc - 0x70410000;
561     $t = _i2g (chr(($c % 96)+0x20), $C, charset_id => 'P'.int ($c / 96),
562     type => 'G96', charset => $C->{private_set}->{G96}->[ $c / 96 ]);
563     } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
564     my $c = $cc % 0x10000;
565     $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
566     type => 'G94n',
567 wakaba 1.2 charset_id => 'P'.int(($cc / 0x10000) - 0x7042).'_'.int($c / 8836),
568 wakaba 1.1 charset => $C->{private_set}->{G94n}->[ ($cc / 0x10000) - 0x7042 ]
569     ->[ $c / 8836 ]);
570     }
571     if (defined $t) {
572     $r .= $t;
573     } else {
574     $r .= _i2g ("\x3F", $C, type => 'G94', charset => 'B');
575     }
576     }
577     $r . _back2ascii ($C);
578     }
579    
580     ## $O{charset} eq undef means that charset is same as the current designated one.
581     sub _i2c ($%%) {
582     my ($s, $C, %O) = @_;
583     my $r = '';
584     if ($O{type} eq 'C0') {
585     if (defined $O{charset}) {
586     if ( $C->{C0} ne $CHARSET{C0}->{$O{charset}}
587     && $C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
588     for ($C->{option}->{designate_to}->{C0}->{$O{charset}},
589     $C->{option}->{designate_to}->{C0}->{default}) {
590     if (defined $_) { return undef if $_ == -1; last }
591     }
592     $r .= "\x1B\x21".$O{charset};
593     $C->{C0} = $CHARSET{C0}->{$O{charset}};
594     }
595     } elsif (defined $O{charset_id}) {
596     if ($C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
597     return undef; ## Control set is not designated nor has F byte
598     }
599     }
600     $r .= _back2ascii ($C, reset_all => $C->{C0}->{reset_all}->{$s});
601     return $r . $s;
602     } elsif ($O{type} eq 'C1') {
603     if (defined $O{charset}) {
604     if ( $C->{C1} ne $CHARSET{C1}->{$O{charset}}
605     && $C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
606     for ($C->{option}->{designate_to}->{C1}->{$O{charset}},
607     $C->{option}->{designate_to}->{C1}->{default}) {
608     if (defined $_) { return undef if $_ == -1; last }
609     }
610     $r .= "\x1B\x22".$O{charset};
611     $C->{C1} = $CHARSET{C1}->{$O{charset}};
612     }
613     } elsif (defined $O{charset_id}) {
614     if ($C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
615     return undef; ## Control set is not designated nor has F byte
616     }
617     }
618     $r .= _back2ascii ($C, reset_all => $C->{C1}->{reset_all}->{$s});
619     unless ($C->{option}->{C1invoke_to_right}) { ## ESC Fe
620     $s =~ s/([\x80-\x9F])/"\x1B" . chr (ord ($1) - 0x40)/ge;
621     }
622     return $r . $s;
623     }
624     }
625     sub _i2g ($%%) {
626     my ($s, $C, %O) = @_;
627     my $r = '';
628     my $set = $CHARSET{$O{type}}->{$O{charset}};
629     my $set0 = $CHARSET{$O{type}}->{$O{charset_id}};
630     ## -- designate character set
631     my $G = 0;
632     if ($C->{G0} eq $set || $C->{G0} eq $set0) { $G = 0 }
633     elsif ($C->{G1} eq $set || $C->{G1} eq $set0) { $G = 1 }
634     elsif ($C->{G2} eq $set || $C->{G2} eq $set0) { $G = 2 }
635     elsif ($C->{G3} eq $set || $C->{G3} eq $set0) { $G = 3 }
636     else {
637     return undef unless $set; ## charset does not have F byte
638     $G = 1 if $O{type} eq 'G96' || $O{type} eq 'G96n';
639     for ($C->{option}->{designate_to}->{$O{type}}->{$O{charset}},
640     $C->{option}->{designate_to}->{$O{type}}->{default}) {
641     if (defined $_) {
642     $G = $_; last;
643     }
644     }
645     if ($G == -1) {
646     return undef;
647     }
648     if ($O{type} eq 'G94') {
649     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
650     ."\x1B".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
651     } elsif ($O{type} eq 'G94n') {
652     if ($G == 0 && !$C->{option}->{G94n_designate_long}
653     && ($O{charset} eq '@' || $O{charset} eq 'A' || $O{charset} eq 'B')) {
654     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
655     ."\x1B\x24".$O{charset};
656     } else {
657     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
658     ."\x1B\x24".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
659     }
660     } elsif ($O{type} eq 'G96') {
661     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
662     ."\x1B".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
663     } elsif ($O{type} eq 'G96n') {
664     $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
665     ."\x1B\x24".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
666     }
667     $C->{'G'.$G} = $CHARSET{$O{type}}->{$O{charset}};
668     }
669     ## -- invoke G buffer
670     my $left = $C->{option}->{Ginvoke_to_left}->[$G];
671     if ($C->{GL} eq 'G'.$G) {
672     $left = 1;
673     } elsif ($C->{GR} eq 'G'.$G) {
674     $left = 0;
675     } else {
676     if ($C->{option}->{Ginvoke_by_single_shift}->[$G]) {
677     if ($C->{C1}->{'C_SS'.$G}) {
678     $r .= _i2c ($C->{C1}->{'C_SS'.$G}, $C, type => 'C1') || return undef;
679     } elsif ($C->{C0}->{'C_SS'.$G}) {
680     $r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef;
681     } else { ## Both C0 and C1 set do not have SS2/3.
682     $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
683     }
684     } else {
685     $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
686     $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
687     }
688     }
689     $s =~ tr/\x00-\x7F/\x80-\xFF/ unless $left;
690     $r . $s;
691     }
692     sub _back2ascii (%) {
693     my ($C, %O) = @_;
694     my $r = '';
695     if ($C->{option}->{reset}->{Gdesignation}) {
696     my $F = $C->{option}->{reset}->{Gdesignation}; # \x42
697     $r .= "\x1B\x28".$F unless $C->{G0} eq $CHARSET{G94}->{$F};
698     $C->{G0} = $CHARSET{G94}->{$F};
699     if ($O{reset_all}) {
700     $C->{G1} = $CHARSET{G94}->{"\x7E"};
701     $C->{G2} = $CHARSET{G94}->{"\x7E"};
702     $C->{G3} = $CHARSET{G94}->{"\x7E"};
703     }
704     }
705     if ($C->{option}->{reset}->{Ginvoke}) {
706     if ($C->{GL} ne 'G0') {
707     $r .= $C->{C0}->{C_LS0} || ($C->{C0} = $CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F");
708     $C->{GL} = 'G0';
709     }
710     $C->{GR} = undef if $O{reset_all};
711     }
712     $r;
713     }
714     ## __invoke (\%C, $G, $left_or_right)
715     sub __invoke (\%$$) {
716     my ($C, $G) = @_;
717     if ($_[2]) {
718     $C->{GL} = 'G'.$G;
719     return ($C->{C0}->{C_LS0}
720     || scalar ($C->{C0}=$CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F"),
721     $C->{C0}->{C_LS1}, "\x1B\x6E", "\x1B\x6F")[$G];
722     } else {
723     $C->{GR} = 'G'.$G;
724     return ("", "\x1B\x7E", "\x1B\x7D", "\x1B\x7C")[$G];
725     }
726     '';
727     }
728    
729     sub make_charset (%) {
730     ## TODO: support private charset ID such as 'X0'
731     my %set = @_;
732     my $setid = qq($set{I}$set{F}$set{revision});
733     my $settype = $set{type} || 'G94';
734     delete $set{type}, $set{I}, $set{F}, $set{revision};
735     $CHARSET{ $settype }->{ $setid } = \%set;
736     }
737    
738     1;
739     __END__
740    
741     =head1 SEE ALSO
742    
743     ISO/IEC 646:1991, "7-bit coded graphic character set for intormation interchange".
744    
745     ISO/IEC 2022:1994, "Character Code Structure and Extension Techniques".
746     (IDT with ECMA 35, JIS X 0202:1998)
747    
748     ISO/IEC 4873:1991, "8-Bit Coded Character Set Structure and Rules".
749     (IDT with ECMA 43)
750    
751     ISO/IEC 6429:1992, "Control Functions for Coded Character Sets".
752     (IDT with ECMA 48:1991, JIS X 0211:1998)
753    
754     ISO/IEC 8859, "8-Bit Single-Byte Coded Graphic Character Sets".
755    
756     Encode, perlunicode
757    
758     =head1 LICENSE
759    
760     Copyright 2002 wakaba <w@suika.fam.cx>
761    
762     This library is free software; you can redistribute it
763     and/or modify it under the same terms as Perl itself.
764    
765     =cut
766    
767 wakaba 1.3 # $Date: 2002/09/15 05:08:13 $
768 wakaba 1.1 ### ISO2022.pm ends here

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24