/[suikacvs]/perl/lib/Encode/ISO2022.pm
Suika

Contents of /perl/lib/Encode/ISO2022.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (show annotations) (download)
Mon Sep 16 06:35:16 2002 UTC (22 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.3: +57 -4 lines
2002-09-16  Wakaba <w@suika.fam.cx>

	* ISO2022.pm:
	- (iso2022_to_internal): Invoke G1,G2,G3 by locking
	shifts of ESC Fs style.
	- (make_initial_charset): Create charset definition
	of 94^2 DRCSes.
	- (undef_char): New option.
	- (pod:TODO): New section.
	* HZ.pm:
	- (__hz_encoding_name): New function.
	- (Encode::HZ): Added new alias names.
	- (Encode::HZ::HZ165): New package.
	- (pod:ENCODINGS): New section.

1
2 =head1 NAME
3
4 Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder
5
6 =cut
7
8 require v5.7.3;
9 package Encode::ISO2022;
10 use strict;
11 use vars qw(%CHARSET $VERSION);
12 $VERSION=do{my @r=(q$Revision: 1.3 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
13 use base qw(Encode::Encoding);
14 __PACKAGE__->Define (qw/iso-2022 iso2022 2022 cp2022/);
15
16 ### --- Intialization
17
18 my %_CHARS_to_RANGE = (
19 l94 => q/[\x21-\x7E]/,
20 l96 => q/[\x20-\x7F]/,
21 l128 => q/[\x00-\x7F]/,
22 l256 => q/[\x00-\xFF]/,
23 r94 => q/[\xA1-\xFE]/,
24 r96 => q/[\xA0-\xFF]/,
25 r128 => q/[\x80-\xFF]/,
26 r256 => q/[\x80-\xFF]/,
27 b94 => q/[\x21-\x7E\xA1-\xFE]/,
28 b96 => q/[\x20-\x7F\xA0-\xFF]/,
29 b128 => q/[\x00-\xFF]/,
30 b256 => q/[\x00-\xFF]/,
31 );
32
33 ## --- Make initial charset definitions
34 &_make_initial_charsets;
35 sub _make_initial_charsets () {
36 for my $f (0x30..0x7E) {
37 my $F = pack 'C', $f;
38 for ('', '!', '"', '#') {
39 $CHARSET{G94}->{ $_.$F }->{dimension} = 1;
40 $CHARSET{G94}->{ $_.$F }->{chars} = 94;
41 $CHARSET{G94}->{ $_.$F }->{ucs} =
42 {'' => 0xE90940, '!' => 0xE944A0, '"' => 0xE98000, '#' => 0xE9BB60}->{ $_ }
43 + 94 * ($f-0x30);
44
45 $CHARSET{G96}->{ $_.$F }->{dimension} = 1;
46 $CHARSET{G96}->{ $_.$F }->{chars} = 96;
47 $CHARSET{G96}->{ $_.$F }->{ucs} =
48 {'' => 0xE926A0, '!' => 0xE96200, '"' => 0xE99D60, '#' => 0xE9D8C0}->{ $_ }
49 + 96 * ($f-0x30);
50
51 $CHARSET{C0}->{ $_.$F }->{dimension} = 1;
52 $CHARSET{C0}->{ $_.$F }->{chars} = 32;
53 $CHARSET{C0}->{ $_.$F }->{ucs} =
54 {'' => 0x70000000, '!' => 0x70001400,
55 '"' => 0x70002800, '#' => 0x70003C00}->{ $_ } + 32 * ($f-0x30);
56
57 $CHARSET{C1}->{ $_.$F }->{dimension} = 1;
58 $CHARSET{C1}->{ $_.$F }->{chars} = 32;
59 $CHARSET{C1}->{ $_.$F }->{ucs} =
60 {'' => 0x70000A00, '!' => 0x70001E00,
61 '"' => 0x70003200, '#' => 0x70004600}->{ $_ } + 32 * ($f-0x30);
62
63 $CHARSET{G94}->{ ' '.$_.$F }->{dimension} = 1; ## DRCS
64 $CHARSET{G94}->{ ' '.$_.$F }->{chars} = 94;
65 $CHARSET{G94}->{ ' '.$_.$F }->{ucs} =
66 {'' => 0x70090940, '!' => 0x700944A0,
67 '"' => 0x70098000, '#' => 0x7009BB60}->{ $_ } + 94 * ($f-0x30);
68
69 $CHARSET{G96}->{ ' '.$_.$F }->{dimension} = 1; ## DRCS
70 $CHARSET{G96}->{ ' '.$_.$F }->{chars} = 96;
71 $CHARSET{G96}->{ ' '.$_.$F }->{ucs} =
72 {'' => 0x700926A0, '!' => 0x70096200,
73 '"' => 0x70099D60, '#' => 0x7009D8C0}->{ $_ } + 96 * ($f-0x30);
74 }
75 }
76 for my $f (0x30..0x5F, 0x7E) {
77 my $F = pack 'C', $f;
78 for ('', '!', '"', '#', ' ') {
79 $CHARSET{G94n}->{ $_.$F }->{dimension} = 2;
80 $CHARSET{G94n}->{ $_.$F }->{chars} = 94;
81 $CHARSET{G94n}->{ $_.$F }->{ucs} =
82 ({'' => 0xE9F6C0}->{ $_ }||0) + 94*94 * ($f-0x30);
83 ## BUG: 94^n sets with I byte have no mapping area
84
85 $CHARSET{G96n}->{ $_.$F }->{dimension} = 2;
86 $CHARSET{G96n}->{ $_.$F }->{chars} = 96;
87 $CHARSET{G96n}->{ $_.$F }->{ucs} =
88 ({'' => 0xF4C000}->{ $_ }||0) + 96*96 * ($f-0x30);
89 ## BUG: 94^n DRCSes with I byte have no mapping area
90 }
91 }
92 for (0x60..0x6F) {
93 my $F = pack 'C', $_;
94 ## BUG: 9x^3 sets have no mapping area
95 for ('', '!', '"', '#', ' ') {
96 $CHARSET{G94n}->{ $_.$F }->{dimension} = 3;
97 $CHARSET{G94n}->{ $_.$F }->{chars} = 94;
98
99 $CHARSET{G96n}->{ $_.$F }->{dimension} = 3;
100 $CHARSET{G96n}->{ $_.$F }->{chars} = 96;
101 }
102 }
103 for (0x70..0x7D) {
104 my $F = pack 'C', $_;
105 ## BUG: 9x^4 sets have no mapping area
106 for ('', '!', '"', '#', ' ') {
107 $CHARSET{G94n}->{ $_.$F }->{dimension} = 4;
108 $CHARSET{G94n}->{ $_.$F }->{chars} = 94;
109
110 $CHARSET{G96n}->{ $_.$F }->{dimension} = 4;
111 $CHARSET{G96n}->{ $_.$F }->{chars} = 96;
112 }
113 }
114 for my $f (0x40..0x4E) {
115 my $F = pack 'C', $f;
116 $CHARSET{G96n}->{ ' '.$F }->{dimension} = 2;
117 $CHARSET{G96n}->{ ' '.$F }->{chars} = 96;
118 $CHARSET{G96n}->{ ' '.$F }->{ucs} = 0xF0000 + 96*96*($f-0x40);
119 ## U+F0000-U+10F7FF (private) -> ESC 02/04 02/00 <I> (04/00-04/14) (DRCS)
120 }
121
122 $CHARSET{G94}->{B}->{ucs} = 0x21; ## ASCII
123 $CHARSET{G96}->{A}->{ucs} = 0xA0; ## ISO 8859-1
124
125 $CHARSET{G94n}->{'B@'}->{dimension} = 2; ## JIS X 0208-1990
126 $CHARSET{G94n}->{'B@'}->{chars} = 94;
127 $CHARSET{G94n}->{'B@'}->{ucs} = 0xE9F6C0 + 94*94*79;
128
129 ## -- Control character sets
130 $CHARSET{C0}->{'@'}->{ucs} = 0x00; ## ISO/IEC 6429 C0
131 for ("\x40", "\x43", "\x44", "\x45", "\x46", "\x49", "\x4A", "\x4B", "\x4C") {
132 $CHARSET{C0}->{$_}->{C_LS0} = "\x0F";
133 $CHARSET{C0}->{$_}->{C_LS1} = "\x0E";
134 $CHARSET{C0}->{$_}->{r_LS0} = '\x0F';
135 $CHARSET{C0}->{$_}->{r_LS1} = '\x0E';
136 }
137 for ("\x40", "\x44", "\x45", "\x46", "\x48", "\x4C") {
138 $CHARSET{C0}->{$_}->{reset_all} = {"\x0A" => 1, "\x0B" => 1,
139 "\x0C" => 1, "\x0D" => 1};
140 }
141 $CHARSET{C0}->{"\x43"}->{reset_all} = {"\x0A" => 1};
142 $CHARSET{C0}->{"\x44"}->{C_SS2} = "\x1C";
143 $CHARSET{C0}->{"\x44"}->{r_SS2} = '\x1C';
144 for ("\x45", "\x49", "\x4A", "\x4B") {
145 $CHARSET{C0}->{$_}->{C_SS2} = "\x19";
146 $CHARSET{C0}->{$_}->{C_SS3} = "\x1D";
147 $CHARSET{C0}->{$_}->{r_SS2} = '\x19';
148 $CHARSET{C0}->{$_}->{r_SS3} = '\x1D';
149 }
150 $CHARSET{C0}->{"\x4C"}->{C_SS2} = "\x19";
151 $CHARSET{C0}->{"\x4C"}->{r_SS2} = '\x19';
152
153 $CHARSET{C1}->{'64291991C1'}->{dimension} = 1; ## ISO/IEC 6429:1991 C1
154 $CHARSET{C1}->{'64291991C1'}->{chars} = 32;
155 $CHARSET{C1}->{'64291991C1'}->{ucs} = 0x80;
156 for ("\x43", "\x45", "\x47", '64291991C1') {
157 $CHARSET{C1}->{$_}->{C_SS2} = "\x8E";
158 $CHARSET{C1}->{$_}->{C_SS3} = "\x8F";
159 $CHARSET{C1}->{$_}->{r_SS2} = '\x8E';
160 $CHARSET{C1}->{$_}->{r_SS3} = '\x8F';
161 $CHARSET{C1}->{$_}->{r_SS2_ESC} = '\x1B\x4E';
162 $CHARSET{C1}->{$_}->{r_SS3_ESC} = '\x1B\x4F';
163 }
164 for ("\x43", '64291991C1') {
165 $CHARSET{C1}->{$_}->{r_CSI} = '\x9B';
166 $CHARSET{C1}->{$_}->{r_CSI_ESC} = '\x1B\x5B';
167 $CHARSET{C1}->{$_}->{r_DCS} = '\x90';
168 $CHARSET{C1}->{$_}->{r_ST} = '\x9C';
169 $CHARSET{C1}->{$_}->{r_OSC} = '\x9D';
170 $CHARSET{C1}->{$_}->{r_PM} = '\x9E';
171 $CHARSET{C1}->{$_}->{r_APC} = '\x9F';
172 $CHARSET{C1}->{$_}->{reset_all} = {"\x85"=>1, "\x90"=>1,
173 "\x9C"=>1, "\x9D"=>1, "\x9E"=>1, "\x9F"=>1};
174 }
175 $CHARSET{C1}->{'64291991C1'}->{r_SCI} = '\x9A';
176
177 $CHARSET{single_control}->{Fs} ={ucs => 0x70005000, chars => 32, dimension => 1};
178 $CHARSET{single_control}->{'3F'} ={ucs => 0x70005020, chars => 80, dimension => 1};
179 $CHARSET{single_control}->{'3F!'}={ucs => 0x70005070, chars => 80, dimension => 1};
180 $CHARSET{single_control}->{'3F"'}={ucs => 0x700050C0, chars => 80, dimension => 1};
181 $CHARSET{single_control}->{'3F#'}={ucs => 0x70005110, chars => 80, dimension => 1};
182 }
183
184
185 ### --- Perl Encode module common functions
186
187 sub encode ($$;$) {
188 my ($obj, $str, $chk) = @_;
189 $_[1] = '' if $chk;
190 $str = &internal_to_iso2022 ($str);
191 return $str;
192 }
193
194 sub decode ($$;$) {
195 my ($obj, $str, $chk) = @_;
196 $_[1] = '' if $chk;
197 return &iso2022_to_internal ($str);
198 }
199
200 ### --- Encode::ISO2022 unique functions
201
202 ## Make a new ISO/IEC 2022-buffers object with default status
203 sub new_object {
204 my %C;
205 $C{bit} = 8;
206 $C{CL} = 'C0'; $C{CR} = 'C1'; $C{ESC_Fe} = 'C1';
207 $C{C0} = $CHARSET{C0}->{"\x40"}; ## ISO/IEC 6429:1991 C0
208 $C{C1} = $CHARSET{C1}->{'64291991C1'}; ## ISO/IEC 6429:1991 C1
209 $C{GL} = 'G0'; $C{GR} = 'G1';
210 $C{G0} = $CHARSET{G94}->{"\x42"}; ## ISO/IEC 646:1991 IRV
211 #$C{G1} = $CHARSET{G96}->{"\x41"}; ## ISO/IEC 8859-1 GR
212 $C{G1} = $CHARSET{G94}->{"\x7E"}; ## empty set
213 $C{G2} = $CHARSET{G94}->{"\x7E"}; ## empty set
214 $C{G3} = $CHARSET{G94}->{"\x7E"}; ## empty set
215 $C{option} = {
216 C1invoke_to_right => 0, ## C1 invoked to: (0: ESC Fe, 1: CR)
217 G94n_designate_long => 0, ## (1: ESC 02/04 02/08 04/00..02)
218 designate_to => { ## Designated G buffer (-1: not be outputed)
219 C0 => {
220 default => 0,
221 },
222 C1 => {
223 default => 1,
224 },
225 G94 => {
226 "\x42" => 0,
227 default => 0,
228 },
229 G96 => {
230 default => 1,
231 },
232 G94n => {
233 default => 0,
234 },
235 G96n => {
236 default => 1,
237 },
238 },
239 Ginvoke_by_single_shift => [0,0,0,0], ## Invoked by SS
240 Ginvoke_to_left => [1,1,1,1], ## Which invoked to? (1: L, 0: R)
241 private_set => { ## Private set vs Final byte
242 C0 => [],
243 C1 => [],
244 G94 => [],
245 G94n => [[],[],[],[],[]],
246 G96 => [],
247 #G96n => [], ## (not implemented)
248 U96n => [], ## mule-unicode sets
249 XC1 => {
250 '64291991C1' => undef, ## ISO/IEC 6429:1991 C1
251 },
252 },
253 reset => { ## Reset status at top of line
254 Gdesignation => "\x42", ## F of designation or 0
255 Ginvoke => 1,
256 },
257 undef_char => ["\x3F", {type => 'G94', charset => 'B'}],
258 use_revision => 1, ## Output IRR
259 };
260 \%C;
261 }
262
263 sub iso2022_to_internal ($;\%) {
264 my ($s, $C) = @_;
265 my %_GB_to_GN = (
266 "\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3',
267 "\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3',
268 );
269 $C ||= &new_object;
270
271 use re 'eval';
272 $s =~ s{
273 ((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}}
274 . qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ }))
275 |((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GL}}->{chars}}
276 . qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ }))
277
278 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)')
279 . ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ?
280 qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '')
281 . ($C->{$C->{CL}}->{r_SS2} ? qq/|$C->{$C->{CL}}->{r_SS2}/ : '') . q/)/
282 . ( $C->{$C->{CL}}->{r_LS0}
283 ||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
284 qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
285 })
286 ((??{ $_CHARS_to_RANGE{'b'.$C->{$C->{GL}}->{chars}}
287 . qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ }))
288 | (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)')
289 . ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ?
290 qq/|$C->{$C->{ESC_Fe}}->{r_SS3_ESC}/ : '')
291 . ($C->{$C->{CL}}->{r_SS3} ? qq/|$C->{$C->{CL}}->{r_SS3}/ : '') . q/)/
292 . ( $C->{$C->{CL}}->{r_LS0}
293 || $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9
294 qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'')
295 })
296 ((??{ $_CHARS_to_RANGE{'b'.$C->{$C->{GL}}->{chars}}
297 . qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ }))
298
299 ## Locking shift
300 |( \x1B[\x6E\x6F\x7C-\x7E]
301 |(??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })
302 |(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })
303 )
304
305 ## Control sequence
306 |(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)')
307 .($C->{$C->{ESC_Fe}}->{r_CSI_ESC} ?
308 qq/|$C->{$C->{ESC_Fe}}->{r_CSI_ESC}/: '')
309 .')'
310 })
311 ((??{ qq/[\x30-\x3F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xB0-\xBF]*/
312 .qq/[\x20-\x2F$C->{$C->{CL}}->{LS0}$C->{$C->{CL}}->{LS1}\xA0-\xAF]*/
313 }) [\x40-\x7E\xD0-\xFE])
314
315 ## Other escape sequence
316 |(\x1B[\x20-\x2F]*[\x30-\x7E])
317
318 ## Misc. sequence (SP, control, or broken data)
319 |([\x00-\xFF])
320 }{
321 my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc)
322 = ($1,$2,$3,$4,$5,$6,$7,$8,$9);
323 $C->{_irr} = undef unless defined $esc;
324 ## GL graphic character
325 if (defined $gl) {
326 my $c = 0;
327 my $m = $C->{$C->{GL}}->{chars}==94?0x21:$C->{$C->{GL}}->{chars}==96?0x20:0;
328 for (split //, $gl) {
329 $c = $c * $C->{$C->{GL}}->{chars} + unpack ('C', $_) - $m;
330 }
331 chr ($C->{$C->{GL}}->{ucs} + $c);
332 ## Control, SP, or broken data
333 ## TODO: support control sets other than ISO/IEC 6429's
334 } elsif (defined $misc) {
335 $misc;
336 ## GR graphic character
337 } elsif ($gr) {
338 my $c = 0;
339 my $m = $C->{$C->{GR}}->{chars}==94?0xA1:$C->{$C->{GR}}->{chars}==96?0xA0:0x80;
340 for (split //, $gr) {
341 $c = $c * $C->{$C->{GR}}->{chars} + unpack ('C', $_) - $m;
342 }
343 chr ($C->{$C->{GR}}->{ucs} + $c);
344 ## Graphic character with SS2
345 } elsif ($ss2) {
346 $ss2 =~ tr/\x80-\xFF/\x00-\x7F/;
347 my $c = 0; my $m = $C->{G2}->{chars}==94?0x21:$C->{G2}->{chars}==96?0x20:0;
348 for (split //, $ss2) {
349 $c = $c * $C->{G2}->{chars} + unpack ('C', $_) - $m;
350 }
351 chr ($C->{G2}->{ucs} + $c);
352 ## Graphic character with SS3
353 } elsif ($ss3) {
354 $ss3 =~ tr/\x80-\xFF/\x00-\x7F/;
355 my $c = 0; my $m = $C->{G3}->{chars}==94?0x21:$C->{G3}->{chars}==96?0x20:0;
356 for (split //, $ss3) {
357 $c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m;
358 }
359 chr ($C->{G3}->{ucs} + $c);
360 ## Escape sequence
361 } elsif ($esc) {
362 ## IRR (revision number)
363 if ($esc =~ /\x1B\x26([\x40-\x7E])/) {
364 $C->{_irr} = $1; $esc = '';
365 } else {
366 $esc =~ s{
367 \x1B([\x28-\x2B])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^1
368 |\x1B\x24([\x28-\x2B]?)(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 94^n
369
370 |\x1B([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^1
371 |\x1B\x24([\x2C-\x2F])(\x20?[\x21-\x23]?[\x30-\x7E]) ## Gx = 96^n
372
373 |\x1B([\x40-\x5F]) ## ESC Fe
374
375 |\x1B\x21([\x21-\x23]?[\x30-\x7E]) ## CL = C0
376 |\x1B\x22([\x21-\x23]?[\x30-\x7E]) ## CR & ESC Fe = C1
377
378 |\x1B([\x60-\x7E]) ## Single control functions
379 |\x1B\x23([\x21-\x23]?)([\x30-\x7E])
380
381 |\x1B\x20([\x40-\x7E]) ## Announcer
382 }{
383 my ($g94_g,$g94_f,$g94n_g,$g94n_f,$g96_g,$g96_f,$g96n_g,$g96n_f,$Fe,
384 $CZD, $C1D, $Fs, $sI, $sF,$ACS)
385 = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15);
386 my $rev = $C->{_irr} || '';
387 if ($g94_g) {
388 $C->{ $_GB_to_GN{ $g94_g } } = $CHARSET{G94}->{ $g94_f.$rev }
389 || $CHARSET{G94}->{ $g94_f }
390 || $CHARSET{G94}->{ "\x7E" }; '';
391 } elsif (defined $g94n_f) {
392 $C->{ $_GB_to_GN{ $g94n_g } || 'G0' } = $CHARSET{G94n}->{ $g94n_f.$rev }
393 || $CHARSET{G94n}->{ $g94n_f }
394 || $CHARSET{G94n}->{ "\x7E" }; '';
395 } elsif ($g96_g) {
396 $C->{ $_GB_to_GN{ $g96_g } } = $CHARSET{G96}->{ $g96_f.$rev }
397 || $CHARSET{G96}->{ $g96_f }
398 || $CHARSET{G96}->{ "\x7E" }; '';
399 } elsif (defined $g96n_f) {
400 $C->{ $_GB_to_GN{ $g96n_g } } = $CHARSET{G96n}->{ $g96n_f.$rev }
401 || $CHARSET{G96n}->{ $g96n_f }
402 || $CHARSET{G96n}->{ "\x7E" }; '';
403 } elsif ($Fe) { ## ESC Fe => C1
404 chr ($C->{ $C->{ESC_Fe} }->{ucs} + (ord ($Fe) - 0x40));
405 } elsif ($CZD) {
406 $C->{C0} = $CHARSET{C0}->{ $CZD.$rev }
407 || $CHARSET{C0}->{ $CZD } || $CHARSET{C0}->{ "\x7E" }; '';
408 } elsif ($C1D) {
409 $C->{C1} = $CHARSET{C1}->{ $C1D.$rev }
410 || $CHARSET{C1}->{ $C1D } || $CHARSET{C1}->{ "\x7E" }; '';
411 } elsif ($Fs) {
412 if ($Fs eq "\x6E") { ## LS2
413 $C->{GL} = 'G2'; '';
414 } elsif ($Fs eq "\x6F") { ## LS3
415 $C->{GL} = 'G3'; '';
416 } elsif ($Fs eq "\x7E") { ## LS1R
417 $C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; '';
418 } elsif ($Fs eq "\x7D") { ## LS2R
419 $C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; '';
420 } elsif ($Fs eq "\x7C") { ## LS3R
421 $C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; '';
422 } else {
423 chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60));
424 }
425 } elsif ($sI) {
426 chr ($CHARSET{single_control}->{'3F'.$sI}->{ucs} + (ord ($sF) - 0x30));
427 } elsif ($ACS) { ## Announcer
428 if ($ACS eq "\x4A") { $C->{bit} = 7 }
429 elsif ($ACS eq "\x4B") { $C->{bit} = 8 }
430 '';
431 }
432 }gex;
433 $C->{_irr} = undef;
434 }
435 $esc;
436 ## Locking shifts
437 } elsif ($ls) {
438 if ($ls eq $C->{$C->{CL}}->{LS0}) {
439 $C->{GL} = 'G0'; '';
440 } elsif ($ls eq $C->{$C->{CL}}->{LS1}) {
441 $C->{GL} = 'G1'; '';
442 } elsif ($ls =~ /\x1B([\x6E\x6F])/) {
443 $C->{GL} = {"\x6E"=>2, "\x6F"=>3}->{$1}; '';
444 } elsif ($ls =~ /\x1B([\x7C-\x7E])/) {
445 $C->{GR} = {"\x7E"=>1, "\x7D"=>2, "\x7C"=>3}->{$1}; '';
446 }
447 ## Control sequence
448 } elsif ($csi) {
449 $csi =~ tr/\xA0-\xFF/\x20-\x7F/d;
450 $csi =~ s/$C->{$C->{CL}}->{LS0}//g if $C->{$C->{CL}}->{LS0};
451 $csi =~ s/$C->{$C->{CL}}->{LS1}//g if $C->{$C->{CL}}->{LS1};
452 "\x9B".$csi;
453 }
454 }gex;
455 $s;
456 }
457
458 sub internal_to_iso2022 ($\%) {
459 my ($s, $C) = @_;
460 $C ||= &new_object;
461
462 my $r = '';
463 for my $c (split //, $s) {
464 my $cc = ord $c;
465 my $t;
466 if ($cc <= 0x1F) {
467 $t = _i2c ($c, $C, type => 'C0', charset => '@');
468 } elsif ($cc == 0x20 || $cc == 0x7F) {
469 $t = _back2ascii ($C) . $c;
470 } elsif ($cc < 0x7F) {
471 $t = _i2g ($c, $C, type => 'G94', charset => 'B');
472 } elsif ($cc <= 0x9F) {
473 $t = _i2c ($c, $C, type => 'C1', charset_id => '64291991C1',
474 charset => $C->{private_set}->{XC1}->{'64291991C1'});
475 } elsif ($cc <= 0xFF) {
476 $t = _i2g (chr($cc-0x80), $C, type => 'G96', charset => 'A');
477 } elsif ($cc <= 0x24FF) {
478 my $c = $cc - 0x100;
479 my $final = $C->{private_set}->{U96n}->[0];
480 if (length $final) {
481 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
482 type => 'G96n', charset => $final);
483 }
484 } elsif ($cc <= 0x33FF) {
485 my $c = $cc - 0x2500;
486 my $final = $C->{private_set}->{U96n}->[1];
487 if (length $final) {
488 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
489 type => 'G96n', charset => $final);
490 }
491 } elsif (0xE000 <= $cc && $cc <= 0xFFFF) {
492 my $c = $cc - 0xE000;
493 my $final = $C->{private_set}->{U96n}->[2];
494 if (length $final) {
495 $t = _i2g (chr(($c / 96)+0x20).chr(($c % 96)+0x20), $C,
496 type => 'G96n', charset => $final);
497 }
498
499 } elsif (0xE9F6C0 <= $cc && $cc <= 0xF06F80) {
500 my $c = $cc - 0xE9F6C0;
501 $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
502 type => 'G94n', charset => chr(($c / 8836)+0x30));
503 } elsif (0xF49D7C <= $cc && $cc <= 0xF4BFFF) {
504 my $c = $cc - 0xF49D7C;
505 $t = _i2g (chr(($c / 94)+0x21).chr(($c % 94)+0x21), $C,
506 type => 'G94n', charset => 'B', revision => '@');
507
508 } elsif (0xF0000 <= $cc && $cc <= 0x10F7FF) {
509 my $c = $cc - 0xF0000;
510 $t = _i2g (chr((($c % 9216) / 96)+0x20).chr(($c % 96)+0x20), $C,
511 type => 'G96n', charset => "\x20".chr(($c / 9216)+0x40));
512 } elsif (0xE90940 <= $cc && $cc <= 0xE92641) {
513 my $c = $cc - 0xE90940;
514 $t = _i2g (chr(($c % 94)+0x21), $C,
515 type => 'G94', charset => chr(($c / 94)+0x30));
516 } elsif (0xE92642 <= $cc && $cc <= 0xE9269F) {
517 my $c = $cc - 0xE92642;
518 $t = _i2g (chr($c+0x21),$C,type => 'G94', charset => "\x64", revision => '@');
519 } elsif (0xE926A0 <= $cc && $cc <= 0xE9443F) {
520 my $c = $cc - 0xE926A0;
521 $t = _i2g (chr(($c % 96)+0x20), $C,
522 type => 'G96', charset => chr(($c / 96)+0x30));
523 } elsif (0xE944A0 <= $cc && $cc <= 0xE961A1) {
524 my $c = $cc - 0xE944A0;
525 $t = _i2g (chr(($c % 94)+0x21), $C,
526 type => 'G94', charset => '!'.chr(($c / 94)+0x30));
527 } elsif (0xE96200 <= $cc && $cc <= 0xE97F9F) {
528 my $c = $cc - 0xE96200;
529 $t = _i2g (chr(($c % 96)+0x20), $C,
530 type => 'G96', charset => '!'.chr(($c / 96)+0x30));
531 } elsif (0xE98000 <= $cc && $cc <= 0xE99D01) {
532 my $c = $cc - 0xE98000;
533 $t = _i2g (chr(($c % 94)+0x21), $C,
534 type => 'G94', charset => '"'.chr(($c / 94)+0x30));
535 } elsif (0xE99D60 <= $cc && $cc <= 0xE9BAFF) {
536 my $c = $cc - 0xE99D60;
537 $t = _i2g (chr(($c % 96)+0x20), $C,
538 type => 'G96', charset => '"'.chr(($c / 96)+0x30));
539 } elsif (0xE9BB60 <= $cc && $cc <= 0xE9D861) {
540 my $c = $cc - 0xE9BB60;
541 $t = _i2g (chr(($c % 94)+0x21), $C,
542 type => 'G94', charset => '#'.chr(($c / 94)+0x30));
543 } elsif (0xE9D8C0 <= $cc && $cc <= 0xE9F65F) {
544 my $c = $cc - 0xE9D8C0;
545 $t = _i2g (chr(($c % 96)+0x20), $C,
546 type => 'G96', charset => '#'.chr(($c / 96)+0x30));
547 } elsif (0x70090940 <= $cc && $cc <= 0x70092641) {
548 my $c = $cc - 0x70090940;
549 $t = _i2g (chr(($c % 94)+0x21), $C,
550 type => 'G94', charset => "\x20".chr(($c / 94)+0x30));
551 } elsif (0x700926A0 <= $cc && $cc <= 0x7009443F) {
552 my $c = $cc - 0x700926A0;
553 $t = _i2g (chr(($c % 96)+0x20), $C,
554 type => 'G96', charset => "\x20".chr(($c / 96)+0x30));
555 ## TODO: DRCS with I byte: U+700944A0-U+7009F6BF
556 } elsif (0x70400000 <= $cc && $cc <= 0x7040FFED) {
557 my $c = $cc - 0x70400000;
558 $t = _i2g (chr(($c % 94)+0x21), $C, charset_id => 'P'.int ($c / 94),
559 type => 'G94', charset => $C->{private_set}->{G94}->[ $c / 94 ]);
560 } elsif (0x70410000 <= $cc && $cc <= 0x7041FFBF) {
561 my $c = $cc - 0x70410000;
562 $t = _i2g (chr(($c % 96)+0x20), $C, charset_id => 'P'.int ($c / 96),
563 type => 'G96', charset => $C->{private_set}->{G96}->[ $c / 96 ]);
564 } elsif (0x70420000 <= $cc && $cc <= 0x7046F19B) {
565 my $c = $cc % 0x10000;
566 $t = _i2g (chr((($c % 8836) / 94)+0x21).chr(($c % 94)+0x21), $C,
567 type => 'G94n',
568 charset_id => 'P'.int(($cc / 0x10000) - 0x7042).'_'.int($c / 8836),
569 charset => $C->{private_set}->{G94n}->[ ($cc / 0x10000) - 0x7042 ]
570 ->[ $c / 8836 ]);
571 }
572 if (defined $t) {
573 $r .= $t;
574 } else {
575 $r .= _i2g ($C->{option}->{undef_char}->[0], $C,
576 %{ $C->{option}->{undef_char}->[1] });
577 }
578 }
579 $r . _back2ascii ($C);
580 }
581
582 ## $O{charset} eq undef means that charset is same as the current designated one.
583 sub _i2c ($%%) {
584 my ($s, $C, %O) = @_;
585 my $r = '';
586 if ($O{type} eq 'C0') {
587 if (defined $O{charset}) {
588 if ( $C->{C0} ne $CHARSET{C0}->{$O{charset}}
589 && $C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
590 for ($C->{option}->{designate_to}->{C0}->{$O{charset}},
591 $C->{option}->{designate_to}->{C0}->{default}) {
592 if (defined $_) { return undef if $_ == -1; last }
593 }
594 $r .= "\x1B\x21".$O{charset};
595 $C->{C0} = $CHARSET{C0}->{$O{charset}};
596 }
597 } elsif (defined $O{charset_id}) {
598 if ($C->{C0} ne $CHARSET{C0}->{$O{charset_id}}) {
599 return undef; ## Control set is not designated nor has F byte
600 }
601 }
602 $r .= _back2ascii ($C, reset_all => $C->{C0}->{reset_all}->{$s});
603 return $r . $s;
604 } elsif ($O{type} eq 'C1') {
605 if (defined $O{charset}) {
606 if ( $C->{C1} ne $CHARSET{C1}->{$O{charset}}
607 && $C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
608 for ($C->{option}->{designate_to}->{C1}->{$O{charset}},
609 $C->{option}->{designate_to}->{C1}->{default}) {
610 if (defined $_) { return undef if $_ == -1; last }
611 }
612 $r .= "\x1B\x22".$O{charset};
613 $C->{C1} = $CHARSET{C1}->{$O{charset}};
614 }
615 } elsif (defined $O{charset_id}) {
616 if ($C->{C1} ne $CHARSET{C1}->{$O{charset_id}}) {
617 return undef; ## Control set is not designated nor has F byte
618 }
619 }
620 $r .= _back2ascii ($C, reset_all => $C->{C1}->{reset_all}->{$s});
621 unless ($C->{option}->{C1invoke_to_right}) { ## ESC Fe
622 $s =~ s/([\x80-\x9F])/"\x1B" . chr (ord ($1) - 0x40)/ge;
623 }
624 return $r . $s;
625 }
626 }
627 sub _i2g ($%%) {
628 my ($s, $C, %O) = @_;
629 my $r = '';
630 my $set = $CHARSET{$O{type}}->{$O{charset}.
631 ($O{revision}&&$C->{option}->{use_revision}?$O{revision}:'')};
632 my $set0 = $CHARSET{$O{type}}->{$O{charset_id}};
633 ## -- designate character set
634 my $G = 0;
635 if ($C->{G0} eq $set || $C->{G0} eq $set0) { $G = 0 }
636 elsif ($C->{G1} eq $set || $C->{G1} eq $set0) { $G = 1 }
637 elsif ($C->{G2} eq $set || $C->{G2} eq $set0) { $G = 2 }
638 elsif ($C->{G3} eq $set || $C->{G3} eq $set0) { $G = 3 }
639 else {
640 return undef unless $set; ## charset does not have F byte
641 $G = 1 if $O{type} eq 'G96' || $O{type} eq 'G96n';
642 for ($C->{option}->{designate_to}->{$O{type}}->{$O{charset}},
643 $C->{option}->{designate_to}->{$O{type}}->{default}) {
644 if (defined $_) {
645 $G = $_; last;
646 }
647 }
648 if ($G == -1) {
649 return undef;
650 }
651 if ($O{type} eq 'G94') {
652 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
653 ."\x1B".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
654 } elsif ($O{type} eq 'G94n') {
655 if ($G == 0 && !$C->{option}->{G94n_designate_long}
656 && ($O{charset} eq '@' || $O{charset} eq 'A' || $O{charset} eq 'B')) {
657 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
658 ."\x1B\x24".$O{charset};
659 } else {
660 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
661 ."\x1B\x24".("\x28","\x29","\x2A","\x2B")[$G].$O{charset};
662 }
663 } elsif ($O{type} eq 'G96') {
664 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
665 ."\x1B".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
666 } elsif ($O{type} eq 'G96n') {
667 $r .= ($O{revision}&&$C->{option}->{use_revision}?"\x1B\x26".$O{revision}:'')
668 ."\x1B\x24".("\x2C","\x2D","\x2E","\x2F")[$G].$O{charset};
669 }
670 $C->{'G'.$G} = $CHARSET{$O{type}}->{$O{charset}};
671 }
672 ## -- invoke G buffer
673 my $left = $C->{option}->{Ginvoke_to_left}->[$G];
674 if ($C->{GL} eq 'G'.$G) {
675 $left = 1;
676 } elsif ($C->{GR} eq 'G'.$G) {
677 $left = 0;
678 } else {
679 if ($C->{option}->{Ginvoke_by_single_shift}->[$G]) {
680 if ($C->{C1}->{'C_SS'.$G}) {
681 $r .= _i2c ($C->{C1}->{'C_SS'.$G}, $C, type => 'C1') || return undef;
682 } elsif ($C->{C0}->{'C_SS'.$G}) {
683 $r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef;
684 } else { ## Both C0 and C1 set do not have SS2/3.
685 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
686 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
687 }
688 } else {
689 $left = 0 if $G == 1 && !$C->{C0}->{C_LS1};
690 $r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G;
691 }
692 }
693 $s =~ tr/\x00-\x7F/\x80-\xFF/ unless $left;
694 $r . $s;
695 }
696 sub _back2ascii (%) {
697 my ($C, %O) = @_;
698 my $r = '';
699 if ($C->{option}->{reset}->{Gdesignation}) {
700 my $F = $C->{option}->{reset}->{Gdesignation}; # \x42
701 $r .= "\x1B\x28".$F unless $C->{G0} eq $CHARSET{G94}->{$F};
702 $C->{G0} = $CHARSET{G94}->{$F};
703 if ($O{reset_all}) {
704 $C->{G1} = $CHARSET{G94}->{"\x7E"};
705 $C->{G2} = $CHARSET{G94}->{"\x7E"};
706 $C->{G3} = $CHARSET{G94}->{"\x7E"};
707 }
708 }
709 if ($C->{option}->{reset}->{Ginvoke}) {
710 if ($C->{GL} ne 'G0') {
711 $r .= $C->{C0}->{C_LS0} || ($C->{C0} = $CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F");
712 $C->{GL} = 'G0';
713 }
714 $C->{GR} = undef if $O{reset_all};
715 }
716 $r;
717 }
718 ## __invoke (\%C, $G, $left_or_right)
719 sub __invoke (\%$$) {
720 my ($C, $G) = @_;
721 if ($_[2]) {
722 $C->{GL} = 'G'.$G;
723 return ($C->{C0}->{C_LS0}
724 || scalar ($C->{C0}=$CHARSET{C0}->{'@'},"\x1B\x21\x40\x0F"),
725 $C->{C0}->{C_LS1}, "\x1B\x6E", "\x1B\x6F")[$G];
726 } else {
727 $C->{GR} = 'G'.$G;
728 return ("", "\x1B\x7E", "\x1B\x7D", "\x1B\x7C")[$G];
729 }
730 '';
731 }
732
733 sub make_charset (%) {
734 ## TODO: support private charset ID such as 'X0'
735 my %set = @_;
736 my $setid = qq($set{I}$set{F}$set{revision});
737 my $settype = $set{type} || 'G94';
738 delete $set{type}, $set{I}, $set{F}, $set{revision};
739 $CHARSET{ $settype }->{ $setid } = \%set;
740 }
741
742 1;
743 __END__
744
745 =head1 SEE ALSO
746
747 ISO/IEC 646:1991, "7-bit coded graphic character set for intormation interchange".
748
749 ISO/IEC 2022:1994, "Character Code Structure and Extension Techniques".
750 (IDT with ECMA 35, JIS X 0202:1998)
751
752 ISO/IEC 4873:1991, "8-Bit Coded Character Set Structure and Rules".
753 (IDT with ECMA 43)
754
755 ISO/IEC 6429:1992, "Control Functions for Coded Character Sets".
756 (IDT with ECMA 48:1991, JIS X 0211:1998)
757
758 ISO/IEC 8859, "8-Bit Single-Byte Coded Graphic Character Sets".
759
760 Encode, perlunicode
761
762 =head1 TODO
763
764 =over 4
765
766 =item NCR (coding system other than ISO/IEC 2022) support
767
768 =over 2
769
770 =item ESC 02/05 02/15 03/x of X Compound Text
771
772 =back
773
774 =item Output of control character sets, single control functions
775
776 =item Designation sequence of control character sets (input)
777
778 =item Special graphic character sets such as G3 of EUC-TW
779
780 =item SUPER SHIFT (SS) invoke function of old control character set
781
782 =item Safe transparent of control string (ISO/IEC 6429)
783
784 =item Output of unoutputable characters as alternative notation such as SGML-like entity
785
786 =item C0 set invoked to CR area like ISIRI code
787
788 Really need?
789
790 =item special treatment of 0x20, 0x7E, 0xA0, 0xFF
791
792 For example, GB mongolian sets use MSP (MONGOLIAN SPACE)
793 with these code positions.
794
795 And, no less coding systems does not use (or does ban using) DEL.
796
797 =item A lot of character sets don't have pseudo-UCS mapping.
798
799 Most of 9m^n (n >= 3) sets, 9m^n sets with I byte, 9m^n
800 DRCSes do not have pseudo-UCS mapping area. It is
801 questionable to allocate lots of code positions to these
802 rarely-(or no-)used character sets.
803
804 =item Even character sets that have pseudo-UCS mapping, some of them can't be outputed in ISO/IEC 2022.
805
806 Because output of rarely-used character sets is
807 not implemented yet.
808
809 =back
810
811 =head1 LICENSE
812
813 Copyright 2002 wakaba <w@suika.fam.cx>
814
815 This library is free software; you can redistribute it
816 and/or modify it under the same terms as Perl itself.
817
818 =cut
819
820 # $Date: 2002/09/16 02:20:18 $
821 ### ISO2022.pm ends here

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24