3 |
|
|
4 |
Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder |
Encode::ISO2022 --- ISO/IEC 2022 encoder and decoder |
5 |
|
|
6 |
|
=head1 ENCODINGS |
7 |
|
|
8 |
|
=over 4 |
9 |
|
|
10 |
|
=item iso2022 |
11 |
|
|
12 |
|
ISO/IEC 2022:1994. Default status is: |
13 |
|
|
14 |
|
=over 2 |
15 |
|
|
16 |
|
=item CL = C0 = ISO/IEC 6429:1991 C0 set |
17 |
|
|
18 |
|
=item CR = C1 = ISO/IEC 6429:1991 C1 set |
19 |
|
|
20 |
|
=item GL = G0 = ISO/IEC 646:1991 IRV GL(G0) set |
21 |
|
|
22 |
|
=item GR = G1 = empty set |
23 |
|
|
24 |
|
=item G2 = empty set |
25 |
|
|
26 |
|
=item G3 = empty set |
27 |
|
|
28 |
|
=back |
29 |
|
|
30 |
|
(Alias: iso/iec2022, iso-2022, 2022, cp2022) |
31 |
|
|
32 |
|
=back |
33 |
|
|
34 |
|
Note that ISO/IEC 2022 based encodings are found in |
35 |
|
Encode::ISO2022::* modules. This module, Encode::ISO2022 |
36 |
|
only provides a general ISO/IEC 2022 encoder/decoder. |
37 |
|
|
38 |
=cut |
=cut |
39 |
|
|
40 |
require v5.7.3; |
require v5.7.3; |
41 |
package Encode::ISO2022; |
package Encode::ISO2022; |
42 |
use strict; |
use strict; |
43 |
use vars qw(%CHARSET $VERSION); |
use vars qw(%CHARSET %CODING_SYSTEM $VERSION); |
44 |
$VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
$VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
45 |
use base qw(Encode::Encoding); |
use base qw(Encode::Encoding); |
46 |
__PACKAGE__->Define (qw/iso-2022 iso2022 2022 cp2022/); |
__PACKAGE__->Define (qw!iso-2022 iso/iec2022 iso2022 2022 cp2022!); |
47 |
|
require Encode::Charset; |
48 |
|
*CHARSET = \%Encode::Charset::CHARSET; |
49 |
|
*CODING_SYSTEM = \%Encode::Charset::CODING_SYSTEM; |
50 |
|
|
51 |
### --- Intialization |
### --- Intialization |
52 |
|
|
65 |
b256 => q/[\x00-\xFF]/, |
b256 => q/[\x00-\xFF]/, |
66 |
); |
); |
67 |
|
|
|
## --- Make initial charset definitions |
|
|
&_make_initial_charsets; |
|
|
sub _make_initial_charsets () { |
|
|
for my $f (0x30..0x7E) { |
|
|
my $F = pack 'C', $f; |
|
|
for ('', '!', '"', '#') { |
|
|
$CHARSET{G94}->{ $_.$F }->{dimension} = 1; |
|
|
$CHARSET{G94}->{ $_.$F }->{chars} = 94; |
|
|
$CHARSET{G94}->{ $_.$F }->{ucs} = |
|
|
{'' => 0xE90940, '!' => 0xE944A0, '"' => 0xE98000, '#' => 0xE9BB60}->{ $_ } |
|
|
+ 94 * ($f-0x30); |
|
|
|
|
|
$CHARSET{G96}->{ $_.$F }->{dimension} = 1; |
|
|
$CHARSET{G96}->{ $_.$F }->{chars} = 96; |
|
|
$CHARSET{G96}->{ $_.$F }->{ucs} = |
|
|
{'' => 0xE926A0, '!' => 0xE96200, '"' => 0xE99D60, '#' => 0xE9D8C0}->{ $_ } |
|
|
+ 96 * ($f-0x30); |
|
|
|
|
|
$CHARSET{C0}->{ $_.$F }->{dimension} = 1; |
|
|
$CHARSET{C0}->{ $_.$F }->{chars} = 32; |
|
|
$CHARSET{C0}->{ $_.$F }->{ucs} = |
|
|
{'' => 0x70000000, '!' => 0x70001400, |
|
|
'"' => 0x70002800, '#' => 0x70003C00}->{ $_ } + 32 * ($f-0x30); |
|
|
|
|
|
$CHARSET{C1}->{ $_.$F }->{dimension} = 1; |
|
|
$CHARSET{C1}->{ $_.$F }->{chars} = 32; |
|
|
$CHARSET{C1}->{ $_.$F }->{ucs} = |
|
|
{'' => 0x70000A00, '!' => 0x70001E00, |
|
|
'"' => 0x70003200, '#' => 0x70004600}->{ $_ } + 32 * ($f-0x30); |
|
|
|
|
|
$CHARSET{G94}->{ ' '.$_.$F }->{dimension} = 1; ## DRCS |
|
|
$CHARSET{G94}->{ ' '.$_.$F }->{chars} = 94; |
|
|
$CHARSET{G94}->{ ' '.$_.$F }->{ucs} = |
|
|
{'' => 0x70090940, '!' => 0x700944A0, |
|
|
'"' => 0x70098000, '#' => 0x7009BB60}->{ $_ } + 94 * ($f-0x30); |
|
|
|
|
|
$CHARSET{G96}->{ ' '.$_.$F }->{dimension} = 1; ## DRCS |
|
|
$CHARSET{G96}->{ ' '.$_.$F }->{chars} = 96; |
|
|
$CHARSET{G96}->{ ' '.$_.$F }->{ucs} = |
|
|
{'' => 0x700926A0, '!' => 0x70096200, |
|
|
'"' => 0x70099D60, '#' => 0x7009D8C0}->{ $_ } + 96 * ($f-0x30); |
|
|
} |
|
|
} |
|
|
for my $f (0x30..0x5F, 0x7E) { |
|
|
my $F = pack 'C', $f; |
|
|
for ('', '!', '"', '#') { |
|
|
$CHARSET{G94n}->{ $_.$F }->{dimension} = 2; |
|
|
$CHARSET{G94n}->{ $_.$F }->{chars} = 94; |
|
|
$CHARSET{G94n}->{ $_.$F }->{ucs} = |
|
|
({'' => 0xE9F6C0}->{ $_ }||0) + 94*94 * ($f-0x30); |
|
|
## BUG: 94^n DRCSes with I byte have no mapping area |
|
|
|
|
|
$CHARSET{G96n}->{ $_.$F }->{dimension} = 2; |
|
|
$CHARSET{G96n}->{ $_.$F }->{chars} = 96; |
|
|
$CHARSET{G96n}->{ $_.$F }->{ucs} = |
|
|
({'' => 0xF4C000}->{ $_ }||0) + 96*96 * ($f-0x30); |
|
|
## BUG: 94^n DRCSes with I byte have no mapping area |
|
|
} |
|
|
} |
|
|
for (0x60..0x6F) { |
|
|
my $F = pack 'C', $_; |
|
|
## BUG: 9x^3 sets have no mapping area |
|
|
for ('', '!', '"', '#', ' ') { |
|
|
$CHARSET{G94n}->{ $_.$F }->{dimension} = 3; |
|
|
$CHARSET{G94n}->{ $_.$F }->{chars} = 94; |
|
|
|
|
|
$CHARSET{G96n}->{ $_.$F }->{dimension} = 3; |
|
|
$CHARSET{G96n}->{ $_.$F }->{chars} = 96; |
|
|
} |
|
|
} |
|
|
for (0x70..0x7D) { |
|
|
my $F = pack 'C', $_; |
|
|
## BUG: 9x^4 sets have no mapping area |
|
|
for ('', '!', '"', '#', ' ') { |
|
|
$CHARSET{G94n}->{ $_.$F }->{dimension} = 4; |
|
|
$CHARSET{G94n}->{ $_.$F }->{chars} = 94; |
|
|
|
|
|
$CHARSET{G96n}->{ $_.$F }->{dimension} = 4; |
|
|
$CHARSET{G96n}->{ $_.$F }->{chars} = 96; |
|
|
} |
|
|
} |
|
|
for my $f (0x40..0x4E) { |
|
|
my $F = pack 'C', $f; |
|
|
$CHARSET{G96n}->{ ' '.$F }->{dimension} = 2; |
|
|
$CHARSET{G96n}->{ ' '.$F }->{chars} = 96; |
|
|
$CHARSET{G96n}->{ ' '.$F }->{ucs} = 0xF0000 + 96*96*($f-0x40); |
|
|
## U+F0000-U+10F7FF (private) -> ESC 02/04 02/00 <I> (04/00-04/14) (DRCS) |
|
|
} |
|
|
|
|
|
$CHARSET{G94}->{B}->{ucs} = 0x21; ## ASCII |
|
|
$CHARSET{G96}->{A}->{ucs} = 0xA0; ## ISO 8859-1 |
|
|
|
|
|
$CHARSET{G94n}->{'B@'}->{dimension} = 2; ## JIS X 0208-1990 |
|
|
$CHARSET{G94n}->{'B@'}->{chars} = 94; |
|
|
$CHARSET{G94n}->{'B@'}->{ucs} = 0xE9F6C0 + 94*94*79; |
|
|
|
|
|
## -- Control character sets |
|
|
$CHARSET{C0}->{'@'}->{ucs} = 0x00; ## ISO/IEC 6429 C0 |
|
|
for ("\x40", "\x43", "\x44", "\x45", "\x46", "\x49", "\x4A", "\x4B", "\x4C") { |
|
|
$CHARSET{C0}->{$_}->{C_LS0} = "\x0F"; |
|
|
$CHARSET{C0}->{$_}->{C_LS1} = "\x0E"; |
|
|
$CHARSET{C0}->{$_}->{r_LS0} = '\x0F'; |
|
|
$CHARSET{C0}->{$_}->{r_LS1} = '\x0E'; |
|
|
} |
|
|
for ("\x40", "\x44", "\x45", "\x46", "\x48", "\x4C") { |
|
|
$CHARSET{C0}->{$_}->{reset_all} = {"\x0A" => 1, "\x0B" => 1, |
|
|
"\x0C" => 1, "\x0D" => 1}; |
|
|
} |
|
|
$CHARSET{C0}->{"\x43"}->{reset_all} = {"\x0A" => 1}; |
|
|
$CHARSET{C0}->{"\x44"}->{C_SS2} = "\x1C"; |
|
|
$CHARSET{C0}->{"\x44"}->{r_SS2} = '\x1C'; |
|
|
for ("\x45", "\x49", "\x4A", "\x4B") { |
|
|
$CHARSET{C0}->{$_}->{C_SS2} = "\x19"; |
|
|
$CHARSET{C0}->{$_}->{C_SS3} = "\x1D"; |
|
|
$CHARSET{C0}->{$_}->{r_SS2} = '\x19'; |
|
|
$CHARSET{C0}->{$_}->{r_SS3} = '\x1D'; |
|
|
} |
|
|
$CHARSET{C0}->{"\x4C"}->{C_SS2} = "\x19"; |
|
|
$CHARSET{C0}->{"\x4C"}->{r_SS2} = '\x19'; |
|
|
|
|
|
$CHARSET{C1}->{'64291991C1'}->{dimension} = 1; ## ISO/IEC 6429:1991 C1 |
|
|
$CHARSET{C1}->{'64291991C1'}->{chars} = 32; |
|
|
$CHARSET{C1}->{'64291991C1'}->{ucs} = 0x80; |
|
|
for ("\x43", "\x45", "\x47", '64291991C1') { |
|
|
$CHARSET{C1}->{$_}->{C_SS2} = "\x8E"; |
|
|
$CHARSET{C1}->{$_}->{C_SS3} = "\x8F"; |
|
|
$CHARSET{C1}->{$_}->{r_SS2} = '\x8E'; |
|
|
$CHARSET{C1}->{$_}->{r_SS3} = '\x8F'; |
|
|
$CHARSET{C1}->{$_}->{r_SS2_ESC} = '\x1B\x4E'; |
|
|
$CHARSET{C1}->{$_}->{r_SS3_ESC} = '\x1B\x4F'; |
|
|
} |
|
|
for ("\x43", '64291991C1') { |
|
|
$CHARSET{C1}->{$_}->{r_CSI} = '\x9B'; |
|
|
$CHARSET{C1}->{$_}->{r_CSI_ESC} = '\x1B\x5B'; |
|
|
$CHARSET{C1}->{$_}->{r_DCS} = '\x90'; |
|
|
$CHARSET{C1}->{$_}->{r_ST} = '\x9C'; |
|
|
$CHARSET{C1}->{$_}->{r_OSC} = '\x9D'; |
|
|
$CHARSET{C1}->{$_}->{r_PM} = '\x9E'; |
|
|
$CHARSET{C1}->{$_}->{r_APC} = '\x9F'; |
|
|
$CHARSET{C1}->{$_}->{reset_all} = {"\x85"=>1, "\x90"=>1, |
|
|
"\x9C"=>1, "\x9D"=>1, "\x9E"=>1, "\x9F"=>1}; |
|
|
} |
|
|
$CHARSET{C1}->{'64291991C1'}->{r_SCI} = '\x9A'; |
|
|
|
|
|
$CHARSET{single_control}->{Fs} ={ucs => 0x70005000, chars => 32, dimension => 1}; |
|
|
$CHARSET{single_control}->{'3F'} ={ucs => 0x70005020, chars => 80, dimension => 1}; |
|
|
$CHARSET{single_control}->{'3F!'}={ucs => 0x70005070, chars => 80, dimension => 1}; |
|
|
$CHARSET{single_control}->{'3F"'}={ucs => 0x700050C0, chars => 80, dimension => 1}; |
|
|
$CHARSET{single_control}->{'3F#'}={ucs => 0x70005110, chars => 80, dimension => 1}; |
|
|
} |
|
|
|
|
|
|
|
68 |
### --- Perl Encode module common functions |
### --- Perl Encode module common functions |
69 |
|
|
70 |
sub encode ($$;$) { |
sub encode ($$;$) { |
81 |
} |
} |
82 |
|
|
83 |
### --- Encode::ISO2022 unique functions |
### --- Encode::ISO2022 unique functions |
84 |
|
*new_object = \&Encode::Charset::new_object; |
85 |
|
|
86 |
## Make a new ISO/IEC 2022-buffers object with default status |
sub iso2022_to_internal ($;\%) { |
87 |
sub new_object { |
my ($s, $C) = @_; |
88 |
my %C; |
$C ||= &new_object; |
89 |
$C{bit} = 8; |
my $t = ''; |
90 |
$C{CL} = 'C0'; $C{CR} = 'C1'; $C{ESC_Fe} = 'C1'; |
$s =~ s{ |
91 |
$C{C0} = $CHARSET{C0}->{"\x40"}; ## ISO/IEC 6429:1991 C0 |
^((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*) |
92 |
$C{C1} = $CHARSET{C1}->{'64291991C1'}; ## ISO/IEC 6429:1991 C1 |
}{ |
93 |
$C{GL} = 'G0'; $C{GR} = 'G1'; |
my $i2 = $1; |
94 |
$C{G0} = $CHARSET{G94}->{"\x42"}; ## ISO/IEC 646:1991 IRV |
$t = _iso2022_to_internal ($i2, $C); |
95 |
#$C{G1} = $CHARSET{G96}->{"\x41"}; ## ISO/IEC 8859-1 GR |
''; |
96 |
$C{G1} = $CHARSET{G94}->{"\x7E"}; ## empty set |
}gesx; |
97 |
$C{G2} = $CHARSET{G94}->{"\x7E"}; ## empty set |
my $pad = ''; |
98 |
$C{G3} = $CHARSET{G94}->{"\x7E"}; ## empty set |
use re 'eval'; |
99 |
$C{option} = { |
$s =~ s{ |
100 |
C1invoke_to_right => 0, ## C1 invoked to: (0: ESC Fe, 1: CR) |
## ISO/IEC 2022 |
101 |
G94n_designate_long => 0, ## (1: ESC 02/04 02/08 04/00..02) |
(??{"$pad\x1B$pad\x25$pad\x40"})((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*) |
102 |
designate_to => { ## Designated G buffer (-1: not be outputed) |
## UTF-8 |
103 |
C0 => { |
|(??{"$pad\x1B$pad\x25$pad(?:\x47|\x2F$pad"."[\x47-\x49])"}) |
104 |
default => 0, |
((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*) |
105 |
}, |
## UCS-2, UTF-16 |
106 |
C1 => { |
|(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x40\x43\x45\x4A-\x4C]) |
107 |
default => 1, |
((?:(?!\x00\x1B\x00\x25(?:\x00\x2F)?\x00[\x30-\x7E])..)*) |
108 |
}, |
## UCS-4 |
109 |
G94 => { |
|(??{"$pad\x1B$pad\x25$pad\x2F$pad"})[\x41\x44\x46] |
110 |
"\x42" => 0, |
((?:(?!\x00\x00\x00\x1B\x00\x00\x00\x25(?:\x00\x00\x00\x2F)? |
111 |
default => 0, |
\x00\x00\x00[\x30-\x7E])....)*) |
112 |
}, |
## with standard return |
113 |
G96 => { |
|(??{"$pad\x1B$pad\x25$pad"})([\x30-\x7E]) |
114 |
default => 1, |
((?:(?!\x1B\x25\x2F?[\x30-\x7E]).)*) |
115 |
}, |
## without standard return |
116 |
G94n => { |
|(??{"$pad\x1B$pad\x25$pad\x2F$pad"})([\x30-\x7E])(.*) |
117 |
default => 0, |
}{ |
118 |
}, |
my ($i2,$u8,$Fu2,$u2,$u4,$Fsr,$sr,$Fnsr,$nsr) = ($1,$2,$3,$4,$5,$6,$7,$8,$9); |
119 |
G96n => { |
my $r = ''; |
120 |
default => 1, |
if (defined $i2) { |
121 |
}, |
$r = _iso2022_to_internal ($i2, $C); $pad = ''; |
122 |
}, |
} elsif (defined $u8) { |
123 |
Ginvoke_by_single_shift => [0,0,0,0], ## Invoked by SS |
$r = Encode::decode ('utf8', $u8); $pad = ''; |
124 |
Ginvoke_to_left => [1,1,1,1], ## Which invoked to? (1: L, 0: R) |
} elsif ($Fu2) { |
125 |
private_set => { ## Private set vs Final byte |
if (ord ($Fu2) > 0x49) { |
126 |
C0 => [], |
$r = Encode::decode ('utf-16be', $u2); |
127 |
C1 => [], |
} else { |
128 |
G94 => [], |
$r = Encode::decode ('ucs-2be', $u2); |
129 |
G94n => [[],[],[],[],[]], |
} |
130 |
G96 => [], |
$pad = "\x00"; |
131 |
#G96n => [], ## (not implemented) |
} elsif (defined $u4) { |
132 |
U96n => [], ## mule-unicode sets |
$r = Encode::decode ('ucs-4be', $u2); $pad = "\x00\x00\x00"; |
133 |
XC1 => { |
} elsif (defined $Fsr && $CODING_SYSTEM{$Fsr}->{perl_name}) { |
134 |
'64291991C1' => undef, ## ISO/IEC 6429:1991 C1 |
$r = Encode::decode ($CODING_SYSTEM{$Fsr}->{perl_name}, $sr); $pad = ''; |
135 |
}, |
} elsif (defined $Fnsr && $CODING_SYSTEM{$Fnsr}->{perl_name}) { |
136 |
}, |
$r = Encode::decode ($CODING_SYSTEM{$Fnsr}->{perl_name}, $nsr); $pad = ''; |
137 |
reset => { ## Reset status at top of line |
} else { ## temporary |
138 |
Gdesignation => "\x42", ## F of designation or 0 |
$r = '?' x length ($sr.$nsr); $pad = ''; |
139 |
Ginvoke => 1, |
} |
140 |
}, |
$r; |
141 |
use_revision => 1, ## Output IRR |
}gesx; |
142 |
}; |
$t . $s; |
|
\%C; |
|
143 |
} |
} |
144 |
|
|
145 |
sub iso2022_to_internal ($;\%) { |
sub _iso2022_to_internal ($;\%) { |
146 |
my ($s, $C) = @_; |
my ($s, $C) = @_; |
147 |
my %_GB_to_GN = ( |
my %_GB_to_GN = ( |
148 |
"\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3', |
"\x28"=>'G0',"\x29"=>'G1',"\x2A"=>'G2',"\x2B"=>'G3', |
149 |
"\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3', |
"\x2C"=>'G0',"\x2D"=>'G1',"\x2E"=>'G2',"\x2F"=>'G3', |
150 |
); |
); |
|
$C ||= &new_object; |
|
151 |
|
|
152 |
use re 'eval'; |
use re 'eval'; |
153 |
$s =~ s{ |
$s =~ s{ |
154 |
((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}} |
((??{ $_CHARS_to_RANGE{'l'.$C->{$C->{GL}}->{chars}} |
155 |
. qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ })) |
. qq/{$C->{$C->{GL}}->{dimension},$C->{$C->{GL}}->{dimension}}/ })) |
156 |
|((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GL}}->{chars}} |
|((??{ $_CHARS_to_RANGE{'r'.$C->{$C->{GR}}->{chars}} |
157 |
. qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ })) |
. qq/{$C->{$C->{GR}}->{dimension},$C->{$C->{GR}}->{dimension}}/ })) |
|
|
|
158 |
| (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)') |
| (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS2} || '(?!)') |
159 |
. ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ? |
. ($C->{$C->{ESC_Fe}}->{r_SS2_ESC} ? |
160 |
qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '') |
qq/|$C->{$C->{ESC_Fe}}->{r_SS2_ESC}/ : '') |
163 |
||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9 |
||$C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9 |
164 |
qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'') |
qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'') |
165 |
}) |
}) |
166 |
((??{ $_CHARS_to_RANGE{'b'.$C->{$C->{GL}}->{chars}} |
((??{ $_CHARS_to_RANGE{'b'.$C->{G2}->{chars}} |
167 |
. qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ })) |
. qq/{$C->{G2}->{dimension},$C->{G2}->{dimension}}/ })) |
168 |
| (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)') |
| (??{ q/(?:/ . ($C->{$C->{CR}}->{r_SS3} || '(?!)') |
169 |
. ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ? |
. ($C->{$C->{ESC_Fe}}->{r_SS3_ESC} ? |
173 |
|| $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9 |
|| $C->{$C->{CL}}->{r_LS1}? ## ISO/IEC 6429:1992 9 |
174 |
qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'') |
qq/[$C->{$C->{CL}}->{r_LS0}$C->{$C->{CL}}->{r_LS1}]*/:'') |
175 |
}) |
}) |
176 |
((??{ $_CHARS_to_RANGE{'b'.$C->{$C->{GL}}->{chars}} |
((??{ $_CHARS_to_RANGE{'b'.$C->{G3}->{chars}} |
177 |
. qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ })) |
. qq/{$C->{G3}->{dimension},$C->{G3}->{dimension}}/ })) |
178 |
|
|
179 |
|((??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' })) ## GL = G0 |
## Locking shift |
180 |
|((??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' })) ## GL = G1 |
|( (??{ $C->{$C->{CL}}->{r_LS0}||'(?!)' }) |
181 |
|
|(??{ $C->{$C->{CL}}->{r_LS1}||'(?!)' }) |
182 |
|
) |
183 |
|
|
184 |
## Control sequence |
## Control sequence |
185 |
|(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)') |
|(??{ '(?:'.($C->{$C->{CR}}->{r_CSI}||'(?!)') |
197 |
## Misc. sequence (SP, control, or broken data) |
## Misc. sequence (SP, control, or broken data) |
198 |
|([\x00-\xFF]) |
|([\x00-\xFF]) |
199 |
}{ |
}{ |
200 |
my ($gl,$gr,$ss2,$ss3,$ls0,$ls1,$csi,$esc,$misc) |
my ($gl,$gr,$ss2,$ss3,$ls,$csi,$esc,$misc) |
201 |
= ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10); |
= ($1,$2,$3,$4,$5,$6,$7,$8,$9); |
202 |
$C->{_irr} = undef unless defined $esc; |
$C->{_irr} = undef unless defined $esc; |
203 |
## GL graphic character |
## GL graphic character |
204 |
if (defined $gl) { |
if (defined $gl) { |
236 |
$c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m; |
$c = $c * $C->{G3}->{chars} + unpack ('C', $_) - $m; |
237 |
} |
} |
238 |
chr ($C->{G3}->{ucs} + $c); |
chr ($C->{G3}->{ucs} + $c); |
|
## Locking shifts |
|
|
} elsif ($ls0) { |
|
|
$C->{GL} = 'G0'; ''; |
|
|
} elsif ($ls1) { |
|
|
$C->{GL} = 'G1'; ''; |
|
239 |
## Escape sequence |
## Escape sequence |
240 |
} elsif ($esc) { |
} elsif ($esc) { |
241 |
## IRR (revision number) |
## IRR (revision number) |
292 |
$C->{GL} = 'G2'; ''; |
$C->{GL} = 'G2'; ''; |
293 |
} elsif ($Fs eq "\x6F") { ## LS3 |
} elsif ($Fs eq "\x6F") { ## LS3 |
294 |
$C->{GL} = 'G3'; ''; |
$C->{GL} = 'G3'; ''; |
295 |
} elsif ($Fs eq "\x7E") { ## LS1R |
} elsif ($Fs eq "\x7E" || $Fs eq "\x6B") { ## LS1R |
296 |
$C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; ''; |
$C->{GR} = 'G1'; $C->{GL} = 'G1' if $C->{bit} == 7; ''; |
297 |
} elsif ($Fs eq "\x7D") { ## LS2R |
} elsif ($Fs eq "\x7D" || $Fs eq "\x6C") { ## LS2R |
298 |
$C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; ''; |
$C->{GR} = 'G2'; $C->{GL} = 'G2' if $C->{bit} == 7; ''; |
299 |
} elsif ($Fs eq "\x7C") { ## LS3R |
} elsif ($Fs eq "\x7C" || $Fs eq "\x6D") { ## LS3R |
300 |
$C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; ''; |
$C->{GR} = 'G3'; $C->{GL} = 'G3' if $C->{bit} == 7; ''; |
301 |
} else { |
} else { |
302 |
chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60)); |
chr ($CHARSET{single_control}->{Fs}->{ucs} + (ord ($Fs) - 0x60)); |
312 |
$C->{_irr} = undef; |
$C->{_irr} = undef; |
313 |
} |
} |
314 |
$esc; |
$esc; |
315 |
|
## Locking shifts |
316 |
|
} elsif ($ls) { |
317 |
|
if ($ls eq $C->{$C->{CL}}->{LS0}) { |
318 |
|
$C->{GL} = 'G0'; ''; |
319 |
|
} elsif ($ls eq $C->{$C->{CL}}->{LS1}) { |
320 |
|
$C->{GL} = 'G1'; ''; |
321 |
|
} |
322 |
## Control sequence |
## Control sequence |
323 |
} elsif ($csi) { |
} elsif ($csi) { |
324 |
$csi =~ tr/\xA0-\xFF/\x20-\x7F/d; |
$csi =~ tr/\xA0-\xFF/\x20-\x7F/d; |
445 |
->[ $c / 8836 ]); |
->[ $c / 8836 ]); |
446 |
} |
} |
447 |
if (defined $t) { |
if (defined $t) { |
448 |
|
$t = _i2o ($t, $C, cs_F => "\x40") |
449 |
|
if $C->{coding_system} ne $CODING_SYSTEM{"\x40"}; |
450 |
|
} else { |
451 |
|
my $F; my @F = qw~G /G /H /I B /A /D /F~; |
452 |
|
push @F, qw~/J /K /L~ if $cc <= 0x10FFFF; |
453 |
|
push @F, qw~/@ /C /E~ if $cc <= 0xFFFF; |
454 |
|
for (@F) { |
455 |
|
if (defined $C->{option}->{designate_to}->{coding_system}->{$_} |
456 |
|
&& $C->{option}->{designate_to}->{coding_system}->{$_} > -1) { |
457 |
|
$F = $_; last; |
458 |
|
} elsif ($C->{option}->{designate_to}->{coding_system}->{default} > -1) { |
459 |
|
$F = $_; last; |
460 |
|
} |
461 |
|
} |
462 |
|
$t = _i2o ($c, $C, cs_F => $F) if $F; |
463 |
|
} |
464 |
|
if (defined $t) { |
465 |
$r .= $t; |
$r .= $t; |
466 |
} else { |
} else { |
467 |
$r .= _i2g ("\x3F", $C, type => 'G94', charset => 'B'); |
unless ($C->{option}->{undef_char}->[0] eq "\x20") { |
468 |
|
$t = _i2g ($C->{option}->{undef_char}->[0], $C, |
469 |
|
%{ $C->{option}->{undef_char}->[1] }); |
470 |
|
} else { ## SP |
471 |
|
$t = _back2ascii ($C) . "\x20"; |
472 |
|
} |
473 |
|
$r .= $C->{coding_system} eq $CODING_SYSTEM{"\x40"} ? |
474 |
|
$t : _i2o ($t, $C, cs_F => "\x40"); |
475 |
} |
} |
476 |
} |
} |
477 |
$r . _back2ascii ($C); |
$r . _back2ascii ($C); |
525 |
sub _i2g ($%%) { |
sub _i2g ($%%) { |
526 |
my ($s, $C, %O) = @_; |
my ($s, $C, %O) = @_; |
527 |
my $r = ''; |
my $r = ''; |
528 |
my $set = $CHARSET{$O{type}}->{$O{charset}}; |
my $set = $CHARSET{$O{type}}->{$O{charset}. |
529 |
|
($O{revision}&&$C->{option}->{use_revision}?$O{revision}:'')}; |
530 |
my $set0 = $CHARSET{$O{type}}->{$O{charset_id}}; |
my $set0 = $CHARSET{$O{type}}->{$O{charset_id}}; |
531 |
## -- designate character set |
## -- designate character set |
532 |
my $G = 0; |
my $G = 0; |
580 |
} elsif ($C->{C0}->{'C_SS'.$G}) { |
} elsif ($C->{C0}->{'C_SS'.$G}) { |
581 |
$r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef; |
$r .= _i2c ($C->{C0}->{'C_SS'.$G}, $C, type => 'C0') || return undef; |
582 |
} else { ## Both C0 and C1 set do not have SS2/3. |
} else { ## Both C0 and C1 set do not have SS2/3. |
583 |
|
$left = 0 if $G == 1 && !$C->{C0}->{C_LS1}; |
584 |
$r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G; |
$r .= __invoke ($C, $G => $left) if $C->{$left?'GL':'GR'} ne 'G'.$G; |
585 |
} |
} |
586 |
} else { |
} else { |
627 |
} |
} |
628 |
''; |
''; |
629 |
} |
} |
630 |
|
sub _i2o ($\%%) { |
631 |
sub make_charset (%) { |
my ($s, $C, %O) = @_; |
632 |
## TODO: support private charset ID such as 'X0' |
my $CS = $CODING_SYSTEM{ $O{cs_F} } || $CODING_SYSTEM{ $O{cs_id} } || return undef; |
633 |
my %set = @_; |
my $r = ''; |
634 |
my $setid = qq($set{I}$set{F}$set{revision}); |
if ($CS ne $C->{coding_system}) { |
635 |
my $settype = $set{type} || 'G94'; |
my $e = ''; |
636 |
delete $set{type}, $set{I}, $set{F}, $set{revision}; |
$e .= "\x1B\x25"; |
637 |
$CHARSET{ $settype }->{ $setid } = \%set; |
$e .= $O{cs_F} || $C->{private_set}->{coding_system}->{ $O{cs_id} } |
638 |
|
|| return undef; |
639 |
|
if ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x40"} |
640 |
|
|| $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x43"} |
641 |
|
|| $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x45"} |
642 |
|
|| $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4A"} |
643 |
|
|| $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4B"} |
644 |
|
|| $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x4C"}) { |
645 |
|
$e =~ s/(.)/\x00$1/go; |
646 |
|
} elsif ($C->{coding_system} eq $CODING_SYSTEM{"\x2F\x41"} |
647 |
|
|| $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x44"} |
648 |
|
|| $C->{coding_system} eq $CODING_SYSTEM{"\x2F\x46"}) { |
649 |
|
$e =~ s/(.)/\x00\x00\x00$1/go; |
650 |
|
} |
651 |
|
$r .= $e; |
652 |
|
$C->{coding_system} = $CS; |
653 |
|
if ($CS->{reset_state}) { |
654 |
|
$C->{GL} = undef; $C->{GR} = undef; |
655 |
|
$C->{C0} = $CHARSET{C0}->{"\x7E"}; |
656 |
|
$C->{C1} = $CHARSET{C1}->{"\x7E"}; |
657 |
|
$C->{G0} = $CHARSET{G94}->{"\x7E"}; |
658 |
|
$C->{G1} = $CHARSET{G94}->{"\x7E"}; |
659 |
|
$C->{G2} = $CHARSET{G94}->{"\x7E"}; |
660 |
|
$C->{G3} = $CHARSET{G94}->{"\x7E"}; |
661 |
|
} |
662 |
|
} |
663 |
|
if ($CS eq $CODING_SYSTEM{"\x40"}) { |
664 |
|
# |
665 |
|
} elsif ($CS eq $CODING_SYSTEM{G} || $CS eq $CODING_SYSTEM{'/G'} |
666 |
|
|| $CS eq $CODING_SYSTEM{'/H'} || $CS eq $CODING_SYSTEM{'/I'}) { |
667 |
|
Encode::_utf8_off ($s); |
668 |
|
} elsif ($CS eq $CODING_SYSTEM{'/@'} || $CS eq $CODING_SYSTEM{'/C'} |
669 |
|
|| $CS eq $CODING_SYSTEM{'/E'}) { |
670 |
|
$s = Encode::encode ('ucs-2be', $s); |
671 |
|
} elsif ($CS eq $CODING_SYSTEM{'/A'} || $CS eq $CODING_SYSTEM{'/D'} |
672 |
|
|| $CS eq $CODING_SYSTEM{'/F'}) { |
673 |
|
$s = Encode::encode ('ucs-4be', $s); |
674 |
|
} elsif ($CS eq $CODING_SYSTEM{'/J'} || $CS eq $CODING_SYSTEM{'/K'} |
675 |
|
|| $CS eq $CODING_SYSTEM{'/L'}) { |
676 |
|
$s = Encode::encode ('UTF-16BE', $s); |
677 |
|
} elsif ($CS eq $CODING_SYSTEM{B}) { |
678 |
|
$s = Encode::encode ('utf-1', $s); |
679 |
|
} else { |
680 |
|
return undef; |
681 |
|
} |
682 |
|
$r . $s; |
683 |
} |
} |
684 |
|
|
685 |
1; |
1; |
702 |
|
|
703 |
Encode, perlunicode |
Encode, perlunicode |
704 |
|
|
705 |
|
=head1 TODO |
706 |
|
|
707 |
|
=over 4 |
708 |
|
|
709 |
|
=item NCR (coding system other than ISO/IEC 2022) support |
710 |
|
|
711 |
|
=over 2 |
712 |
|
|
713 |
|
=item ESC 02/05 02/15 03/x of X Compound Text |
714 |
|
|
715 |
|
=back |
716 |
|
|
717 |
|
=item Output of control character sets, single control functions |
718 |
|
|
719 |
|
=item Designation sequence of control character sets (input) |
720 |
|
|
721 |
|
=item Special graphic character sets such as G3 of EUC-TW |
722 |
|
|
723 |
|
=item SUPER SHIFT (SS) invoke function of old control character set |
724 |
|
|
725 |
|
=item Safe transparent of control string (ISO/IEC 6429) |
726 |
|
|
727 |
|
=item Output of unoutputable characters as alternative notation such as SGML-like entity |
728 |
|
|
729 |
|
=item C0 set invoked to CR area like ISIRI code |
730 |
|
|
731 |
|
Really need? |
732 |
|
|
733 |
|
=item special treatment of 0x20, 0x7E, 0xA0, 0xFF |
734 |
|
|
735 |
|
For example, GB mongolian sets use MSP (MONGOLIAN SPACE) |
736 |
|
with these code positions. |
737 |
|
|
738 |
|
And, no less coding systems does not use (or does ban using) DEL. |
739 |
|
|
740 |
|
=item A lot of character sets don't have pseudo-UCS mapping. |
741 |
|
|
742 |
|
Most of 9m^n (n >= 3) sets, 9m^n sets with I byte, 9m^n |
743 |
|
DRCSes do not have pseudo-UCS mapping area. It is |
744 |
|
questionable to allocate lots of code positions to these |
745 |
|
rarely-(or no-)used character sets. |
746 |
|
|
747 |
|
=item Even character sets that have pseudo-UCS mapping, some of them can't be outputed in ISO/IEC 2022. |
748 |
|
|
749 |
|
Because output of rarely-used character sets is |
750 |
|
not implemented yet. |
751 |
|
|
752 |
|
=back |
753 |
|
|
754 |
|
=head1 AUTHORS |
755 |
|
|
756 |
|
Nanashi-san |
757 |
|
|
758 |
|
Wakaba <w@suika.fam.cx> |
759 |
|
|
760 |
=head1 LICENSE |
=head1 LICENSE |
761 |
|
|
762 |
Copyright 2002 wakaba <w@suika.fam.cx> |
Copyright 2002 AUTHORS |
763 |
|
|
764 |
This library is free software; you can redistribute it |
This library is free software; you can redistribute it |
765 |
and/or modify it under the same terms as Perl itself. |
and/or modify it under the same terms as Perl itself. |