1 |
wakaba |
1.1 |
|
2 |
|
|
=head1 NAME |
3 |
|
|
|
4 |
|
|
Encode::Charset --- Coded Character Sets objects, |
5 |
|
|
used by Encode::ISO2022, Encode::SJIS, and other modules. |
6 |
|
|
|
7 |
|
|
=cut |
8 |
|
|
|
9 |
|
|
package Encode::Charset; |
10 |
|
|
use strict; |
11 |
|
|
use vars qw(%CHARSET %CODING_SYSTEM $VERSION); |
12 |
wakaba |
1.5 |
$VERSION=do{my @r=(q$Revision: 1.4 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
13 |
wakaba |
1.1 |
|
14 |
|
|
## --- Make initial charset definitions |
15 |
|
|
&_make_initial_charsets; |
16 |
|
|
sub _make_initial_charsets () { |
17 |
|
|
for my $f (0x30..0x7E) { |
18 |
|
|
my $F = pack 'C', $f; |
19 |
|
|
for ('', '!', '"', '#') { |
20 |
|
|
$CHARSET{G94}->{ $_.$F }->{dimension} = 1; |
21 |
|
|
$CHARSET{G94}->{ $_.$F }->{chars} = 94; |
22 |
|
|
$CHARSET{G94}->{ $_.$F }->{ucs} = |
23 |
|
|
{'' => 0xE90940, '!' => 0xE944A0, '"' => 0xE98000, '#' => 0xE9BB60}->{ $_ } |
24 |
|
|
+ 94 * ($f-0x30); |
25 |
|
|
|
26 |
|
|
$CHARSET{G96}->{ $_.$F }->{dimension} = 1; |
27 |
|
|
$CHARSET{G96}->{ $_.$F }->{chars} = 96; |
28 |
|
|
$CHARSET{G96}->{ $_.$F }->{ucs} = |
29 |
|
|
{'' => 0xE926A0, '!' => 0xE96200, '"' => 0xE99D60, '#' => 0xE9D8C0}->{ $_ } |
30 |
|
|
+ 96 * ($f-0x30); |
31 |
|
|
|
32 |
|
|
$CHARSET{C0}->{ $_.$F }->{dimension} = 1; |
33 |
|
|
$CHARSET{C0}->{ $_.$F }->{chars} = 32; |
34 |
|
|
$CHARSET{C0}->{ $_.$F }->{ucs} = |
35 |
|
|
{'' => 0x70000000, '!' => 0x70001400, |
36 |
|
|
'"' => 0x70002800, '#' => 0x70003C00}->{ $_ } + 32 * ($f-0x30); |
37 |
|
|
|
38 |
|
|
$CHARSET{C1}->{ $_.$F }->{dimension} = 1; |
39 |
|
|
$CHARSET{C1}->{ $_.$F }->{chars} = 32; |
40 |
|
|
$CHARSET{C1}->{ $_.$F }->{ucs} = |
41 |
|
|
{'' => 0x70000A00, '!' => 0x70001E00, |
42 |
|
|
'"' => 0x70003200, '#' => 0x70004600}->{ $_ } + 32 * ($f-0x30); |
43 |
|
|
|
44 |
|
|
$CHARSET{G94}->{ ' '.$_.$F }->{dimension} = 1; ## DRCS |
45 |
|
|
$CHARSET{G94}->{ ' '.$_.$F }->{chars} = 94; |
46 |
|
|
$CHARSET{G94}->{ ' '.$_.$F }->{ucs} = |
47 |
|
|
{'' => 0x70090940, '!' => 0x700944A0, |
48 |
|
|
'"' => 0x70098000, '#' => 0x7009BB60}->{ $_ } + 94 * ($f-0x30); |
49 |
|
|
|
50 |
|
|
$CHARSET{G96}->{ ' '.$_.$F }->{dimension} = 1; ## DRCS |
51 |
|
|
$CHARSET{G96}->{ ' '.$_.$F }->{chars} = 96; |
52 |
|
|
$CHARSET{G96}->{ ' '.$_.$F }->{ucs} = |
53 |
|
|
{'' => 0x700926A0, '!' => 0x70096200, |
54 |
|
|
'"' => 0x70099D60, '#' => 0x7009D8C0}->{ $_ } + 96 * ($f-0x30); |
55 |
|
|
} |
56 |
|
|
} |
57 |
|
|
for my $f (0x30..0x5F, 0x7E) { |
58 |
|
|
my $F = pack 'C', $f; |
59 |
|
|
for ('', '!', '"', '#', ' ') { |
60 |
|
|
$CHARSET{G94n}->{ $_.$F }->{dimension} = 2; |
61 |
|
|
$CHARSET{G94n}->{ $_.$F }->{chars} = 94; |
62 |
|
|
$CHARSET{G94n}->{ $_.$F }->{ucs} = |
63 |
|
|
({'' => 0xE9F6C0}->{ $_ }||0) + 94*94 * ($f-0x30); |
64 |
|
|
## BUG: 94^n sets with I byte have no mapping area |
65 |
|
|
|
66 |
|
|
$CHARSET{G96n}->{ $_.$F }->{dimension} = 2; |
67 |
|
|
$CHARSET{G96n}->{ $_.$F }->{chars} = 96; |
68 |
|
|
$CHARSET{G96n}->{ $_.$F }->{ucs} = |
69 |
|
|
({'' => 0xF4C000}->{ $_ }||0) + 96*96 * ($f-0x30); |
70 |
wakaba |
1.4 |
## BUG: 96^n DRCSes with I byte have no mapping area |
71 |
wakaba |
1.1 |
} |
72 |
|
|
} |
73 |
wakaba |
1.4 |
$CHARSET{G94n}->{"\x20\x40"}->{ucs} = 0x70460000; ## DRCS 94^2 04/00 |
74 |
wakaba |
1.5 |
$CHARSET{G94n}->{P4_0} = $CHARSET{G94n}->{"\x20\x40"}; |
75 |
wakaba |
1.4 |
|
76 |
wakaba |
1.1 |
for (0x60..0x6F) { |
77 |
|
|
my $F = pack 'C', $_; |
78 |
|
|
## BUG: 9x^3 sets have no mapping area |
79 |
|
|
for ('', '!', '"', '#', ' ') { |
80 |
|
|
$CHARSET{G94n}->{ $_.$F }->{dimension} = 3; |
81 |
|
|
$CHARSET{G94n}->{ $_.$F }->{chars} = 94; |
82 |
|
|
|
83 |
|
|
$CHARSET{G96n}->{ $_.$F }->{dimension} = 3; |
84 |
|
|
$CHARSET{G96n}->{ $_.$F }->{chars} = 96; |
85 |
|
|
} |
86 |
|
|
} |
87 |
|
|
for (0x70..0x7D) { |
88 |
|
|
my $F = pack 'C', $_; |
89 |
|
|
## BUG: 9x^4 sets have no mapping area |
90 |
|
|
for ('', '!', '"', '#', ' ') { |
91 |
|
|
$CHARSET{G94n}->{ $_.$F }->{dimension} = 4; |
92 |
|
|
$CHARSET{G94n}->{ $_.$F }->{chars} = 94; |
93 |
|
|
|
94 |
|
|
$CHARSET{G96n}->{ $_.$F }->{dimension} = 4; |
95 |
|
|
$CHARSET{G96n}->{ $_.$F }->{chars} = 96; |
96 |
|
|
} |
97 |
|
|
} |
98 |
|
|
for my $f (0x40..0x4E) { |
99 |
|
|
my $F = pack 'C', $f; |
100 |
|
|
$CHARSET{G96n}->{ ' '.$F }->{dimension} = 2; |
101 |
|
|
$CHARSET{G96n}->{ ' '.$F }->{chars} = 96; |
102 |
|
|
$CHARSET{G96n}->{ ' '.$F }->{ucs} = 0xF0000 + 96*96*($f-0x40); |
103 |
|
|
## U+F0000-U+10F7FF (private) -> ESC 02/04 02/00 <I> (04/00-04/14) (DRCS) |
104 |
|
|
} |
105 |
|
|
|
106 |
|
|
$CHARSET{G94}->{B}->{ucs} = 0x21; ## ASCII |
107 |
|
|
$CHARSET{G96}->{A}->{ucs} = 0xA0; ## ISO 8859-1 |
108 |
|
|
|
109 |
|
|
$CHARSET{G94n}->{'B@'}->{dimension} = 2; ## JIS X 0208-1990 |
110 |
|
|
$CHARSET{G94n}->{'B@'}->{chars} = 94; |
111 |
|
|
$CHARSET{G94n}->{'B@'}->{ucs} = 0xE9F6C0 + 94*94*79; |
112 |
wakaba |
1.3 |
|
113 |
|
|
## SJIS G3 mapping (JIS X 0213:2000 plane 2) |
114 |
|
|
$CHARSET{G94n}->{"\x50"}->{Csjis_kuE} = { # ku - 1 |
115 |
|
|
0xF0 => 7, 0xF1 => 3, 0xF2 => 11, 0xF3 => 13, 0xF4 => 77, |
116 |
|
|
0xF5 => 79, 0xF6 => 81, 0xF7 => 83, 0xF8 => 85, 0xF9 => 87, |
117 |
|
|
0xFA => 89, 0xFB => 91, 0xFC => 93, |
118 |
|
|
}; |
119 |
|
|
$CHARSET{G94n}->{"\x50"}->{Csjis_kuO} = { # ku - 1 |
120 |
|
|
0xF0 => 0, 0xF1 => 2, 0xF2 => 4, 0xF3 => 12, 0xF4 => 14, |
121 |
|
|
0xF5 => 78, 0xF6 => 80, 0xF7 => 82, 0xF8 => 84, 0xF9 => 86, |
122 |
|
|
0xFA => 88, 0xFB => 90, 0xFC => 92, |
123 |
|
|
}; |
124 |
|
|
$CHARSET{G94n}->{"\x50"}->{Csjis_first} = { reverse ( |
125 |
|
|
%{ $CHARSET{G94n}->{"\x50"}->{Csjis_kuE} }, |
126 |
|
|
%{ $CHARSET{G94n}->{"\x50"}->{Csjis_kuO} }, |
127 |
|
|
)}; |
128 |
|
|
|
129 |
wakaba |
1.1 |
## -- Control character sets |
130 |
|
|
$CHARSET{C0}->{'@'}->{ucs} = 0x00; ## ISO/IEC 6429 C0 |
131 |
|
|
for ("\x40", "\x43", "\x44", "\x45", "\x46", "\x49", "\x4A", "\x4B", "\x4C") { |
132 |
|
|
$CHARSET{C0}->{$_}->{C_LS0} = "\x0F"; |
133 |
|
|
$CHARSET{C0}->{$_}->{C_LS1} = "\x0E"; |
134 |
|
|
$CHARSET{C0}->{$_}->{r_LS0} = '\x0F'; |
135 |
|
|
$CHARSET{C0}->{$_}->{r_LS1} = '\x0E'; |
136 |
|
|
} |
137 |
|
|
for ("\x40", "\x44", "\x45", "\x46", "\x48", "\x4C") { |
138 |
|
|
$CHARSET{C0}->{$_}->{reset_all} = {"\x0A" => 1, "\x0B" => 1, |
139 |
|
|
"\x0C" => 1, "\x0D" => 1}; |
140 |
|
|
} |
141 |
|
|
$CHARSET{C0}->{"\x43"}->{reset_all} = {"\x0A" => 1}; |
142 |
|
|
$CHARSET{C0}->{"\x44"}->{C_SS2} = "\x1C"; |
143 |
|
|
$CHARSET{C0}->{"\x44"}->{r_SS2} = '\x1C'; |
144 |
|
|
for ("\x45", "\x49", "\x4A", "\x4B") { |
145 |
|
|
$CHARSET{C0}->{$_}->{C_SS2} = "\x19"; |
146 |
|
|
$CHARSET{C0}->{$_}->{C_SS3} = "\x1D"; |
147 |
|
|
$CHARSET{C0}->{$_}->{r_SS2} = '\x19'; |
148 |
|
|
$CHARSET{C0}->{$_}->{r_SS3} = '\x1D'; |
149 |
|
|
} |
150 |
|
|
$CHARSET{C0}->{"\x4C"}->{C_SS2} = "\x19"; |
151 |
|
|
$CHARSET{C0}->{"\x4C"}->{r_SS2} = '\x19'; |
152 |
|
|
|
153 |
|
|
$CHARSET{C1}->{'64291991C1'}->{dimension} = 1; ## ISO/IEC 6429:1991 C1 |
154 |
|
|
$CHARSET{C1}->{'64291991C1'}->{chars} = 32; |
155 |
|
|
$CHARSET{C1}->{'64291991C1'}->{ucs} = 0x80; |
156 |
|
|
for ("\x43", "\x45", "\x47", '64291991C1') { |
157 |
|
|
$CHARSET{C1}->{$_}->{C_SS2} = "\x8E"; |
158 |
|
|
$CHARSET{C1}->{$_}->{C_SS3} = "\x8F"; |
159 |
|
|
$CHARSET{C1}->{$_}->{r_SS2} = '\x8E'; |
160 |
|
|
$CHARSET{C1}->{$_}->{r_SS3} = '\x8F'; |
161 |
|
|
$CHARSET{C1}->{$_}->{r_SS2_ESC} = '\x1B\x4E'; |
162 |
|
|
$CHARSET{C1}->{$_}->{r_SS3_ESC} = '\x1B\x4F'; |
163 |
|
|
} |
164 |
|
|
for ("\x43", '64291991C1') { |
165 |
|
|
$CHARSET{C1}->{$_}->{r_CSI} = '\x9B'; |
166 |
|
|
$CHARSET{C1}->{$_}->{r_CSI_ESC} = '\x1B\x5B'; |
167 |
|
|
$CHARSET{C1}->{$_}->{r_DCS} = '\x90'; |
168 |
|
|
$CHARSET{C1}->{$_}->{r_ST} = '\x9C'; |
169 |
|
|
$CHARSET{C1}->{$_}->{r_OSC} = '\x9D'; |
170 |
|
|
$CHARSET{C1}->{$_}->{r_PM} = '\x9E'; |
171 |
|
|
$CHARSET{C1}->{$_}->{r_APC} = '\x9F'; |
172 |
|
|
$CHARSET{C1}->{$_}->{reset_all} = {"\x85"=>1, "\x90"=>1, |
173 |
|
|
"\x9C"=>1, "\x9D"=>1, "\x9E"=>1, "\x9F"=>1}; |
174 |
|
|
} |
175 |
|
|
$CHARSET{C1}->{'64291991C1'}->{r_SCI} = '\x9A'; |
176 |
|
|
|
177 |
|
|
$CHARSET{single_control}->{Fs} ={ucs => 0x70005000, chars => 32, dimension => 1}; |
178 |
|
|
$CHARSET{single_control}->{'3F'} ={ucs => 0x70005020, chars => 80, dimension => 1}; |
179 |
|
|
$CHARSET{single_control}->{'3F!'}={ucs => 0x70005070, chars => 80, dimension => 1}; |
180 |
|
|
$CHARSET{single_control}->{'3F"'}={ucs => 0x700050C0, chars => 80, dimension => 1}; |
181 |
|
|
$CHARSET{single_control}->{'3F#'}={ucs => 0x70005110, chars => 80, dimension => 1}; |
182 |
|
|
} |
183 |
|
|
|
184 |
|
|
&make_initial_coding_system; |
185 |
|
|
sub make_initial_coding_system { |
186 |
|
|
for (0x30..0x7E) { |
187 |
|
|
my $F = chr $_; |
188 |
|
|
$CODING_SYSTEM{$F} = {}; |
189 |
|
|
$CODING_SYSTEM{"\x2F".$F} = {reset_state => 1}; |
190 |
|
|
} |
191 |
wakaba |
1.2 |
$CODING_SYSTEM{Csjis} = {perl_name => 'shiftjis'}; |
192 |
wakaba |
1.1 |
} |
193 |
|
|
|
194 |
|
|
sub make_charset (%) { |
195 |
|
|
my %set = @_; |
196 |
|
|
my $setid = qq($set{I}$set{F}$set{revision}); |
197 |
|
|
my $settype = $set{type} || 'G94'; |
198 |
|
|
delete $set{type}, $set{I}, $set{F}, $set{revision}; |
199 |
|
|
$CHARSET{ $settype }->{ $setid } = \%set; |
200 |
|
|
} |
201 |
|
|
|
202 |
wakaba |
1.2 |
## Make a new ISO/IEC 2022-buffers object with default status |
203 |
|
|
sub new_object { |
204 |
|
|
my %C; |
205 |
|
|
$C{bit} = 8; |
206 |
wakaba |
1.3 |
$C{coding_system} = $CODING_SYSTEM{"\x40"}; ## ISO/IEC 2022 |
207 |
wakaba |
1.2 |
$C{CL} = 'C0'; $C{CR} = 'C1'; $C{ESC_Fe} = 'C1'; |
208 |
|
|
$C{C0} = $CHARSET{C0}->{"\x40"}; ## ISO/IEC 6429:1991 C0 |
209 |
|
|
$C{C1} = $CHARSET{C1}->{'64291991C1'}; ## ISO/IEC 6429:1991 C1 |
210 |
|
|
$C{GL} = 'G0'; $C{GR} = 'G1'; |
211 |
|
|
$C{G0} = $CHARSET{G94}->{"\x42"}; ## ISO/IEC 646:1991 IRV |
212 |
|
|
#$C{G1} = $CHARSET{G96}->{"\x41"}; ## ISO/IEC 8859-1 GR |
213 |
|
|
$C{G1} = $CHARSET{G94}->{"\x7E"}; ## empty set |
214 |
|
|
$C{G2} = $CHARSET{G94}->{"\x7E"}; ## empty set |
215 |
|
|
$C{G3} = $CHARSET{G94}->{"\x7E"}; ## empty set |
216 |
|
|
$C{option} = { |
217 |
|
|
C1invoke_to_right => 0, ## C1 invoked to: (0: ESC Fe, 1: CR) |
218 |
|
|
G94n_designate_long => 0, ## (1: ESC 02/04 02/08 04/00..02) |
219 |
wakaba |
1.5 |
designate_to => { ## Designated G buffer (-1: not to be outputed) |
220 |
wakaba |
1.2 |
C0 => { |
221 |
|
|
default => 0, |
222 |
|
|
}, |
223 |
|
|
C1 => { |
224 |
|
|
default => 1, |
225 |
|
|
}, |
226 |
|
|
G94 => { |
227 |
|
|
"\x42" => 0, |
228 |
|
|
default => 0, |
229 |
|
|
}, |
230 |
|
|
G96 => { |
231 |
|
|
default => 1, |
232 |
|
|
}, |
233 |
|
|
G94n => { |
234 |
|
|
default => 0, |
235 |
|
|
}, |
236 |
|
|
G96n => { |
237 |
|
|
default => 1, |
238 |
|
|
}, |
239 |
|
|
coding_system => { |
240 |
|
|
default => -1, |
241 |
|
|
}, |
242 |
|
|
}, |
243 |
wakaba |
1.5 |
final_to_set => { |
244 |
|
|
C0 => {}, C1 => {}, G94 => {}, G94n => {}, |
245 |
|
|
G96 => {}, G96n => {}, coding_system => {}, |
246 |
|
|
}, |
247 |
wakaba |
1.2 |
Ginvoke_by_single_shift => [0,0,0,0], ## Invoked by SS |
248 |
|
|
Ginvoke_to_left => [1,1,1,1], ## Which invoked to? (1: L, 0: R) |
249 |
|
|
private_set => { ## Private set vs Final byte |
250 |
|
|
C0 => [], |
251 |
|
|
C1 => [], |
252 |
|
|
G94 => [], |
253 |
wakaba |
1.4 |
G94n => [[],[],[],[],["\x20\x40"]], |
254 |
wakaba |
1.2 |
G96 => [], |
255 |
|
|
#G96n => [], ## (not implemented) |
256 |
|
|
U96n => [], ## mule-unicode sets |
257 |
|
|
XC1 => { |
258 |
|
|
'64291991C1' => undef, ## ISO/IEC 6429:1991 C1 |
259 |
|
|
}, |
260 |
|
|
}, |
261 |
|
|
reset => { ## Reset status at top of line |
262 |
|
|
Gdesignation => "\x42", ## F of designation or 0 |
263 |
|
|
Ginvoke => 1, |
264 |
|
|
}, |
265 |
|
|
undef_char => ["\x3F", {type => 'G94', charset => 'B'}], |
266 |
|
|
use_revision => 1, ## Output IRR |
267 |
|
|
}; |
268 |
|
|
\%C; |
269 |
|
|
} |
270 |
|
|
|
271 |
wakaba |
1.3 |
sub new_object_sjis { |
272 |
wakaba |
1.5 |
my $C = &new_object; |
273 |
wakaba |
1.3 |
$C->{coding_system} = $CODING_SYSTEM{Csjis}; |
274 |
|
|
$C->{CR} = undef; |
275 |
|
|
$C->{GR} = 'G2'; ## 0xA1-0xDF |
276 |
wakaba |
1.4 |
$C->{G0} = $CHARSET{G94}->{J}; ## JIS X 0201:1997 Latin |
277 |
|
|
$C->{G1} = $CHARSET{G94n}->{"\x4F"}; ## JIS X 0213:2000 |
278 |
wakaba |
1.3 |
$C->{G2} = $CHARSET{G94}->{I}; ## JIS X 0201:1997 Katakana |
279 |
|
|
$C->{G3} = $CHARSET{G94n}->{"\x50"}; ## JIS X 0213:2000 plane 2 |
280 |
wakaba |
1.4 |
## Special code area (0xFD-0xFF) |
281 |
|
|
$C->{Gsmap} = {"\xA0" => "\x{F8F0}", "\xFD" => "\x{F8F1}", "\xFE" => "\x{F8F2}", "\xFF" => "\x{F8F3}"}; |
282 |
|
|
$C->{GsmapR} = {}; ## Reversed table |
283 |
|
|
$C->{option}->{undef_char_sjis} = "\x81\xAC"; |
284 |
wakaba |
1.3 |
$C; |
285 |
|
|
} |
286 |
|
|
|
287 |
wakaba |
1.1 |
1; |
288 |
|
|
__END__ |
289 |
|
|
|
290 |
|
|
=head1 AUTHORS |
291 |
|
|
|
292 |
|
|
Nanashi-san |
293 |
|
|
|
294 |
|
|
Wakaba <w@suika.fam.cx> |
295 |
|
|
|
296 |
|
|
=head1 LICENSE |
297 |
|
|
|
298 |
|
|
Copyright 2002 AUTHORS |
299 |
|
|
|
300 |
|
|
This library is free software; you can redistribute it |
301 |
|
|
and/or modify it under the same terms as Perl itself. |
302 |
|
|
|
303 |
|
|
=cut |
304 |
|
|
|
305 |
wakaba |
1.5 |
# $Date: 2002/10/12 11:03:00 $ |
306 |
wakaba |
1.1 |
### Charset.pm ends here |