| 1 |
|
| 2 |
=head1 NAME |
| 3 |
|
| 4 |
Encode::Charset --- Coded Character Sets objects, |
| 5 |
used by Encode::ISO2022, Encode::SJIS, and other modules. |
| 6 |
|
| 7 |
=cut |
| 8 |
|
| 9 |
package Encode::Charset; |
| 10 |
use strict; |
| 11 |
use vars qw(%CHARSET %CODING_SYSTEM $VERSION); |
| 12 |
$VERSION=do{my @r=(q$Revision: 1.5 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
| 13 |
|
| 14 |
## --- Make initial charset definitions |
| 15 |
&_make_initial_charsets; |
| 16 |
sub _make_initial_charsets () { |
| 17 |
for my $f (0x30..0x7E) { |
| 18 |
my $F = pack 'C', $f; |
| 19 |
for ('', '!', '"', '#') { |
| 20 |
$CHARSET{G94}->{ $_.$F }->{dimension} = 1; |
| 21 |
$CHARSET{G94}->{ $_.$F }->{chars} = 94; |
| 22 |
$CHARSET{G94}->{ $_.$F }->{ucs} = |
| 23 |
{'' => 0xE90940, '!' => 0xE944A0, '"' => 0xE98000, '#' => 0xE9BB60}->{ $_ } |
| 24 |
+ 94 * ($f-0x30); |
| 25 |
|
| 26 |
$CHARSET{G96}->{ $_.$F }->{dimension} = 1; |
| 27 |
$CHARSET{G96}->{ $_.$F }->{chars} = 96; |
| 28 |
$CHARSET{G96}->{ $_.$F }->{ucs} = |
| 29 |
{'' => 0xE926A0, '!' => 0xE96200, '"' => 0xE99D60, '#' => 0xE9D8C0}->{ $_ } |
| 30 |
+ 96 * ($f-0x30); |
| 31 |
|
| 32 |
$CHARSET{C0}->{ $_.$F }->{dimension} = 1; |
| 33 |
$CHARSET{C0}->{ $_.$F }->{chars} = 32; |
| 34 |
$CHARSET{C0}->{ $_.$F }->{ucs} = |
| 35 |
{'' => 0x70000000, '!' => 0x70001400, |
| 36 |
'"' => 0x70002800, '#' => 0x70003C00}->{ $_ } + 32 * ($f-0x30); |
| 37 |
|
| 38 |
$CHARSET{C1}->{ $_.$F }->{dimension} = 1; |
| 39 |
$CHARSET{C1}->{ $_.$F }->{chars} = 32; |
| 40 |
$CHARSET{C1}->{ $_.$F }->{ucs} = |
| 41 |
{'' => 0x70000A00, '!' => 0x70001E00, |
| 42 |
'"' => 0x70003200, '#' => 0x70004600}->{ $_ } + 32 * ($f-0x30); |
| 43 |
|
| 44 |
$CHARSET{G94}->{ ' '.$_.$F }->{dimension} = 1; ## DRCS |
| 45 |
$CHARSET{G94}->{ ' '.$_.$F }->{chars} = 94; |
| 46 |
$CHARSET{G94}->{ ' '.$_.$F }->{ucs} = |
| 47 |
{'' => 0x70090940, '!' => 0x700944A0, |
| 48 |
'"' => 0x70098000, '#' => 0x7009BB60}->{ $_ } + 94 * ($f-0x30); |
| 49 |
|
| 50 |
$CHARSET{G96}->{ ' '.$_.$F }->{dimension} = 1; ## DRCS |
| 51 |
$CHARSET{G96}->{ ' '.$_.$F }->{chars} = 96; |
| 52 |
$CHARSET{G96}->{ ' '.$_.$F }->{ucs} = |
| 53 |
{'' => 0x700926A0, '!' => 0x70096200, |
| 54 |
'"' => 0x70099D60, '#' => 0x7009D8C0}->{ $_ } + 96 * ($f-0x30); |
| 55 |
} |
| 56 |
} |
| 57 |
for my $f (0x30..0x5F, 0x7E) { |
| 58 |
my $F = pack 'C', $f; |
| 59 |
for ('', '!', '"', '#', ' ') { |
| 60 |
$CHARSET{G94n}->{ $_.$F }->{dimension} = 2; |
| 61 |
$CHARSET{G94n}->{ $_.$F }->{chars} = 94; |
| 62 |
$CHARSET{G94n}->{ $_.$F }->{ucs} = |
| 63 |
({'' => 0xE9F6C0}->{ $_ }||0) + 94*94 * ($f-0x30); |
| 64 |
## BUG: 94^n sets with I byte have no mapping area |
| 65 |
|
| 66 |
$CHARSET{G96n}->{ $_.$F }->{dimension} = 2; |
| 67 |
$CHARSET{G96n}->{ $_.$F }->{chars} = 96; |
| 68 |
$CHARSET{G96n}->{ $_.$F }->{ucs} = |
| 69 |
({'' => 0xF4C000}->{ $_ }||0) + 96*96 * ($f-0x30); |
| 70 |
## BUG: 96^n DRCSes with I byte have no mapping area |
| 71 |
} |
| 72 |
} |
| 73 |
$CHARSET{G94n}->{"\x20\x40"}->{ucs} = 0x70460000; ## DRCS 94^2 04/00 |
| 74 |
$CHARSET{G94n}->{P4_0} = $CHARSET{G94n}->{"\x20\x40"}; |
| 75 |
|
| 76 |
for (0x60..0x6F) { |
| 77 |
my $F = pack 'C', $_; |
| 78 |
## BUG: 9x^3 sets have no mapping area |
| 79 |
for ('', '!', '"', '#', ' ') { |
| 80 |
$CHARSET{G94n}->{ $_.$F }->{dimension} = 3; |
| 81 |
$CHARSET{G94n}->{ $_.$F }->{chars} = 94; |
| 82 |
|
| 83 |
$CHARSET{G96n}->{ $_.$F }->{dimension} = 3; |
| 84 |
$CHARSET{G96n}->{ $_.$F }->{chars} = 96; |
| 85 |
} |
| 86 |
} |
| 87 |
for (0x70..0x7D) { |
| 88 |
my $F = pack 'C', $_; |
| 89 |
## BUG: 9x^4 sets have no mapping area |
| 90 |
for ('', '!', '"', '#', ' ') { |
| 91 |
$CHARSET{G94n}->{ $_.$F }->{dimension} = 4; |
| 92 |
$CHARSET{G94n}->{ $_.$F }->{chars} = 94; |
| 93 |
|
| 94 |
$CHARSET{G96n}->{ $_.$F }->{dimension} = 4; |
| 95 |
$CHARSET{G96n}->{ $_.$F }->{chars} = 96; |
| 96 |
} |
| 97 |
} |
| 98 |
for my $f (0x40..0x4E) { |
| 99 |
my $F = pack 'C', $f; |
| 100 |
$CHARSET{G96n}->{ ' '.$F }->{dimension} = 2; |
| 101 |
$CHARSET{G96n}->{ ' '.$F }->{chars} = 96; |
| 102 |
$CHARSET{G96n}->{ ' '.$F }->{ucs} = 0xF0000 + 96*96*($f-0x40); |
| 103 |
## U+F0000-U+10F7FF (private) -> ESC 02/04 02/00 <I> (04/00-04/14) (DRCS) |
| 104 |
} |
| 105 |
|
| 106 |
$CHARSET{G94}->{B}->{ucs} = 0x21; ## ASCII |
| 107 |
$CHARSET{G96}->{A}->{ucs} = 0xA0; ## ISO 8859-1 |
| 108 |
|
| 109 |
$CHARSET{G94n}->{'B@'}->{dimension} = 2; ## JIS X 0208-1990 |
| 110 |
$CHARSET{G94n}->{'B@'}->{chars} = 94; |
| 111 |
$CHARSET{G94n}->{'B@'}->{ucs} = 0xE9F6C0 + 94*94*79; |
| 112 |
|
| 113 |
## SJIS G3 mapping (JIS X 0213:2000 plane 2) |
| 114 |
$CHARSET{G94n}->{"\x50"}->{Csjis_kuE} = { # ku - 1 |
| 115 |
0xF0 => 7, 0xF1 => 3, 0xF2 => 11, 0xF3 => 13, 0xF4 => 77, |
| 116 |
0xF5 => 79, 0xF6 => 81, 0xF7 => 83, 0xF8 => 85, 0xF9 => 87, |
| 117 |
0xFA => 89, 0xFB => 91, 0xFC => 93, |
| 118 |
}; |
| 119 |
$CHARSET{G94n}->{"\x50"}->{Csjis_kuO} = { # ku - 1 |
| 120 |
0xF0 => 0, 0xF1 => 2, 0xF2 => 4, 0xF3 => 12, 0xF4 => 14, |
| 121 |
0xF5 => 78, 0xF6 => 80, 0xF7 => 82, 0xF8 => 84, 0xF9 => 86, |
| 122 |
0xFA => 88, 0xFB => 90, 0xFC => 92, |
| 123 |
}; |
| 124 |
$CHARSET{G94n}->{"\x50"}->{Csjis_first} = { reverse ( |
| 125 |
%{ $CHARSET{G94n}->{"\x50"}->{Csjis_kuE} }, |
| 126 |
%{ $CHARSET{G94n}->{"\x50"}->{Csjis_kuO} }, |
| 127 |
)}; |
| 128 |
|
| 129 |
## -- Control character sets |
| 130 |
$CHARSET{C0}->{'@'}->{ucs} = 0x00; ## ISO/IEC 6429 C0 |
| 131 |
for ("\x40", "\x43", "\x44", "\x45", "\x46", "\x49", "\x4A", "\x4B", "\x4C") { |
| 132 |
$CHARSET{C0}->{$_}->{C_LS0} = "\x0F"; |
| 133 |
$CHARSET{C0}->{$_}->{C_LS1} = "\x0E"; |
| 134 |
$CHARSET{C0}->{$_}->{r_LS0} = '\x0F'; |
| 135 |
$CHARSET{C0}->{$_}->{r_LS1} = '\x0E'; |
| 136 |
} |
| 137 |
for ("\x40", "\x44", "\x45", "\x46", "\x48", "\x4C") { |
| 138 |
$CHARSET{C0}->{$_}->{reset_all} = {"\x0A" => 1, "\x0B" => 1, |
| 139 |
"\x0C" => 1, "\x0D" => 1}; |
| 140 |
} |
| 141 |
$CHARSET{C0}->{"\x43"}->{reset_all} = {"\x0A" => 1}; |
| 142 |
$CHARSET{C0}->{"\x44"}->{C_SS2} = "\x1C"; |
| 143 |
$CHARSET{C0}->{"\x44"}->{r_SS2} = '\x1C'; |
| 144 |
for ("\x45", "\x49", "\x4A", "\x4B") { |
| 145 |
$CHARSET{C0}->{$_}->{C_SS2} = "\x19"; |
| 146 |
$CHARSET{C0}->{$_}->{C_SS3} = "\x1D"; |
| 147 |
$CHARSET{C0}->{$_}->{r_SS2} = '\x19'; |
| 148 |
$CHARSET{C0}->{$_}->{r_SS3} = '\x1D'; |
| 149 |
} |
| 150 |
$CHARSET{C0}->{"\x4C"}->{C_SS2} = "\x19"; |
| 151 |
$CHARSET{C0}->{"\x4C"}->{r_SS2} = '\x19'; |
| 152 |
|
| 153 |
$CHARSET{C1}->{'64291991C1'}->{dimension} = 1; ## ISO/IEC 6429:1991 C1 |
| 154 |
$CHARSET{C1}->{'64291991C1'}->{chars} = 32; |
| 155 |
$CHARSET{C1}->{'64291991C1'}->{ucs} = 0x80; |
| 156 |
for ("\x43", "\x45", "\x47", '64291991C1') { |
| 157 |
$CHARSET{C1}->{$_}->{C_SS2} = "\x8E"; |
| 158 |
$CHARSET{C1}->{$_}->{C_SS3} = "\x8F"; |
| 159 |
$CHARSET{C1}->{$_}->{r_SS2} = '\x8E'; |
| 160 |
$CHARSET{C1}->{$_}->{r_SS3} = '\x8F'; |
| 161 |
$CHARSET{C1}->{$_}->{r_SS2_ESC} = '\x1B\x4E'; |
| 162 |
$CHARSET{C1}->{$_}->{r_SS3_ESC} = '\x1B\x4F'; |
| 163 |
} |
| 164 |
for ("\x43", '64291991C1') { |
| 165 |
$CHARSET{C1}->{$_}->{r_CSI} = '\x9B'; |
| 166 |
$CHARSET{C1}->{$_}->{r_CSI_ESC} = '\x1B\x5B'; |
| 167 |
$CHARSET{C1}->{$_}->{r_DCS} = '\x90'; |
| 168 |
$CHARSET{C1}->{$_}->{r_ST} = '\x9C'; |
| 169 |
$CHARSET{C1}->{$_}->{r_OSC} = '\x9D'; |
| 170 |
$CHARSET{C1}->{$_}->{r_PM} = '\x9E'; |
| 171 |
$CHARSET{C1}->{$_}->{r_APC} = '\x9F'; |
| 172 |
$CHARSET{C1}->{$_}->{reset_all} = {"\x85"=>1, "\x90"=>1, |
| 173 |
"\x9C"=>1, "\x9D"=>1, "\x9E"=>1, "\x9F"=>1}; |
| 174 |
} |
| 175 |
$CHARSET{C1}->{'64291991C1'}->{r_SCI} = '\x9A'; |
| 176 |
|
| 177 |
$CHARSET{single_control}->{Fs} ={ucs => 0x70005000, chars => 32, dimension => 1}; |
| 178 |
$CHARSET{single_control}->{'3F'} ={ucs => 0x70005020, chars => 80, dimension => 1}; |
| 179 |
$CHARSET{single_control}->{'3F!'}={ucs => 0x70005070, chars => 80, dimension => 1}; |
| 180 |
$CHARSET{single_control}->{'3F"'}={ucs => 0x700050C0, chars => 80, dimension => 1}; |
| 181 |
$CHARSET{single_control}->{'3F#'}={ucs => 0x70005110, chars => 80, dimension => 1}; |
| 182 |
} |
| 183 |
|
| 184 |
&make_initial_coding_system; |
| 185 |
sub make_initial_coding_system { |
| 186 |
for (0x30..0x7E) { |
| 187 |
my $F = chr $_; |
| 188 |
$CODING_SYSTEM{$F} = {}; |
| 189 |
$CODING_SYSTEM{"\x2F".$F} = {reset_state => 1}; |
| 190 |
} |
| 191 |
$CODING_SYSTEM{Csjis} = {perl_name => 'shiftjis'}; |
| 192 |
} |
| 193 |
|
| 194 |
sub make_charset (%) { |
| 195 |
my %set = @_; |
| 196 |
my $setid = qq($set{I}$set{F}$set{revision}); |
| 197 |
my $settype = $set{type} || 'G94'; |
| 198 |
delete $set{type}, $set{I}, $set{F}, $set{revision}; |
| 199 |
$CHARSET{ $settype }->{ $setid } = \%set; |
| 200 |
} |
| 201 |
|
| 202 |
## Make a new ISO/IEC 2022-buffers object with default status |
| 203 |
sub new_object { |
| 204 |
my %C; |
| 205 |
$C{bit} = 8; |
| 206 |
$C{coding_system} = $CODING_SYSTEM{"\x40"}; ## ISO/IEC 2022 |
| 207 |
$C{CL} = 'C0'; $C{CR} = 'C1'; $C{ESC_Fe} = 'C1'; |
| 208 |
$C{C0} = $CHARSET{C0}->{"\x40"}; ## ISO/IEC 6429:1991 C0 |
| 209 |
$C{C1} = $CHARSET{C1}->{'64291991C1'}; ## ISO/IEC 6429:1991 C1 |
| 210 |
$C{GL} = 'G0'; $C{GR} = 'G1'; |
| 211 |
$C{G0} = $CHARSET{G94}->{"\x42"}; ## ISO/IEC 646:1991 IRV |
| 212 |
#$C{G1} = $CHARSET{G96}->{"\x41"}; ## ISO/IEC 8859-1 GR |
| 213 |
$C{G1} = $CHARSET{G94}->{"\x7E"}; ## empty set |
| 214 |
$C{G2} = $CHARSET{G94}->{"\x7E"}; ## empty set |
| 215 |
$C{G3} = $CHARSET{G94}->{"\x7E"}; ## empty set |
| 216 |
$C{option} = { |
| 217 |
C1invoke_to_right => 0, ## C1 invoked to: (0: ESC Fe, 1: CR) |
| 218 |
G94n_designate_long => 0, ## (1: ESC 02/04 02/08 04/00..02) |
| 219 |
designate_to => { ## Designated G buffer (-1: not to be outputed) |
| 220 |
C0 => { |
| 221 |
default => 0, |
| 222 |
}, |
| 223 |
C1 => { |
| 224 |
default => 1, |
| 225 |
}, |
| 226 |
G94 => { |
| 227 |
"\x42" => 0, |
| 228 |
default => 0, |
| 229 |
}, |
| 230 |
G96 => { |
| 231 |
default => 1, |
| 232 |
}, |
| 233 |
G94n => { |
| 234 |
default => 0, |
| 235 |
}, |
| 236 |
G96n => { |
| 237 |
default => 1, |
| 238 |
}, |
| 239 |
coding_system => { |
| 240 |
default => -1, |
| 241 |
}, |
| 242 |
}, |
| 243 |
fallback_from_ucs => 'replacement', |
| 244 |
## 'replacement' / 'perl' / 'sgml' / 'sgml-hex' / 'x-u-escaped' / 'code' |
| 245 |
## / 'quiet' / 'quiet+back' / 'quiet+warn' / 'quiet+back+warn' / 'croak' |
| 246 |
## / code |
| 247 |
final_to_set => { |
| 248 |
C0 => {}, C1 => {}, G94 => {}, G94n => {}, |
| 249 |
G96 => {}, G96n => {}, coding_system => {}, |
| 250 |
}, |
| 251 |
Ginvoke_by_single_shift => [0,0,0,0], ## Invoked by SS |
| 252 |
Ginvoke_to_left => [1,1,1,1], ## Which invoked to? (1: L, 0: R) |
| 253 |
private_set => { ## Private set vs Final byte |
| 254 |
C0 => [], |
| 255 |
C1 => [], |
| 256 |
G94 => [], |
| 257 |
G94n => [[],[],[],[],["\x20\x40"]], |
| 258 |
G96 => [], |
| 259 |
#G96n => [], ## (not implemented) |
| 260 |
U96n => [], ## mule-unicode sets |
| 261 |
XC1 => { |
| 262 |
'64291991C1' => undef, ## ISO/IEC 6429:1991 C1 |
| 263 |
}, |
| 264 |
}, |
| 265 |
reset => { ## Reset status at top of line |
| 266 |
Gdesignation => "\x42", ## F of designation or 0 |
| 267 |
Ginvoke => 1, |
| 268 |
}, |
| 269 |
undef_char => ["\x3F", {type => 'G94', charset => 'B'}], |
| 270 |
use_revision => 1, ## Output IRR |
| 271 |
}; |
| 272 |
\%C; |
| 273 |
} |
| 274 |
|
| 275 |
sub new_object_sjis { |
| 276 |
my $C = &new_object; |
| 277 |
$C->{coding_system} = $CODING_SYSTEM{Csjis}; |
| 278 |
$C->{CR} = undef; |
| 279 |
$C->{GR} = 'G2'; ## 0xA1-0xDF |
| 280 |
$C->{G0} = $CHARSET{G94}->{J}; ## JIS X 0201:1997 Latin |
| 281 |
$C->{G1} = $CHARSET{G94n}->{"\x4F"}; ## JIS X 0213:2000 |
| 282 |
$C->{G2} = $CHARSET{G94}->{I}; ## JIS X 0201:1997 Katakana |
| 283 |
$C->{G3} = $CHARSET{G94n}->{"\x50"}; ## JIS X 0213:2000 plane 2 |
| 284 |
## Special code area (0xFD-0xFF) |
| 285 |
$C->{Gsmap} = {"\xA0" => "\x{F8F0}", "\xFD" => "\x{F8F1}", "\xFE" => "\x{F8F2}", "\xFF" => "\x{F8F3}"}; |
| 286 |
$C->{GsmapR} = {}; ## Reversed table |
| 287 |
$C->{option}->{undef_char_sjis} = "\x81\xAC"; |
| 288 |
$C; |
| 289 |
} |
| 290 |
|
| 291 |
our %FallbackFromUCS = ( |
| 292 |
perl => sub { my $c = $_[1]; sprintf '\x{%04X}', ord $c }, |
| 293 |
sgml => sub { my $c = $_[1]; sprintf '&#%d;', ord $c }, |
| 294 |
'sgml-hex' => sub { my $c = $_[1]; sprintf '&#x%04X;', ord $c }, |
| 295 |
'x-u-escaped' => sub { my $c = $_[1]; my $C = ord $c; sprintf $C > 0xFFFF ? '\U%08X' : '\u%04X', $C }, |
| 296 |
); |
| 297 |
|
| 298 |
sub fallback_escape ($$;%) { |
| 299 |
my ($C, $c, %option) = @_; |
| 300 |
my $f = ref ($C->{option}->{fallback_from_ucs}) eq 'CODE' ? $C->{option}->{fallback_from_ucs} : |
| 301 |
$FallbackFromUCS{$C->{option}->{fallback_from_ucs}}; |
| 302 |
if (ref $f) { |
| 303 |
Encode::_utf8_on ($c); |
| 304 |
return &$f ($C, $c, %option); |
| 305 |
} |
| 306 |
undef; |
| 307 |
} |
| 308 |
|
| 309 |
=head1 AUTHORS |
| 310 |
|
| 311 |
Nanashi-san <nanashi-san@nanashi.invalid> |
| 312 |
|
| 313 |
Wakaba <w@suika.fam.cx> |
| 314 |
|
| 315 |
=head1 LICENSE |
| 316 |
|
| 317 |
Copyright 2002 AUTHORS, all rights reserved. |
| 318 |
|
| 319 |
This library is free software; you can redistribute it |
| 320 |
and/or modify it under the same terms as Perl itself. |
| 321 |
|
| 322 |
=cut |
| 323 |
|
| 324 |
1; # $Date: 2002/10/16 10:39:35 $ |