/[suikacvs]/perl/lib/Encode/Charset.pm
Suika

Contents of /perl/lib/Encode/Charset.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (hide annotations) (download)
Sat Dec 14 11:02:25 2002 UTC (21 years, 11 months ago) by wakaba
Branch: MAIN
Changes since 1.5: +25 -7 lines
Fallback support

1 wakaba 1.1
2     =head1 NAME
3    
4     Encode::Charset --- Coded Character Sets objects,
5     used by Encode::ISO2022, Encode::SJIS, and other modules.
6    
7     =cut
8    
9     package Encode::Charset;
10     use strict;
11     use vars qw(%CHARSET %CODING_SYSTEM $VERSION);
12 wakaba 1.6 $VERSION=do{my @r=(q$Revision: 1.5 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
13 wakaba 1.1
14     ## --- Make initial charset definitions
15     &_make_initial_charsets;
16     sub _make_initial_charsets () {
17     for my $f (0x30..0x7E) {
18     my $F = pack 'C', $f;
19     for ('', '!', '"', '#') {
20     $CHARSET{G94}->{ $_.$F }->{dimension} = 1;
21     $CHARSET{G94}->{ $_.$F }->{chars} = 94;
22     $CHARSET{G94}->{ $_.$F }->{ucs} =
23     {'' => 0xE90940, '!' => 0xE944A0, '"' => 0xE98000, '#' => 0xE9BB60}->{ $_ }
24     + 94 * ($f-0x30);
25    
26     $CHARSET{G96}->{ $_.$F }->{dimension} = 1;
27     $CHARSET{G96}->{ $_.$F }->{chars} = 96;
28     $CHARSET{G96}->{ $_.$F }->{ucs} =
29     {'' => 0xE926A0, '!' => 0xE96200, '"' => 0xE99D60, '#' => 0xE9D8C0}->{ $_ }
30     + 96 * ($f-0x30);
31    
32     $CHARSET{C0}->{ $_.$F }->{dimension} = 1;
33     $CHARSET{C0}->{ $_.$F }->{chars} = 32;
34     $CHARSET{C0}->{ $_.$F }->{ucs} =
35     {'' => 0x70000000, '!' => 0x70001400,
36     '"' => 0x70002800, '#' => 0x70003C00}->{ $_ } + 32 * ($f-0x30);
37    
38     $CHARSET{C1}->{ $_.$F }->{dimension} = 1;
39     $CHARSET{C1}->{ $_.$F }->{chars} = 32;
40     $CHARSET{C1}->{ $_.$F }->{ucs} =
41     {'' => 0x70000A00, '!' => 0x70001E00,
42     '"' => 0x70003200, '#' => 0x70004600}->{ $_ } + 32 * ($f-0x30);
43    
44     $CHARSET{G94}->{ ' '.$_.$F }->{dimension} = 1; ## DRCS
45     $CHARSET{G94}->{ ' '.$_.$F }->{chars} = 94;
46     $CHARSET{G94}->{ ' '.$_.$F }->{ucs} =
47     {'' => 0x70090940, '!' => 0x700944A0,
48     '"' => 0x70098000, '#' => 0x7009BB60}->{ $_ } + 94 * ($f-0x30);
49    
50     $CHARSET{G96}->{ ' '.$_.$F }->{dimension} = 1; ## DRCS
51     $CHARSET{G96}->{ ' '.$_.$F }->{chars} = 96;
52     $CHARSET{G96}->{ ' '.$_.$F }->{ucs} =
53     {'' => 0x700926A0, '!' => 0x70096200,
54     '"' => 0x70099D60, '#' => 0x7009D8C0}->{ $_ } + 96 * ($f-0x30);
55     }
56     }
57     for my $f (0x30..0x5F, 0x7E) {
58     my $F = pack 'C', $f;
59     for ('', '!', '"', '#', ' ') {
60     $CHARSET{G94n}->{ $_.$F }->{dimension} = 2;
61     $CHARSET{G94n}->{ $_.$F }->{chars} = 94;
62     $CHARSET{G94n}->{ $_.$F }->{ucs} =
63     ({'' => 0xE9F6C0}->{ $_ }||0) + 94*94 * ($f-0x30);
64     ## BUG: 94^n sets with I byte have no mapping area
65    
66     $CHARSET{G96n}->{ $_.$F }->{dimension} = 2;
67     $CHARSET{G96n}->{ $_.$F }->{chars} = 96;
68     $CHARSET{G96n}->{ $_.$F }->{ucs} =
69     ({'' => 0xF4C000}->{ $_ }||0) + 96*96 * ($f-0x30);
70 wakaba 1.4 ## BUG: 96^n DRCSes with I byte have no mapping area
71 wakaba 1.1 }
72     }
73 wakaba 1.4 $CHARSET{G94n}->{"\x20\x40"}->{ucs} = 0x70460000; ## DRCS 94^2 04/00
74 wakaba 1.5 $CHARSET{G94n}->{P4_0} = $CHARSET{G94n}->{"\x20\x40"};
75 wakaba 1.4
76 wakaba 1.1 for (0x60..0x6F) {
77     my $F = pack 'C', $_;
78     ## BUG: 9x^3 sets have no mapping area
79     for ('', '!', '"', '#', ' ') {
80     $CHARSET{G94n}->{ $_.$F }->{dimension} = 3;
81     $CHARSET{G94n}->{ $_.$F }->{chars} = 94;
82    
83     $CHARSET{G96n}->{ $_.$F }->{dimension} = 3;
84     $CHARSET{G96n}->{ $_.$F }->{chars} = 96;
85     }
86     }
87     for (0x70..0x7D) {
88     my $F = pack 'C', $_;
89     ## BUG: 9x^4 sets have no mapping area
90     for ('', '!', '"', '#', ' ') {
91     $CHARSET{G94n}->{ $_.$F }->{dimension} = 4;
92     $CHARSET{G94n}->{ $_.$F }->{chars} = 94;
93    
94     $CHARSET{G96n}->{ $_.$F }->{dimension} = 4;
95     $CHARSET{G96n}->{ $_.$F }->{chars} = 96;
96     }
97     }
98     for my $f (0x40..0x4E) {
99     my $F = pack 'C', $f;
100     $CHARSET{G96n}->{ ' '.$F }->{dimension} = 2;
101     $CHARSET{G96n}->{ ' '.$F }->{chars} = 96;
102     $CHARSET{G96n}->{ ' '.$F }->{ucs} = 0xF0000 + 96*96*($f-0x40);
103     ## U+F0000-U+10F7FF (private) -> ESC 02/04 02/00 <I> (04/00-04/14) (DRCS)
104     }
105    
106     $CHARSET{G94}->{B}->{ucs} = 0x21; ## ASCII
107     $CHARSET{G96}->{A}->{ucs} = 0xA0; ## ISO 8859-1
108    
109     $CHARSET{G94n}->{'B@'}->{dimension} = 2; ## JIS X 0208-1990
110     $CHARSET{G94n}->{'B@'}->{chars} = 94;
111     $CHARSET{G94n}->{'B@'}->{ucs} = 0xE9F6C0 + 94*94*79;
112 wakaba 1.3
113     ## SJIS G3 mapping (JIS X 0213:2000 plane 2)
114     $CHARSET{G94n}->{"\x50"}->{Csjis_kuE} = { # ku - 1
115     0xF0 => 7, 0xF1 => 3, 0xF2 => 11, 0xF3 => 13, 0xF4 => 77,
116     0xF5 => 79, 0xF6 => 81, 0xF7 => 83, 0xF8 => 85, 0xF9 => 87,
117     0xFA => 89, 0xFB => 91, 0xFC => 93,
118     };
119     $CHARSET{G94n}->{"\x50"}->{Csjis_kuO} = { # ku - 1
120     0xF0 => 0, 0xF1 => 2, 0xF2 => 4, 0xF3 => 12, 0xF4 => 14,
121     0xF5 => 78, 0xF6 => 80, 0xF7 => 82, 0xF8 => 84, 0xF9 => 86,
122     0xFA => 88, 0xFB => 90, 0xFC => 92,
123     };
124     $CHARSET{G94n}->{"\x50"}->{Csjis_first} = { reverse (
125     %{ $CHARSET{G94n}->{"\x50"}->{Csjis_kuE} },
126     %{ $CHARSET{G94n}->{"\x50"}->{Csjis_kuO} },
127     )};
128    
129 wakaba 1.1 ## -- Control character sets
130     $CHARSET{C0}->{'@'}->{ucs} = 0x00; ## ISO/IEC 6429 C0
131     for ("\x40", "\x43", "\x44", "\x45", "\x46", "\x49", "\x4A", "\x4B", "\x4C") {
132     $CHARSET{C0}->{$_}->{C_LS0} = "\x0F";
133     $CHARSET{C0}->{$_}->{C_LS1} = "\x0E";
134     $CHARSET{C0}->{$_}->{r_LS0} = '\x0F';
135     $CHARSET{C0}->{$_}->{r_LS1} = '\x0E';
136     }
137     for ("\x40", "\x44", "\x45", "\x46", "\x48", "\x4C") {
138     $CHARSET{C0}->{$_}->{reset_all} = {"\x0A" => 1, "\x0B" => 1,
139     "\x0C" => 1, "\x0D" => 1};
140     }
141     $CHARSET{C0}->{"\x43"}->{reset_all} = {"\x0A" => 1};
142     $CHARSET{C0}->{"\x44"}->{C_SS2} = "\x1C";
143     $CHARSET{C0}->{"\x44"}->{r_SS2} = '\x1C';
144     for ("\x45", "\x49", "\x4A", "\x4B") {
145     $CHARSET{C0}->{$_}->{C_SS2} = "\x19";
146     $CHARSET{C0}->{$_}->{C_SS3} = "\x1D";
147     $CHARSET{C0}->{$_}->{r_SS2} = '\x19';
148     $CHARSET{C0}->{$_}->{r_SS3} = '\x1D';
149     }
150     $CHARSET{C0}->{"\x4C"}->{C_SS2} = "\x19";
151     $CHARSET{C0}->{"\x4C"}->{r_SS2} = '\x19';
152    
153     $CHARSET{C1}->{'64291991C1'}->{dimension} = 1; ## ISO/IEC 6429:1991 C1
154     $CHARSET{C1}->{'64291991C1'}->{chars} = 32;
155     $CHARSET{C1}->{'64291991C1'}->{ucs} = 0x80;
156     for ("\x43", "\x45", "\x47", '64291991C1') {
157     $CHARSET{C1}->{$_}->{C_SS2} = "\x8E";
158     $CHARSET{C1}->{$_}->{C_SS3} = "\x8F";
159     $CHARSET{C1}->{$_}->{r_SS2} = '\x8E';
160     $CHARSET{C1}->{$_}->{r_SS3} = '\x8F';
161     $CHARSET{C1}->{$_}->{r_SS2_ESC} = '\x1B\x4E';
162     $CHARSET{C1}->{$_}->{r_SS3_ESC} = '\x1B\x4F';
163     }
164     for ("\x43", '64291991C1') {
165     $CHARSET{C1}->{$_}->{r_CSI} = '\x9B';
166     $CHARSET{C1}->{$_}->{r_CSI_ESC} = '\x1B\x5B';
167     $CHARSET{C1}->{$_}->{r_DCS} = '\x90';
168     $CHARSET{C1}->{$_}->{r_ST} = '\x9C';
169     $CHARSET{C1}->{$_}->{r_OSC} = '\x9D';
170     $CHARSET{C1}->{$_}->{r_PM} = '\x9E';
171     $CHARSET{C1}->{$_}->{r_APC} = '\x9F';
172     $CHARSET{C1}->{$_}->{reset_all} = {"\x85"=>1, "\x90"=>1,
173     "\x9C"=>1, "\x9D"=>1, "\x9E"=>1, "\x9F"=>1};
174     }
175     $CHARSET{C1}->{'64291991C1'}->{r_SCI} = '\x9A';
176    
177     $CHARSET{single_control}->{Fs} ={ucs => 0x70005000, chars => 32, dimension => 1};
178     $CHARSET{single_control}->{'3F'} ={ucs => 0x70005020, chars => 80, dimension => 1};
179     $CHARSET{single_control}->{'3F!'}={ucs => 0x70005070, chars => 80, dimension => 1};
180     $CHARSET{single_control}->{'3F"'}={ucs => 0x700050C0, chars => 80, dimension => 1};
181     $CHARSET{single_control}->{'3F#'}={ucs => 0x70005110, chars => 80, dimension => 1};
182     }
183    
184     &make_initial_coding_system;
185     sub make_initial_coding_system {
186     for (0x30..0x7E) {
187     my $F = chr $_;
188     $CODING_SYSTEM{$F} = {};
189     $CODING_SYSTEM{"\x2F".$F} = {reset_state => 1};
190     }
191 wakaba 1.2 $CODING_SYSTEM{Csjis} = {perl_name => 'shiftjis'};
192 wakaba 1.1 }
193    
194     sub make_charset (%) {
195     my %set = @_;
196     my $setid = qq($set{I}$set{F}$set{revision});
197     my $settype = $set{type} || 'G94';
198     delete $set{type}, $set{I}, $set{F}, $set{revision};
199     $CHARSET{ $settype }->{ $setid } = \%set;
200     }
201    
202 wakaba 1.2 ## Make a new ISO/IEC 2022-buffers object with default status
203     sub new_object {
204     my %C;
205     $C{bit} = 8;
206 wakaba 1.3 $C{coding_system} = $CODING_SYSTEM{"\x40"}; ## ISO/IEC 2022
207 wakaba 1.2 $C{CL} = 'C0'; $C{CR} = 'C1'; $C{ESC_Fe} = 'C1';
208     $C{C0} = $CHARSET{C0}->{"\x40"}; ## ISO/IEC 6429:1991 C0
209     $C{C1} = $CHARSET{C1}->{'64291991C1'}; ## ISO/IEC 6429:1991 C1
210     $C{GL} = 'G0'; $C{GR} = 'G1';
211     $C{G0} = $CHARSET{G94}->{"\x42"}; ## ISO/IEC 646:1991 IRV
212     #$C{G1} = $CHARSET{G96}->{"\x41"}; ## ISO/IEC 8859-1 GR
213     $C{G1} = $CHARSET{G94}->{"\x7E"}; ## empty set
214     $C{G2} = $CHARSET{G94}->{"\x7E"}; ## empty set
215     $C{G3} = $CHARSET{G94}->{"\x7E"}; ## empty set
216     $C{option} = {
217     C1invoke_to_right => 0, ## C1 invoked to: (0: ESC Fe, 1: CR)
218     G94n_designate_long => 0, ## (1: ESC 02/04 02/08 04/00..02)
219 wakaba 1.5 designate_to => { ## Designated G buffer (-1: not to be outputed)
220 wakaba 1.2 C0 => {
221     default => 0,
222     },
223     C1 => {
224     default => 1,
225     },
226     G94 => {
227     "\x42" => 0,
228     default => 0,
229     },
230     G96 => {
231     default => 1,
232     },
233     G94n => {
234     default => 0,
235     },
236     G96n => {
237     default => 1,
238     },
239     coding_system => {
240     default => -1,
241     },
242     },
243 wakaba 1.6 fallback_from_ucs => 'replacement',
244     ## 'replacement' / 'perl' / 'sgml' / 'sgml-hex' / 'x-u-escaped' / 'code'
245     ## / 'quiet' / 'quiet+back' / 'quiet+warn' / 'quiet+back+warn' / 'croak'
246     ## / code
247 wakaba 1.5 final_to_set => {
248     C0 => {}, C1 => {}, G94 => {}, G94n => {},
249     G96 => {}, G96n => {}, coding_system => {},
250     },
251 wakaba 1.2 Ginvoke_by_single_shift => [0,0,0,0], ## Invoked by SS
252     Ginvoke_to_left => [1,1,1,1], ## Which invoked to? (1: L, 0: R)
253     private_set => { ## Private set vs Final byte
254     C0 => [],
255     C1 => [],
256     G94 => [],
257 wakaba 1.4 G94n => [[],[],[],[],["\x20\x40"]],
258 wakaba 1.2 G96 => [],
259     #G96n => [], ## (not implemented)
260     U96n => [], ## mule-unicode sets
261     XC1 => {
262     '64291991C1' => undef, ## ISO/IEC 6429:1991 C1
263     },
264     },
265     reset => { ## Reset status at top of line
266     Gdesignation => "\x42", ## F of designation or 0
267     Ginvoke => 1,
268     },
269     undef_char => ["\x3F", {type => 'G94', charset => 'B'}],
270     use_revision => 1, ## Output IRR
271     };
272     \%C;
273     }
274    
275 wakaba 1.3 sub new_object_sjis {
276 wakaba 1.5 my $C = &new_object;
277 wakaba 1.3 $C->{coding_system} = $CODING_SYSTEM{Csjis};
278     $C->{CR} = undef;
279     $C->{GR} = 'G2'; ## 0xA1-0xDF
280 wakaba 1.4 $C->{G0} = $CHARSET{G94}->{J}; ## JIS X 0201:1997 Latin
281     $C->{G1} = $CHARSET{G94n}->{"\x4F"}; ## JIS X 0213:2000
282 wakaba 1.3 $C->{G2} = $CHARSET{G94}->{I}; ## JIS X 0201:1997 Katakana
283     $C->{G3} = $CHARSET{G94n}->{"\x50"}; ## JIS X 0213:2000 plane 2
284 wakaba 1.4 ## Special code area (0xFD-0xFF)
285     $C->{Gsmap} = {"\xA0" => "\x{F8F0}", "\xFD" => "\x{F8F1}", "\xFE" => "\x{F8F2}", "\xFF" => "\x{F8F3}"};
286     $C->{GsmapR} = {}; ## Reversed table
287     $C->{option}->{undef_char_sjis} = "\x81\xAC";
288 wakaba 1.3 $C;
289     }
290    
291 wakaba 1.6 our %FallbackFromUCS = (
292     perl => sub { my $c = $_[1]; sprintf '\x{%04X}', ord $c },
293     sgml => sub { my $c = $_[1]; sprintf '&#%d;', ord $c },
294     'sgml-hex' => sub { my $c = $_[1]; sprintf '&#x%04X;', ord $c },
295     'x-u-escaped' => sub { my $c = $_[1]; my $C = ord $c; sprintf $C > 0xFFFF ? '\U%08X' : '\u%04X', $C },
296     );
297    
298     sub fallback_escape ($$;%) {
299     my ($C, $c, %option) = @_;
300     my $f = ref ($C->{option}->{fallback_from_ucs}) eq 'CODE' ? $C->{option}->{fallback_from_ucs} :
301     $FallbackFromUCS{$C->{option}->{fallback_from_ucs}};
302     if (ref $f) {
303     Encode::_utf8_on ($c);
304     return &$f ($C, $c, %option);
305     }
306     undef;
307     }
308 wakaba 1.1
309     =head1 AUTHORS
310    
311 wakaba 1.6 Nanashi-san <nanashi-san@nanashi.invalid>
312 wakaba 1.1
313     Wakaba <w@suika.fam.cx>
314    
315     =head1 LICENSE
316    
317 wakaba 1.6 Copyright 2002 AUTHORS, all rights reserved.
318 wakaba 1.1
319     This library is free software; you can redistribute it
320     and/or modify it under the same terms as Perl itself.
321    
322     =cut
323    
324 wakaba 1.6 1; # $Date: 2002/10/16 10:39:35 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24