/[suikacvs]/markup/html/whatpm/Whatpm/Charset/DecodeHandleOld.pm
Suika

Contents of /markup/html/whatpm/Whatpm/Charset/DecodeHandleOld.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (show annotations) (download)
Sat Sep 20 04:46:53 2008 UTC (16 years, 10 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
++ whatpm/Whatpm/Charset/ChangeLog	20 Sep 2008 04:46:38 -0000
2008-09-20  Wakaba  <wakaba@suika.fam.cx>

	* DecodeHandleOld.pm: DecodeHandle.pm revision 1.6 reintroduced
	since the latest implementation of DeocdeHandle.pm shows
	considerably bad performance when combined with
	Message::DOM::XMLParserTemp.

1 package Whatpm::Charset::DecodeHandle;
2 use strict;
3
4 my $XML_AUTO_CHARSET = q<http://suika.fam.cx/www/2006/03/xml-entity/>;
5 my $IANA_CHARSET = q<urn:x-suika-fam-cx:charset:>;
6 my $PERL_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.>;
7 my $XML_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.>;
8
9 ## ->create_decode_handle ($charset_uri, $byte_stream, $onerror)
10 sub create_decode_handle ($$$;$) {
11 my $csdef = $Whatpm::Charset::CharsetDef->{$_[1]};
12 my $obj = {
13 character_queue => [],
14 filehandle => $_[2],
15 charset => $_[1],
16 byte_buffer => '',
17 onerror => $_[3] || sub {},
18 };
19 if ($csdef->{uri}->{$XML_AUTO_CHARSET} or
20 $obj->{charset} eq $XML_AUTO_CHARSET) {
21 my $b = ''; # UTF-8 w/o BOM
22 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
23 $obj->{input_encoding} = 'UTF-8';
24 if (read $obj->{filehandle}, $b, 256) {
25 no warnings "substr";
26 no warnings "uninitialized";
27 if (substr ($b, 0, 1) eq "<") {
28 if (substr ($b, 1, 1) eq "?") { # ASCII8
29 if ($b =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
30 encoding\s*=\s*["']([^"']*)/x) {
31 $obj->{input_encoding} = $1;
32 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
33 $csdef = $Whatpm::Charset::CharsetDef->{$uri};
34 if (not $csdef->{ascii8} or $csdef->{bom_required}) {
35 $obj->{onerror}->(undef, 'charset-name-mismatch-error',
36 charset_uri => $uri,
37 charset_name => $obj->{input_encoding});
38 }
39 } else {
40 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
41 $obj->{input_encoding} = 'UTF-8';
42 }
43 if (defined $csdef->{no_bom_variant}) {
44 $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
45 }
46 } elsif (substr ($b, 1, 1) eq "\x00") {
47 if (substr ($b, 2, 2) eq "?\x00") { # ASCII16LE
48 my $c = $b; $c =~ tr/\x00//d;
49 if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
50 encoding\s*=\s*["']([^"']*)/x) {
51 $obj->{input_encoding} = $1;
52 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
53 $csdef = $Whatpm::Charset::CharsetDef->{$uri};
54 if (not $csdef->{ascii16} or $csdef->{ascii16be} or
55 $csdef->{bom_required}) {
56 $obj->{onerror}->(undef, 'charset-name-mismatch-error',
57 charset_uri => $uri,
58 charset_name => $obj->{input_encoding});
59 }
60 } else {
61 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
62 $obj->{input_encoding} = 'UTF-8';
63 }
64 if (defined $csdef->{no_bom_variant16le}) {
65 $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
66 }
67 } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
68 my $c = $b; $c =~ tr/\x00//d;
69 if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
70 encoding\s*=\s*["']([^"']*)/x) {
71 $obj->{input_encoding} = $1;
72 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
73 $csdef = $Whatpm::Charset::CharsetDef->{$uri};
74 if (not $csdef->{ascii32} or
75 $csdef->{ascii32endian1234} or
76 $csdef->{ascii32endian2143} or
77 $csdef->{ascii32endian3412} or
78 $csdef->{bom_required}) {
79 $obj->{onerror}->(undef, 'charset-name-mismatch-error',
80 charset_uri => $uri,
81 charset_name => $obj->{input_encoding});
82 }
83 } else {
84 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
85 $obj->{input_encoding} = 'UTF-8';
86 }
87 if (defined $csdef->{no_bom_variant32endian4321}) {
88 $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
89 }
90 }
91 }
92 } elsif (substr ($b, 0, 3) eq "\xEF\xBB\xBF") { # UTF8
93 $obj->{has_bom} = 1;
94 substr ($b, 0, 3) = '';
95 my $c = $b;
96 if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
97 encoding\s*=\s*["']([^"']*)/x) {
98 $obj->{input_encoding} = $1;
99 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
100 $csdef = $Whatpm::Charset::CharsetDef->{$uri};
101 if (not $csdef->{utf8_encoding_scheme} or
102 not $csdef->{bom_allowed}) {
103 $obj->{onerror}->(undef, 'charset-name-mismatch-error',
104 charset_uri => $uri,
105 charset_name => $obj->{input_encoding});
106 }
107 } else {
108 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
109 $obj->{input_encoding} = 'UTF-8';
110 }
111 if (defined $csdef->{no_bom_variant}) {
112 $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
113 }
114 } elsif (substr ($b, 0, 2) eq "\x00<") {
115 if (substr ($b, 2, 2) eq "\x00?") { # ASCII16BE
116 my $c = $b; $c =~ tr/\x00//d;
117 if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
118 encoding\s*=\s*["']([^"']*)/x) {
119 $obj->{input_encoding} = $1;
120 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
121 $csdef = $Whatpm::Charset::CharsetDef->{$uri};
122 if (not $csdef->{ascii16} or $csdef->{ascii16le} or
123 $csdef->{bom_required}) {
124 $obj->{onerror}->(undef, 'charset-name-mismatch-error',
125 charset_uri => $uri,
126 charset_name => $obj->{input_encoding});
127 }
128 } else {
129 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
130 $obj->{input_encoding} = 'UTF-8';
131 }
132 if (defined $csdef->{no_bom_variant16be}) {
133 $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
134 }
135 } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
136 my $c = $b; $c =~ tr/\x00//d;
137 if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
138 encoding\s*=\s*["']([^"']*)/x) {
139 $obj->{input_encoding} = $1;
140 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
141 $csdef = $Whatpm::Charset::CharsetDef->{$uri};
142 if (not $csdef->{ascii32} or
143 $csdef->{ascii32endian1234} or
144 $csdef->{ascii32endian2143} or
145 $csdef->{ascii32endian4321} or
146 $csdef->{bom_required}) {
147 $obj->{onerror}->(undef, 'charset-name-mismatch-error',
148 charset_uri => $uri,
149 charset_name => $obj->{input_encoding});
150 }
151 } else {
152 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
153 $obj->{input_encoding} = 'UTF-8';
154 }
155 if (defined $csdef->{no_bom_variant32endian3412}) {
156 $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
157 }
158 }
159 } elsif (substr ($b, 0, 2) eq "\xFE\xFF") {
160 if (substr ($b, 2, 2) eq "\x00<") { # ASCII16BE
161 $obj->{has_bom} = 1;
162 substr ($b, 0, 2) = '';
163 my $c = $b; $c =~ tr/\x00//d;
164 if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
165 encoding\s*=\s*["']([^"']*)/x) {
166 $obj->{input_encoding} = $1;
167 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
168 $csdef = $Whatpm::Charset::CharsetDef->{$uri};
169 if (not $csdef->{ascii16} or
170 $csdef->{ascii16le} or
171 not $csdef->{bom_allowed}) {
172 $obj->{onerror}->(undef, 'charset-name-mismatch-error',
173 charset_uri => $uri,
174 charset_name => $obj->{input_encoding});
175 }
176 } else {
177 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
178 $obj->{input_encoding} = 'UTF-16';
179 }
180 if (defined $csdef->{no_bom_variant16be}) {
181 $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
182 }
183 } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
184 $obj->{has_bom} = 1;
185 substr ($b, 0, 4) = '';
186 my $c = $b; $c =~ tr/\x00//d;
187 if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
188 encoding\s*=\s*["']([^"']*)/x) {
189 $obj->{input_encoding} = $1;
190 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
191 $csdef = $Whatpm::Charset::CharsetDef->{$uri};
192 if (not $csdef->{ascii32} or
193 $csdef->{ascii32endian1234} or
194 $csdef->{ascii32endian2143} or
195 $csdef->{ascii32endian4321} or
196 not $csdef->{bom_allowed}) {
197 $obj->{onerror}->(undef, 'charset-name-mismatch-error',
198 charset_uri => $uri,
199 charset_name => $obj->{input_encoding});
200 }
201 } else {
202 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
203 $obj->{input_encoding} = 'UTF-16';
204 $obj->{byte_buffer} .= "\x00\x00";
205 }
206 if (defined $csdef->{no_bom_variant32endian3412}) {
207 $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
208 }
209 } else {
210 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
211 $obj->{input_encoding} = 'UTF-16';
212 substr ($b, 0, 2) = '';
213 $obj->{has_bom} = 1;
214 }
215 } elsif (substr ($b, 0, 2) eq "\xFF\xFE") {
216 if (substr ($b, 2, 2) eq "<\x00") { # ASCII16LE
217 $obj->{has_bom} = 1;
218 substr ($b, 0, 2) = '';
219 my $c = $b; $c =~ tr/\x00//d;
220 if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
221 encoding\s*=\s*["']([^"']*)/x) {
222 $obj->{input_encoding} = $1;
223 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
224 $csdef = $Whatpm::Charset::CharsetDef->{$uri};
225 if (not $csdef->{ascii16} or
226 $csdef->{ascii16be} or
227 not $csdef->{bom_allowed}) {
228 $obj->{onerror}->(undef, 'charset-name-mismatch-error',
229 charset_uri => $uri,
230 charset_name => $obj->{input_encoding});
231 }
232 } else {
233 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
234 $obj->{input_encoding} = 'UTF-16';
235 }
236 if (defined $csdef->{no_bom_variant16le}) {
237 $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
238 }
239 } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
240 $obj->{has_bom} = 1;
241 substr ($b, 0, 4) = '';
242 my $c = $b; $c =~ tr/\x00//d;
243 if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
244 encoding\s*=\s*["']([^"']*)/x) {
245 $obj->{input_encoding} = $1;
246 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
247 $csdef = $Whatpm::Charset::CharsetDef->{$uri};
248 if (not $csdef->{ascii32} or
249 $csdef->{ascii32endian1234} or
250 $csdef->{ascii32endian2143} or
251 $csdef->{ascii32endian3412} or
252 not $csdef->{bom_allowed}) {
253 $obj->{onerror}->(undef, 'charset-name-mismatch-error',
254 charset_uri => $uri,
255 charset_name => $obj->{input_encoding});
256 }
257 } else {
258 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
259 $obj->{input_encoding} = 'UTF-16';
260 $obj->{byte_buffer} .= "\x00\x00";
261 }
262 if (defined $csdef->{no_bom_variant32endian4321}) {
263 $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
264 }
265 } else {
266 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
267 $obj->{input_encoding} = 'UTF-16';
268 substr ($b, 0, 2) = '';
269 $obj->{has_bom} = 1;
270 }
271 } elsif (substr ($b, 0, 2) eq "\x00\x00") {
272 if (substr ($b, 2, 2) eq "\x00<") { # ASCII32Endian1234
273 my $c = $b; $c =~ tr/\x00//d;
274 if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
275 encoding\s*=\s*["']([^"']*)/x) {
276 $obj->{input_encoding} = $1;
277 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
278 $csdef = $Whatpm::Charset::CharsetDef->{$uri};
279 if (not $csdef->{ascii32} or
280 $csdef->{ascii32endian2143} or
281 $csdef->{ascii32endian3412} or
282 $csdef->{ascii32endian4321} or
283 $csdef->{bom_required}) {
284 $obj->{onerror}->(undef, 'charset-name-mismatch-error',
285 charset_uri => $uri,
286 charset_name => $obj->{input_encoding});
287 }
288 } else {
289 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
290 $obj->{input_encoding} = 'UTF-8';
291 }
292 if (defined $csdef->{no_bom_variant32endian1234}) {
293 $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
294 }
295 } elsif (substr ($b, 2, 2) eq "<\x00") { # ASCII32Endian2143
296 my $c = $b; $c =~ tr/\x00//d;
297 if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
298 encoding\s*=\s*["']([^"']*)/x) {
299 $obj->{input_encoding} = $1;
300 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
301 $csdef = $Whatpm::Charset::CharsetDef->{$uri};
302 if (not $csdef->{ascii32} or
303 $csdef->{ascii32endian1234} or
304 $csdef->{ascii32endian3412} or
305 $csdef->{ascii32endian4321} or
306 $csdef->{bom_required}) {
307 $obj->{onerror}->(undef, 'charset-name-mismatch-error',
308 charset_uri => $uri,
309 charset_name => $obj->{input_encoding});
310 }
311 } else {
312 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
313 $obj->{input_encoding} = 'UTF-8';
314 }
315 if (defined $csdef->{no_bom_variant32endian2143}) {
316 $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
317 }
318 } elsif (substr ($b, 2, 2) eq "\xFE\xFF") { # ASCII32Endian1234
319 $obj->{has_bom} = 1;
320 substr ($b, 0, 4) = '';
321 my $c = $b; $c =~ tr/\x00//d;
322 if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
323 encoding\s*=\s*["']([^"']*)/x) {
324 $obj->{input_encoding} = $1;
325 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
326 $csdef = $Whatpm::Charset::CharsetDef->{$uri};
327 if (not $csdef->{ascii32} or
328 $csdef->{ascii32endian2143} or
329 $csdef->{ascii32endian3412} or
330 $csdef->{ascii32endian4321} or
331 $csdef->{bom_required}) {
332 $obj->{onerror}->(undef, 'charset-name-mismatch-error',
333 charset_uri => $uri,
334 charset_name => $obj->{input_encoding});
335 }
336 } else {
337 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
338 $obj->{input_encoding} = 'UTF-8';
339 $obj->{has_bom} = 0;
340 $obj->{byte_buffer} .= "\x00\x00\xFE\xFF";
341 }
342 if (defined $csdef->{no_bom_variant32endian1234}) {
343 $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
344 }
345 } elsif (substr ($b, 2, 2) eq "\xFF\xFE") { # ASCII32Endian2143
346 $obj->{has_bom} = 1;
347 substr ($b, 0, 4) = '';
348 my $c = $b; $c =~ tr/\x00//d;
349 if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
350 encoding\s*=\s*["']([^"']*)/x) {
351 $obj->{input_encoding} = $1;
352 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
353 $csdef = $Whatpm::Charset::CharsetDef->{$uri};
354 if (not $csdef->{ascii32} or
355 $csdef->{ascii32endian1234} or
356 $csdef->{ascii32endian3412} or
357 $csdef->{ascii32endian4321} or
358 $csdef->{bom_required}) {
359 $obj->{onerror}->(undef, 'charset-name-mismatch-error',
360 charset_uri => $uri,
361 charset_name => $obj->{input_encoding});
362 }
363 } else {
364 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
365 $obj->{input_encoding} = 'UTF-8';
366 $obj->{has_bom} = 0;
367 $obj->{byte_buffer} .= "\x00\x00\xFF\xFE";
368 }
369 if (defined $csdef->{no_bom_variant32endian2143}) {
370 $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
371 }
372 }
373 # \x4C\x6F\xA7\x94 EBCDIC
374 } # buffer
375 $obj->{byte_buffer} .= $b;
376 } # read
377 } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-8'}) {
378 ## BOM is optional.
379 my $b = '';
380 if (read $obj->{filehandle}, $b, 3) {
381 if ($b eq "\xEF\xBB\xBF") {
382 $obj->{has_bom} = 1;
383 } else {
384 $obj->{byte_buffer} .= $b;
385 }
386 }
387 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; # UTF-8 w/o BOM
388 } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-16'}) {
389 ## BOM is mandated.
390 my $b = '';
391 if (read $obj->{filehandle}, $b, 2) {
392 if ($b eq "\xFE\xFF") {
393 $obj->{has_bom} = 1; # UTF-16BE w/o BOM
394 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
395 } elsif ($b eq "\xFF\xFE") {
396 $obj->{has_bom} = 1; # UTF-16LE w/o BOM
397 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
398 } else {
399 $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
400 $obj->{has_bom} = 0;
401 $obj->{byte_buffer} .= $b; # UTF-16BE w/o BOM
402 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
403 }
404 } else {
405 $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
406 $obj->{has_bom} = 0; # UTF-16BE w/o BOM
407 $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
408 }
409 }
410
411 if ($csdef->{uri}->{$XML_CHARSET.'iso-2022-jp'}) {
412 $obj->{state_2440} = 'gl-jis-1997-swapped';
413 $obj->{state_2442} = 'gl-jis-1997';
414 $obj->{state} = 'state_2842';
415 require Encode::GLJIS1997Swapped;
416 require Encode::GLJIS1997;
417 if (Encode::find_encoding ($obj->{state_2440}) and
418 Encode::find_encoding ($obj->{state_2442})) {
419 return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
420 }
421 } elsif ($csdef->{uri}->{$IANA_CHARSET.'iso-2022-jp'}) {
422 $obj->{state_2440} = 'gl-jis-1978';
423 $obj->{state_2442} = 'gl-jis-1983';
424 $obj->{state} = 'state_2842';
425 require Encode::GLJIS1978;
426 require Encode::GLJIS1983;
427 if (Encode::find_encoding ($obj->{state_2440}) and
428 Encode::find_encoding ($obj->{state_2442})) {
429 return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
430 }
431 } elsif (defined $csdef->{perl_name}->[0]) {
432 if ($csdef->{uri}->{$XML_CHARSET.'euc-jp'} or
433 $csdef->{uri}->{$IANA_CHARSET.'euc-jp'}) {
434 $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
435 require Encode::EUCJP1997;
436 if (Encode::find_encoding ($obj->{perl_encoding_name})) {
437 return bless $obj, 'Whatpm::Charset::DecodeHandle::EUCJP';
438 }
439 } elsif ($csdef->{uri}->{$XML_CHARSET.'shift_jis'} or
440 $csdef->{uri}->{$IANA_CHARSET.'shift_jis'}) {
441 $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
442 require Encode::ShiftJIS1997;
443 if (Encode::find_encoding ($obj->{perl_encoding_name})) {
444 return bless $obj, 'Whatpm::Charset::DecodeHandle::ShiftJIS';
445 }
446 } elsif ($csdef->{is_block_safe}) {
447 $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
448 require Encode;
449 if (Encode::find_encoding ($obj->{perl_encoding_name})) {
450 return bless $obj, 'Whatpm::Charset::DecodeHandle::Encode';
451 }
452 }
453 }
454
455 $obj->{onerror}->(undef, 'charset-not-supported-error',
456 charset_uri => $obj->{charset});
457 return undef;
458 } # create_decode_handle
459
460 sub name_to_uri ($$$) {
461 my $domain = $_[1];
462 my $name = lc $_[2];
463
464 if ($domain eq 'ietf') {
465 return $IANA_CHARSET . $name;
466 } elsif ($domain eq 'xml') {
467 if ({
468 'utf-8' => 1,
469 'utf-16' => 1,
470 'iso-10646-ucs-2' => 1,
471 'iso-10646-ucs-4' => 1,
472 'iso-8859-1' => 1,
473 'iso-8859-2' => 1,
474 'iso-8859-3' => 1,
475 'iso-8859-4' => 1,
476 'iso-8859-5' => 1,
477 'iso-8859-6' => 1,
478 'iso-8859-7' => 1,
479 'iso-8859-8' => 1,
480 'iso-8859-9' => 1,
481 'iso-8859-10' => 1,
482 'iso-8859-11' => 1,
483 'iso-8859-13' => 1,
484 'iso-8859-14' => 1,
485 'iso-8859-15' => 1,
486 'iso-8859-16' => 1,
487 'iso-2022-jp' => 1,
488 'shift_jis' => 1,
489 'euc-jp' => 1,
490 }->{$name}) {
491 return $XML_CHARSET . $name;
492 }
493
494 my $uri = $IANA_CHARSET . $name;
495 return $uri if $Whatpm::Charset::CharsetDef->{$uri};
496
497 return $XML_CHARSET . $name;
498 } else {
499 return undef;
500 }
501 } # name_to_uri
502
503 sub uri_to_name ($$$) {
504 my (undef, $domain, $uri) = @_;
505
506 if ($domain eq 'xml') {
507 my $v = $Whatpm::Charset::CharsetDef->{$uri}->{xml_name};
508 return $v if defined $v;
509
510 if (substr ($uri, 0, length $XML_CHARSET) eq $XML_CHARSET) {
511 return substr ($uri, length $XML_CHARSET);
512 }
513
514 $domain = 'ietf'; ## TODO: XML encoding name has smaller range
515 }
516
517 if ($domain eq 'ietf') {
518 my $v = $Whatpm::Charset::CharsetDef->{$uri}->{iana_name};
519 return $v->[0] if defined $v;
520
521 if (substr ($uri, 0, length $IANA_CHARSET) eq $IANA_CHARSET) {
522 return substr ($uri, length $IANA_CHARSET);
523 }
524 }
525
526 return undef;
527 } # uri_to_name
528
529 require IO::Handle;
530
531 package Whatpm::Charset::DecodeHandle::ByteBuffer;
532
533 sub new ($$) {
534 my $self = bless {
535 buffer => '',
536 }, shift;
537 $self->{filehandle} = shift;
538 return $self;
539 } # new
540
541 sub read {
542 my $self = shift;
543 my $pos = length $self->{buffer};
544 my $r = $self->{filehandle}->read ($self->{buffer}, $_[1], $pos);
545 substr ($_[0], $_[2]) = substr ($self->{buffer}, $pos);
546 return $r;
547 } # read
548
549 sub close { $_[0]->{filehandle}->close }
550
551 package Whatpm::Charset::DecodeHandle::Encode;
552
553 sub charset ($) { $_[0]->{charset} }
554
555 sub close ($) { $_[0]->{filehandle}->close }
556
557 sub getc ($) {
558 my $self = $_[0];
559 return shift @{$self->{character_queue}} if @{$self->{character_queue}};
560
561 my $error;
562 if ($self->{continue}) {
563 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
564 length $self->{byte_buffer})) {
565 #
566 } else {
567 $error = 1;
568 }
569 $self->{continue} = 0;
570 } elsif (512 > length $self->{byte_buffer}) {
571 $self->{filehandle}->read ($self->{byte_buffer}, 256,
572 length $self->{byte_buffer});
573 }
574
575 my $r;
576 unless ($error) {
577 if (not $self->{bom_checked}) {
578 if (defined $self->{bom_pattern}) {
579 if ($self->{byte_buffer} =~ s/^$self->{bom_pattern}//) {
580 $self->{has_bom} = 1;
581 }
582 }
583 $self->{bom_checked} = 1;
584 }
585
586 my $string = Encode::decode ($self->{perl_encoding_name},
587 $self->{byte_buffer},
588 Encode::FB_QUIET ());
589 if (length $string) {
590 push @{$self->{character_queue}}, split //, $string;
591 $r = shift @{$self->{character_queue}};
592 if (length $self->{byte_buffer}) {
593 $self->{continue} = 1;
594 }
595 } else {
596 if (length $self->{byte_buffer}) {
597 $error = 1;
598 } else {
599 $r = undef;
600 }
601 }
602 }
603
604 if ($error) {
605 $r = substr $self->{byte_buffer}, 0, 1, '';
606 my $fallback = $self->{fallback}->{$r};
607 if (defined $fallback) {
608 ## NOTE: This is an HTML5 parse error. Applied to Web ISO-8859-1
609 ## and Web ISO-8859-11 encodings.
610 $self->{onerror}->($self, 'fallback-char-error', octets => \$r,
611 char => \$fallback,
612 level => $self->{must_level});
613 return $fallback;
614 } else {
615 $self->{onerror}->($self, 'illegal-octets-error', octets => \$r);
616 }
617 }
618
619 return $r;
620 } # getc
621
622 sub has_bom ($) { $_[0]->{has_bom} }
623
624 sub input_encoding ($) {
625 my $v = $_[0]->{input_encoding};
626 return $v if defined $v;
627
628 my $uri = $_[0]->{charset};
629 if (defined $uri) {
630 return Whatpm::Charset::DecodeHandle->uri_to_name (xml => $uri);
631 }
632
633 return undef;
634 } # input_encoding
635
636 sub onerror ($;$) {
637 if (@_ > 1) {
638 $_[0]->{onerror} = $_[1];
639 }
640
641 return $_[0]->{onerror};
642 } # onerror
643
644 sub ungetc ($$) {
645 unshift @{$_[0]->{character_queue}}, chr int ($_[1] or 0);
646 } # ungetc
647
648 package Whatpm::Charset::DecodeHandle::EUCJP;
649 push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
650
651 sub getc ($) {
652 my $self = $_[0];
653 return shift @{$self->{character_queue}} if @{$self->{character_queue}};
654
655 my $error;
656 if ($self->{continue}) {
657 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
658 length $self->{byte_buffer})) {
659 #
660 } else {
661 $error = 1;
662 }
663 $self->{continue} = 0;
664 } elsif (512 > length $self->{byte_buffer}) {
665 $self->{filehandle}->read ($self->{byte_buffer}, 256,
666 length $self->{byte_buffer});
667 }
668
669 my $r;
670 unless ($error) {
671 my $string = Encode::decode ($self->{perl_encoding_name},
672 $self->{byte_buffer},
673 Encode::FB_QUIET ());
674 if (length $string) {
675 push @{$self->{character_queue}}, split //, $string;
676 $r = shift @{$self->{character_queue}};
677 if (length $self->{byte_buffer}) {
678 $self->{continue} = 1;
679 }
680 } else {
681 if (length $self->{byte_buffer}) {
682 $error = 1;
683 } else {
684 $r = undef;
685 }
686 }
687 }
688
689 if ($error) {
690 $r = substr $self->{byte_buffer}, 0, 1, '';
691 my $etype = 'illegal-octets-error';
692 if ($r =~ /^[\xA1-\xFE]/) {
693 if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
694 $r .= $1;
695 $etype = 'unassigned-code-point-error';
696 }
697 } elsif ($r eq "\x8F") {
698 if ($self->{byte_buffer} =~ s/^([\xA1-\xFE][\xA1-\xFE]?)//) {
699 $r .= $1;
700 $etype = 'unassigned-code-point-error' if length $1 == 2;
701 }
702 } elsif ($r eq "\x8E") {
703 if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
704 $r .= $1;
705 $etype = 'unassigned-code-point-error';
706 }
707 } elsif ($r eq "\xA0" or $r eq "\xFF") {
708 $etype = 'unassigned-code-point-error';
709 }
710 $self->{onerror}->($self, $etype, octets => \$r);
711 }
712
713 return $r;
714 } # getc
715
716 package Whatpm::Charset::DecodeHandle::ISO2022JP;
717 push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
718
719 sub getc ($) {
720 my $self = $_[0];
721 return shift @{$self->{character_queue}} if @{$self->{character_queue}};
722
723 my $r;
724 A: {
725 my $error;
726 if ($self->{continue}) {
727 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
728 length $self->{byte_buffer})) {
729 #
730 } else {
731 $error = 1;
732 }
733 $self->{continue} = 0;
734 } elsif (512 > length $self->{byte_buffer}) {
735 $self->{filehandle}->read ($self->{byte_buffer}, 256,
736 length $self->{byte_buffer});
737 }
738
739 unless ($error) {
740 if ($self->{byte_buffer} =~ s/^\x1B(\x24[\x40\x42]|\x28[\x42\x4A])//) {
741 $self->{state} = {
742 "\x24\x40" => 'state_2440',
743 "\x24\x42" => 'state_2442',
744 "\x28\x42" => 'state_2842',
745 "\x28\x4A" => 'state_284A',
746 }->{$1};
747 redo A;
748 } elsif ($self->{state} eq 'state_2842') { # IRV
749 if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
750 push @{$self->{character_queue}}, split //, $1;
751 $r = shift @{$self->{character_queue}};
752 } else {
753 if (length $self->{byte_buffer}) {
754 $error = 1;
755 } else {
756 $r = undef;
757 }
758 }
759 } elsif ($self->{state} eq 'state_284A') { # 0201
760 if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
761 my $v = $1;
762 $v =~ tr/\x5C\x7E/\xA5\x{203E}/;
763 push @{$self->{character_queue}}, split //, $v;
764 $r = shift @{$self->{character_queue}};
765 } else {
766 if (length $self->{byte_buffer}) {
767 $error = 1;
768 } else {
769 $r = undef;
770 $self->{onerror}->($self, 'invalid-state-error',
771 state => $self->{state});
772 }
773 }
774 } elsif ($self->{state} eq 'state_2442') { # 1983
775 my $v = Encode::decode ($self->{state_2442},
776 $self->{byte_buffer},
777 Encode::FB_QUIET ());
778 if (length $v) {
779 push @{$self->{character_queue}}, split //, $v;
780 $r = shift @{$self->{character_queue}};
781 } else {
782 if (length $self->{byte_buffer}) {
783 $error = 1;
784 } else {
785 $r = undef;
786 $self->{onerror}->($self, 'invalid-state-error',
787 state => $self->{state});
788 }
789 }
790 } elsif ($self->{state} eq 'state_2440') { # 1978
791 my $v = Encode::decode ($self->{state_2440},
792 $self->{byte_buffer},
793 Encode::FB_QUIET ());
794 if (length $v) {
795 push @{$self->{character_queue}}, split //, $v;
796 $r = shift @{$self->{character_queue}};
797 } else {
798 if (length $self->{byte_buffer}) {
799 $error = 1;
800 } else {
801 $r = undef;
802 $self->{onerror}->($self, 'invalid-state-error',
803 state => $self->{state});
804 }
805 }
806 } else {
807 $error = 1;
808 }
809 }
810
811 if ($error) {
812 $r = substr $self->{byte_buffer}, 0, 1, '';
813 my $etype = 'illegal-octets-error';
814 if (($self->{state} eq 'state_2442' or
815 $self->{state} eq 'state_2440') and
816 $r =~ /^[\x21-\x7E]/ and
817 $self->{byte_buffer} =~ s/^([\x21-\x7E])//) {
818 $r .= $1;
819 $etype = 'unassigned-code-point-error';
820 } elsif ($r eq "\x1B" and
821 $self->{byte_buffer} =~ s/^\(H//) { # Old 0201
822 $r .= "(H";
823 $self->{state} = 'state_284A';
824 }
825 $self->{onerror}->($self, $etype, octets => \$r);
826 }
827 } # A
828
829 return $r;
830 } # getc
831
832 package Whatpm::Charset::DecodeHandle::ShiftJIS;
833 push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
834
835 sub getc ($) {
836 my $self = $_[0];
837 return shift @{$self->{character_queue}} if @{$self->{character_queue}};
838
839 my $error;
840 if ($self->{continue}) {
841 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
842 length $self->{byte_buffer})) {
843 #
844 } else {
845 $error = 1;
846 }
847 $self->{continue} = 0;
848 } elsif (512 > length $self->{byte_buffer}) {
849 $self->{filehandle}->read ($self->{byte_buffer}, 256,
850 length $self->{byte_buffer});
851 }
852
853 my $r;
854 unless ($error) {
855 my $string = Encode::decode ($self->{perl_encoding_name},
856 $self->{byte_buffer},
857 Encode::FB_QUIET ());
858 if (length $string) {
859 push @{$self->{character_queue}}, split //, $string;
860 $r = shift @{$self->{character_queue}};
861 if (length $self->{byte_buffer}) {
862 $self->{continue} = 1;
863 }
864 } else {
865 if (length $self->{byte_buffer}) {
866 $error = 1;
867 } else {
868 $r = undef;
869 }
870 }
871 }
872
873 if ($error) {
874 $r = substr $self->{byte_buffer}, 0, 1, '';
875 my $etype = 'illegal-octets-error';
876 if ($r =~ /^[\x81-\x9F\xE0-\xFC]/) {
877 if ($self->{byte_buffer} =~ s/(.)//s) {
878 $r .= $1; # not limited to \x40-\xFC - \x7F
879 $etype = 'unassigned-code-point-error';
880 }
881 ## NOTE: Range [\xF0-\xFC] is unassigned and may be used as a single-byte
882 ## character or as the first-byte of a double-byte character according
883 ## to JIS X 0208:1997 Appendix 1. However, the current practice is
884 ## use the range as the first-byte of double-byte characters.
885 } elsif ($r =~ /^[\x80\xA0\xFD-\xFF]/) {
886 $etype = 'unassigned-code-point-error';
887 }
888 $self->{onerror}->($self, $etype, octets => \$r);
889 }
890
891 return $r;
892 } # getc
893
894 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us-ascii'} =
895 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us'} =
896 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso646-us'} =
897 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:cp367'} =
898 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ibm367'} =
899 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1986'} =
900 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1968'} =
901 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-ir-6'} =
902 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:csascii'} =
903 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_646.irv:1991'} =
904 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ascii'} = {ascii8 =>
905 '1',
906 is_block_safe =>
907 '1',
908 ietf_name =>
909 ['ansi_x3.4-1968',
910 'ansi_x3.4-1986',
911 'ascii',
912 'cp367',
913 'csascii',
914 'ibm367',
915 'iso-ir-6',
916 'iso646-us',
917 'iso_646.irv:1991',
918 'us',
919 'us-ascii',
920 'us-ascii'],
921 mime_name =>
922 'us-ascii',
923 perl_name =>
924 ['ascii',
925 'iso-646-us',
926 'us-ascii'],
927 utf8_encoding_scheme =>
928 '1',
929 'uri',
930 {'urn:x-suika-fam-cx:charset:ansi_x3.4-1968',
931 '1',
932 'urn:x-suika-fam-cx:charset:ansi_x3.4-1986',
933 '1',
934 'urn:x-suika-fam-cx:charset:ascii',
935 '1',
936 'urn:x-suika-fam-cx:charset:cp367',
937 '1',
938 'urn:x-suika-fam-cx:charset:csascii',
939 '1',
940 'urn:x-suika-fam-cx:charset:ibm367',
941 '1',
942 'urn:x-suika-fam-cx:charset:iso-ir-6',
943 '1',
944 'urn:x-suika-fam-cx:charset:iso646-us',
945 '1',
946 'urn:x-suika-fam-cx:charset:iso_646.irv:1991',
947 '1',
948 'urn:x-suika-fam-cx:charset:us',
949 '1',
950 'urn:x-suika-fam-cx:charset:us-ascii',
951 '1'},
952 };
953
954 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl'} = {perl_name =>
955 ['ascii-ctrl'],
956 'uri',
957 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl',
958 '1'}};
959 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null'} = {perl_name =>
960 ['null'],
961 'uri',
962 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null',
963 '1'}};
964 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8'} = {ascii8 =>
965 '1',
966 bom_allowed =>
967 '1',
968 no_bom_variant =>
969 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
970 utf8_encoding_scheme =>
971 '1',
972 'uri',
973 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8',
974 '1'},
975 xml_name => 'UTF-8',
976 };
977
978 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279'} = {ascii8 =>
979 '1',
980 bom_allowed =>
981 '1',
982 no_bom_variant =>
983 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
984 utf8_encoding_scheme =>
985 '1',
986 'uri',
987 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279',
988 '1'}};
989 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8'} = {
990 ascii8 => 1,
991 is_block_safe =>
992 '1',
993 perl_name =>
994 ['utf-8'],
995 utf8_encoding_scheme =>
996 '1',
997 'uri',
998 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
999 '1'}};
1000 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-8'} = {
1001 ascii8 => 1,
1002 bom_allowed =>
1003 '1',
1004 no_bom_variant =>
1005 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
1006 ietf_name =>
1007 ['utf-8'],
1008 mime_name =>
1009 'utf-8',
1010 utf8_encoding_scheme =>
1011 '1',
1012 'uri',
1013 {'urn:x-suika-fam-cx:charset:utf-8',
1014 '1'},
1015 };
1016
1017 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8'} = {ascii8 =>
1018 '1',
1019 is_block_safe =>
1020 '1',
1021 perl_name =>
1022 ['utf8'],
1023 utf8_encoding_scheme =>
1024 '1',
1025 'uri',
1026 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1027 '1'}};
1028 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16'} = {
1029 ascii16 => 1,
1030 bom_allowed =>
1031 '1',
1032 bom_required =>
1033 '1',
1034 no_bom_variant =>
1035 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1036 no_bom_variant16be =>
1037 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1038 no_bom_variant16le =>
1039 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1040 perl_name =>
1041 ['utf-16'],
1042 'uri',
1043 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16',
1044 '1'},
1045 xml_name => 'UTF-16',
1046 };
1047
1048 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16'} = {
1049 ascii16 => 1,
1050 bom_allowed =>
1051 '1',
1052 no_bom_variant =>
1053 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1054 no_bom_variant16be =>
1055 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1056 no_bom_variant16le =>
1057 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1058 ietf_name =>
1059 ['utf-16'],
1060 mime_name =>
1061 'utf-16',
1062 'uri',
1063 {'urn:x-suika-fam-cx:charset:utf-16',
1064 '1'},
1065 };
1066
1067 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16be'} = {
1068 ascii16 => 1,
1069 ascii16be => 1,
1070 bom_allowed =>
1071 '1',
1072 no_bom_variant =>
1073 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1074 no_bom_variant16be =>
1075 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1076 ietf_name =>
1077 ['utf-16be'],
1078 mime_name =>
1079 'utf-16be',
1080 'uri',
1081 {'urn:x-suika-fam-cx:charset:utf-16be',
1082 '1'},
1083 };
1084
1085 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16le'} = {
1086 ascii16 => 1,
1087 ascii16le => 1,
1088 bom_allowed =>
1089 '1',
1090 no_bom_variant =>
1091 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1092 no_bom_variant16le =>
1093 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1094 ietf_name =>
1095 ['utf-16le'],
1096 mime_name =>
1097 'utf-16le',
1098 'uri',
1099 {'urn:x-suika-fam-cx:charset:utf-16le',
1100 '1'},
1101 };
1102
1103 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be'} = {
1104 ascii16 => 1,
1105 ascii16be => 1,
1106 is_block_safe =>
1107 '1',
1108 perl_name =>
1109 ['utf-16be'],
1110 'uri',
1111 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1112 '1'}};
1113 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le'} = {
1114 ascii16 => 1,
1115 ascii16le => 1,
1116 is_block_safe =>
1117 '1',
1118 perl_name =>
1119 ['utf-16le'],
1120 'uri',
1121 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1122 '1'}};
1123 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-2'} = {
1124 ascii16 => 1,
1125 bom_allowed =>
1126 '1',
1127 no_bom_variant =>
1128 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1129 no_bom_variant16be =>
1130 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1131 no_bom_variant16le =>
1132 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1133 ietf_name =>
1134 ['csunicode',
1135 'iso-10646-ucs-2'],
1136 mime_name =>
1137 'iso-10646-ucs-2',
1138 'uri',
1139 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2',
1140 '1',
1141 'urn:x-suika-fam-cx:charset:iso-10646-ucs-2',
1142 '1'},
1143 xml_name => 'ISO-10646-UCS-2',
1144 };
1145
1146 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be'} = {
1147 ascii16 => 1,
1148 ascii16be => 1,
1149 is_block_safe =>
1150 '1',
1151 perl_name =>
1152 ['ucs-2be'],
1153 'uri',
1154 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1155 '1'}};
1156 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le'} = {
1157 ascii16 => 1,
1158 ascii16le => 1,
1159 is_block_safe =>
1160 '1',
1161 perl_name =>
1162 ['ucs-2le'],
1163 'uri',
1164 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1165 '1'}};
1166 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-4'} = {
1167 ascii32 => 1,
1168 bom_allowed =>
1169 '1',
1170 no_bom_variant =>
1171 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1172 no_bom_variant32endian1234 =>
1173 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1174 no_bom_variant32endian4321 =>
1175 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1176 ietf_name =>
1177 ['csucs4',
1178 'iso-10646-ucs-4'],
1179 mime_name =>
1180 'iso-10646-ucs-4',
1181 'uri',
1182 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4',
1183 '1',
1184 'urn:x-suika-fam-cx:charset:iso-10646-ucs-4',
1185 '1'},
1186 xml_name => 'ISO-10646-UCS-4',
1187 };
1188
1189 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be'} = {
1190 ascii32 => 1,
1191 ascii32endian1234 => 1,
1192 is_block_safe =>
1193 '1',
1194 perl_name =>
1195 ['ucs-4be',
1196 'utf-32be'],
1197 'uri',
1198 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1199 '1'}};
1200 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le'} = {
1201 ascii32 => 1,
1202 ascii32endian4321 => 1,
1203 is_block_safe =>
1204 '1',
1205 perl_name =>
1206 ['ucs-4le',
1207 'utf-32le'],
1208 'uri',
1209 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1210 '1'}};
1211 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_8859-1:1987'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1'} = {ascii8 =>
1212 '1',
1213 is_block_safe =>
1214 '1',
1215 ietf_name =>
1216 ['cp819',
1217 'csisolatin1',
1218 'ibm819',
1219 'iso-8859-1',
1220 'iso-8859-1',
1221 'iso-ir-100',
1222 'iso_8859-1',
1223 'iso_8859-1:1987',
1224 'l1',
1225 'latin1'],
1226 mime_name =>
1227 'iso-8859-1',
1228 perl_name =>
1229 ['iso-8859-1',
1230 'latin1'],
1231 'uri',
1232 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1',
1233 '1',
1234 'urn:x-suika-fam-cx:charset:iso_8859-1:1987',
1235 '1'},
1236 xml_name => 'ISO-8859-1',
1237 };
1238
1239 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2'} = {ascii8 =>
1240 '1',
1241 is_block_safe =>
1242 '1',
1243 'uri',
1244 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2',
1245 '1'},
1246 xml_name => 'ISO-8859-2',
1247 };
1248
1249 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3'} = {ascii8 =>
1250 '1',
1251 is_block_safe =>
1252 '1',
1253 'uri',
1254 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3',
1255 '1'},
1256 xml_name => 'ISO-8859-3',
1257 };
1258
1259 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4'} = {ascii8 =>
1260 '1',
1261 is_block_safe =>
1262 '1',
1263 'uri',
1264 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4',
1265 '1'},
1266 xml_name => 'ISO-8859-4',
1267 };
1268
1269 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5'} = {ascii8 =>
1270 '1',
1271 is_block_safe =>
1272 '1',
1273 'uri',
1274 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5',
1275 '1'},
1276 xml_name => 'ISO-8859-5',
1277 };
1278
1279 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6'} = {ascii8 =>
1280 '1',
1281 is_block_safe =>
1282 '1',
1283 'uri',
1284 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6',
1285 '1'},
1286 xml_name => 'ISO-8859-6',
1287 };
1288
1289 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7'} = {ascii8 =>
1290 '1',
1291 is_block_safe =>
1292 '1',
1293 'uri',
1294 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7',
1295 '1'},
1296 xml_name => 'ISO-8859-7',
1297 };
1298
1299 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8'} = {ascii8 =>
1300 '1',
1301 is_block_safe =>
1302 '1',
1303 'uri',
1304 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8',
1305 '1'},
1306 xml_name => 'ISO-8859-8',
1307 };
1308
1309 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9'} = {ascii8 =>
1310 '1',
1311 is_block_safe =>
1312 '1',
1313 'uri',
1314 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9',
1315 '1'},
1316 xml_name => 'ISO-8859-9',
1317 };
1318
1319 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10'} = {ascii8 =>
1320 '1',
1321 is_block_safe =>
1322 '1',
1323 'uri',
1324 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10',
1325 '1'},
1326 xml_name => 'ISO-8859-10',
1327 };
1328
1329 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11'} = {ascii8 =>
1330 '1',
1331 is_block_safe =>
1332 '1',
1333 'uri',
1334 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11',
1335 '1'},
1336 xml_name => 'ISO-8859-11',
1337 };
1338
1339 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13'} = {ascii8 =>
1340 '1',
1341 is_block_safe =>
1342 '1',
1343 'uri',
1344 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13',
1345 '1'},
1346 xml_name => 'ISO-8859-13',
1347 };
1348
1349 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14'} = {ascii8 =>
1350 '1',
1351 is_block_safe =>
1352 '1',
1353 'uri',
1354 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14',
1355 '1'},
1356 xml_name => 'ISO-8859-14',
1357 };
1358
1359 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15'} = {ascii8 =>
1360 '1',
1361 is_block_safe =>
1362 '1',
1363 'uri',
1364 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15',
1365 '1'},
1366 xml_name => 'ISO-8859-15',
1367 };
1368
1369 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16'} = {ascii8 =>
1370 '1',
1371 is_block_safe =>
1372 '1',
1373 'uri',
1374 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16',
1375 '1'},
1376 xml_name => 'ISO-8859-16',
1377 };
1378
1379 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp'} = {ascii8 =>
1380 '1',
1381 'uri',
1382 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp',
1383 '1'},
1384 xml_name => 'ISO-2022-JP',
1385 };
1386
1387 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-2022-jp'} = {ascii8 =>
1388 '1',
1389 ietf_name =>
1390 ['csiso2022jp',
1391 'iso-2022-jp',
1392 'iso-2022-jp'],
1393 mime_name =>
1394 'iso-2022-jp',
1395 'uri',
1396 {'urn:x-suika-fam-cx:charset:iso-2022-jp',
1397 '1'},
1398 };
1399
1400 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp'} = {ascii8 =>
1401 '1',
1402 perl_name =>
1403 ['iso-2022-jp'],
1404 'uri',
1405 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp',
1406 '1'}};
1407 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:shift_jis'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis'} = {ascii8 =>
1408 '1',
1409 is_block_safe =>
1410 '1',
1411 ietf_name =>
1412 ['csshiftjis',
1413 'ms_kanji',
1414 'shift_jis',
1415 'shift_jis'],
1416 mime_name =>
1417 'shift_jis',
1418 perl_name =>
1419 ['shift-jis-1997'],
1420 'uri',
1421 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis',
1422 '1',
1423 'urn:x-suika-fam-cx:charset:shift_jis',
1424 '1'},
1425 xml_name => 'Shift_JIS',
1426 };
1427
1428 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis'} = {ascii8 =>
1429 '1',
1430 is_block_safe =>
1431 '1',
1432 perl_name =>
1433 ['shiftjis',
1434 'sjis'],
1435 'uri',
1436 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis',
1437 '1'}};
1438 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:euc-jp'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese'} = {ascii8 =>
1439 '1',
1440 is_block_safe =>
1441 '1',
1442 ietf_name =>
1443 ['cseucpkdfmtjapanese',
1444 'euc-jp',
1445 'euc-jp',
1446 'extended_unix_code_packed_format_for_japanese'],
1447 mime_name =>
1448 'euc-jp',
1449 perl_name =>
1450 ['euc-jp-1997'],
1451 'uri',
1452 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp',
1453 '1',
1454 'urn:x-suika-fam-cx:charset:euc-jp',
1455 '1',
1456 'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese',
1457 '1'},
1458 xml_name => 'EUC-JP',
1459 };
1460
1461 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp'} = {ascii8 =>
1462 '1',
1463 is_block_safe =>
1464 '1',
1465 perl_name =>
1466 ['euc-jp',
1467 'ujis'],
1468 'uri',
1469 {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp',
1470 '1'}};
1471
1472 1;
1473 ## $Date: 2008/05/18 06:07:22 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24