/[suikacvs]/markup/html/whatpm/Whatpm/Charset/DecodeHandle.pm
Suika

Contents of /markup/html/whatpm/Whatpm/Charset/DecodeHandle.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (hide annotations) (download)
Sun May 18 06:07:22 2008 UTC (16 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.5: +12 -2 lines
++ whatpm/Whatpm/Charset/ChangeLog	18 May 2008 06:07:15 -0000
	* WebThai.pm, WebLatin1.pm: New modules.

	* DecodeHandle.pm: Support for fallback character mappings,
	used for WebLatin1 and WebThai encodings.

2008-05-18  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::Charset::DecodeHandle;
2     use strict;
3    
4     my $XML_AUTO_CHARSET = q<http://suika.fam.cx/www/2006/03/xml-entity/>;
5     my $IANA_CHARSET = q<urn:x-suika-fam-cx:charset:>;
6     my $PERL_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.>;
7     my $XML_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.>;
8    
9     ## ->create_decode_handle ($charset_uri, $byte_stream, $onerror)
10     sub create_decode_handle ($$$;$) {
11     my $csdef = $Whatpm::Charset::CharsetDef->{$_[1]};
12     my $obj = {
13     character_queue => [],
14     filehandle => $_[2],
15     charset => $_[1],
16     byte_buffer => '',
17     onerror => $_[3] || sub {},
18     };
19     if ($csdef->{uri}->{$XML_AUTO_CHARSET} or
20     $obj->{charset} eq $XML_AUTO_CHARSET) {
21     my $b = ''; # UTF-8 w/o BOM
22     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
23 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
24 wakaba 1.1 if (read $obj->{filehandle}, $b, 256) {
25     no warnings "substr";
26     no warnings "uninitialized";
27     if (substr ($b, 0, 1) eq "<") {
28     if (substr ($b, 1, 1) eq "?") { # ASCII8
29     if ($b =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
30     encoding\s*=\s*["']([^"']*)/x) {
31 wakaba 1.2 $obj->{input_encoding} = $1;
32 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
33     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
34     if (not $csdef->{ascii8} or $csdef->{bom_required}) {
35     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
36     charset_uri => $uri,
37     charset_name => $obj->{input_encoding});
38     }
39     } else {
40     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
41 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
42 wakaba 1.1 }
43     if (defined $csdef->{no_bom_variant}) {
44     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
45     }
46     } elsif (substr ($b, 1, 1) eq "\x00") {
47     if (substr ($b, 2, 2) eq "?\x00") { # ASCII16LE
48     my $c = $b; $c =~ tr/\x00//d;
49     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
50     encoding\s*=\s*["']([^"']*)/x) {
51 wakaba 1.2 $obj->{input_encoding} = $1;
52 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
53     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
54     if (not $csdef->{ascii16} or $csdef->{ascii16be} or
55     $csdef->{bom_required}) {
56     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
57     charset_uri => $uri,
58     charset_name => $obj->{input_encoding});
59     }
60     } else {
61     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
62 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
63 wakaba 1.1 }
64     if (defined $csdef->{no_bom_variant16le}) {
65     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
66     }
67     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
68     my $c = $b; $c =~ tr/\x00//d;
69     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
70     encoding\s*=\s*["']([^"']*)/x) {
71 wakaba 1.2 $obj->{input_encoding} = $1;
72 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
73     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
74     if (not $csdef->{ascii32} or
75     $csdef->{ascii32endian1234} or
76     $csdef->{ascii32endian2143} or
77     $csdef->{ascii32endian3412} or
78     $csdef->{bom_required}) {
79     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
80     charset_uri => $uri,
81     charset_name => $obj->{input_encoding});
82     }
83     } else {
84     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
85 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
86 wakaba 1.1 }
87     if (defined $csdef->{no_bom_variant32endian4321}) {
88     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
89     }
90     }
91     }
92     } elsif (substr ($b, 0, 3) eq "\xEF\xBB\xBF") { # UTF8
93     $obj->{has_bom} = 1;
94     substr ($b, 0, 3) = '';
95     my $c = $b;
96     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
97     encoding\s*=\s*["']([^"']*)/x) {
98 wakaba 1.2 $obj->{input_encoding} = $1;
99 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
100     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
101     if (not $csdef->{utf8_encoding_scheme} or
102     not $csdef->{bom_allowed}) {
103     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
104     charset_uri => $uri,
105     charset_name => $obj->{input_encoding});
106     }
107     } else {
108     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
109 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
110 wakaba 1.1 }
111     if (defined $csdef->{no_bom_variant}) {
112     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
113     }
114     } elsif (substr ($b, 0, 2) eq "\x00<") {
115     if (substr ($b, 2, 2) eq "\x00?") { # ASCII16BE
116     my $c = $b; $c =~ tr/\x00//d;
117     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
118     encoding\s*=\s*["']([^"']*)/x) {
119 wakaba 1.2 $obj->{input_encoding} = $1;
120 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
121     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
122     if (not $csdef->{ascii16} or $csdef->{ascii16le} or
123     $csdef->{bom_required}) {
124     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
125     charset_uri => $uri,
126     charset_name => $obj->{input_encoding});
127     }
128     } else {
129     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
130 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
131 wakaba 1.1 }
132     if (defined $csdef->{no_bom_variant16be}) {
133     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
134     }
135     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
136     my $c = $b; $c =~ tr/\x00//d;
137     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
138     encoding\s*=\s*["']([^"']*)/x) {
139 wakaba 1.2 $obj->{input_encoding} = $1;
140 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
141     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
142     if (not $csdef->{ascii32} or
143     $csdef->{ascii32endian1234} or
144     $csdef->{ascii32endian2143} or
145     $csdef->{ascii32endian4321} or
146     $csdef->{bom_required}) {
147     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
148     charset_uri => $uri,
149     charset_name => $obj->{input_encoding});
150     }
151     } else {
152     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
153 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
154 wakaba 1.1 }
155     if (defined $csdef->{no_bom_variant32endian3412}) {
156     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
157     }
158     }
159     } elsif (substr ($b, 0, 2) eq "\xFE\xFF") {
160     if (substr ($b, 2, 2) eq "\x00<") { # ASCII16BE
161     $obj->{has_bom} = 1;
162     substr ($b, 0, 2) = '';
163     my $c = $b; $c =~ tr/\x00//d;
164     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
165     encoding\s*=\s*["']([^"']*)/x) {
166 wakaba 1.2 $obj->{input_encoding} = $1;
167 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
168     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
169     if (not $csdef->{ascii16} or
170     $csdef->{ascii16le} or
171     not $csdef->{bom_allowed}) {
172     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
173     charset_uri => $uri,
174     charset_name => $obj->{input_encoding});
175     }
176     } else {
177     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
178 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
179 wakaba 1.1 }
180     if (defined $csdef->{no_bom_variant16be}) {
181     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
182     }
183     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
184     $obj->{has_bom} = 1;
185     substr ($b, 0, 4) = '';
186     my $c = $b; $c =~ tr/\x00//d;
187     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
188     encoding\s*=\s*["']([^"']*)/x) {
189 wakaba 1.2 $obj->{input_encoding} = $1;
190 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
191     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
192     if (not $csdef->{ascii32} or
193     $csdef->{ascii32endian1234} or
194     $csdef->{ascii32endian2143} or
195     $csdef->{ascii32endian4321} or
196     not $csdef->{bom_allowed}) {
197     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
198     charset_uri => $uri,
199     charset_name => $obj->{input_encoding});
200     }
201     } else {
202     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
203 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
204 wakaba 1.1 $obj->{byte_buffer} .= "\x00\x00";
205     }
206     if (defined $csdef->{no_bom_variant32endian3412}) {
207     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
208     }
209     } else {
210     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
211 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
212 wakaba 1.1 substr ($b, 0, 2) = '';
213     $obj->{has_bom} = 1;
214     }
215     } elsif (substr ($b, 0, 2) eq "\xFF\xFE") {
216     if (substr ($b, 2, 2) eq "<\x00") { # ASCII16LE
217     $obj->{has_bom} = 1;
218     substr ($b, 0, 2) = '';
219     my $c = $b; $c =~ tr/\x00//d;
220     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
221     encoding\s*=\s*["']([^"']*)/x) {
222 wakaba 1.2 $obj->{input_encoding} = $1;
223 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
224     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
225     if (not $csdef->{ascii16} or
226     $csdef->{ascii16be} or
227     not $csdef->{bom_allowed}) {
228     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
229     charset_uri => $uri,
230     charset_name => $obj->{input_encoding});
231     }
232     } else {
233     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
234 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
235 wakaba 1.1 }
236     if (defined $csdef->{no_bom_variant16le}) {
237     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
238     }
239     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
240     $obj->{has_bom} = 1;
241     substr ($b, 0, 4) = '';
242     my $c = $b; $c =~ tr/\x00//d;
243     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
244     encoding\s*=\s*["']([^"']*)/x) {
245 wakaba 1.2 $obj->{input_encoding} = $1;
246 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
247     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
248     if (not $csdef->{ascii32} or
249     $csdef->{ascii32endian1234} or
250     $csdef->{ascii32endian2143} or
251     $csdef->{ascii32endian3412} or
252     not $csdef->{bom_allowed}) {
253     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
254     charset_uri => $uri,
255     charset_name => $obj->{input_encoding});
256     }
257     } else {
258     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
259 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
260 wakaba 1.1 $obj->{byte_buffer} .= "\x00\x00";
261     }
262     if (defined $csdef->{no_bom_variant32endian4321}) {
263     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
264     }
265     } else {
266     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
267 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
268 wakaba 1.1 substr ($b, 0, 2) = '';
269     $obj->{has_bom} = 1;
270     }
271     } elsif (substr ($b, 0, 2) eq "\x00\x00") {
272     if (substr ($b, 2, 2) eq "\x00<") { # ASCII32Endian1234
273     my $c = $b; $c =~ tr/\x00//d;
274     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
275     encoding\s*=\s*["']([^"']*)/x) {
276 wakaba 1.2 $obj->{input_encoding} = $1;
277 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
278     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
279     if (not $csdef->{ascii32} or
280     $csdef->{ascii32endian2143} or
281     $csdef->{ascii32endian3412} or
282     $csdef->{ascii32endian4321} or
283     $csdef->{bom_required}) {
284     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
285     charset_uri => $uri,
286     charset_name => $obj->{input_encoding});
287     }
288     } else {
289     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
290 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
291 wakaba 1.1 }
292     if (defined $csdef->{no_bom_variant32endian1234}) {
293     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
294     }
295     } elsif (substr ($b, 2, 2) eq "<\x00") { # ASCII32Endian2143
296     my $c = $b; $c =~ tr/\x00//d;
297     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
298     encoding\s*=\s*["']([^"']*)/x) {
299 wakaba 1.2 $obj->{input_encoding} = $1;
300 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
301     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
302     if (not $csdef->{ascii32} or
303     $csdef->{ascii32endian1234} or
304     $csdef->{ascii32endian3412} or
305     $csdef->{ascii32endian4321} or
306     $csdef->{bom_required}) {
307     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
308     charset_uri => $uri,
309     charset_name => $obj->{input_encoding});
310     }
311     } else {
312     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
313 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
314 wakaba 1.1 }
315     if (defined $csdef->{no_bom_variant32endian2143}) {
316     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
317     }
318     } elsif (substr ($b, 2, 2) eq "\xFE\xFF") { # ASCII32Endian1234
319     $obj->{has_bom} = 1;
320     substr ($b, 0, 4) = '';
321     my $c = $b; $c =~ tr/\x00//d;
322     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
323     encoding\s*=\s*["']([^"']*)/x) {
324 wakaba 1.2 $obj->{input_encoding} = $1;
325 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
326     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
327     if (not $csdef->{ascii32} or
328     $csdef->{ascii32endian2143} or
329     $csdef->{ascii32endian3412} or
330     $csdef->{ascii32endian4321} or
331     $csdef->{bom_required}) {
332     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
333     charset_uri => $uri,
334     charset_name => $obj->{input_encoding});
335     }
336     } else {
337     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
338 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
339 wakaba 1.1 $obj->{has_bom} = 0;
340     $obj->{byte_buffer} .= "\x00\x00\xFE\xFF";
341     }
342     if (defined $csdef->{no_bom_variant32endian1234}) {
343     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
344     }
345     } elsif (substr ($b, 2, 2) eq "\xFF\xFE") { # ASCII32Endian2143
346     $obj->{has_bom} = 1;
347     substr ($b, 0, 4) = '';
348     my $c = $b; $c =~ tr/\x00//d;
349     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
350     encoding\s*=\s*["']([^"']*)/x) {
351 wakaba 1.2 $obj->{input_encoding} = $1;
352 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
353     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
354     if (not $csdef->{ascii32} or
355     $csdef->{ascii32endian1234} or
356     $csdef->{ascii32endian3412} or
357     $csdef->{ascii32endian4321} or
358     $csdef->{bom_required}) {
359     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
360     charset_uri => $uri,
361     charset_name => $obj->{input_encoding});
362     }
363     } else {
364     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
365 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
366 wakaba 1.1 $obj->{has_bom} = 0;
367     $obj->{byte_buffer} .= "\x00\x00\xFF\xFE";
368     }
369     if (defined $csdef->{no_bom_variant32endian2143}) {
370     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
371     }
372     }
373     # \x4C\x6F\xA7\x94 EBCDIC
374     } # buffer
375     $obj->{byte_buffer} .= $b;
376     } # read
377     } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-8'}) {
378     ## BOM is optional.
379     my $b = '';
380     if (read $obj->{filehandle}, $b, 3) {
381     if ($b eq "\xEF\xBB\xBF") {
382     $obj->{has_bom} = 1;
383     } else {
384     $obj->{byte_buffer} .= $b;
385     }
386     }
387     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; # UTF-8 w/o BOM
388     } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-16'}) {
389     ## BOM is mandated.
390     my $b = '';
391     if (read $obj->{filehandle}, $b, 2) {
392     if ($b eq "\xFE\xFF") {
393     $obj->{has_bom} = 1; # UTF-16BE w/o BOM
394     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
395     } elsif ($b eq "\xFF\xFE") {
396     $obj->{has_bom} = 1; # UTF-16LE w/o BOM
397     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
398     } else {
399     $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
400     $obj->{has_bom} = 0;
401     $obj->{byte_buffer} .= $b; # UTF-16BE w/o BOM
402     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
403     }
404     } else {
405     $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
406     $obj->{has_bom} = 0; # UTF-16BE w/o BOM
407     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
408     }
409     }
410    
411     if ($csdef->{uri}->{$XML_CHARSET.'iso-2022-jp'}) {
412     $obj->{state_2440} = 'gl-jis-1997-swapped';
413     $obj->{state_2442} = 'gl-jis-1997';
414     $obj->{state} = 'state_2842';
415     require Encode::GLJIS1997Swapped;
416     require Encode::GLJIS1997;
417     if (Encode::find_encoding ($obj->{state_2440}) and
418     Encode::find_encoding ($obj->{state_2442})) {
419     return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
420     }
421     } elsif ($csdef->{uri}->{$IANA_CHARSET.'iso-2022-jp'}) {
422     $obj->{state_2440} = 'gl-jis-1978';
423     $obj->{state_2442} = 'gl-jis-1983';
424     $obj->{state} = 'state_2842';
425     require Encode::GLJIS1978;
426     require Encode::GLJIS1983;
427     if (Encode::find_encoding ($obj->{state_2440}) and
428     Encode::find_encoding ($obj->{state_2442})) {
429     return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
430     }
431     } elsif (defined $csdef->{perl_name}->[0]) {
432     if ($csdef->{uri}->{$XML_CHARSET.'euc-jp'} or
433     $csdef->{uri}->{$IANA_CHARSET.'euc-jp'}) {
434     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
435     require Encode::EUCJP1997;
436     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
437     return bless $obj, 'Whatpm::Charset::DecodeHandle::EUCJP';
438     }
439     } elsif ($csdef->{uri}->{$XML_CHARSET.'shift_jis'} or
440     $csdef->{uri}->{$IANA_CHARSET.'shift_jis'}) {
441     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
442     require Encode::ShiftJIS1997;
443     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
444     return bless $obj, 'Whatpm::Charset::DecodeHandle::ShiftJIS';
445     }
446     } elsif ($csdef->{is_block_safe}) {
447     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
448     require Encode;
449     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
450     return bless $obj, 'Whatpm::Charset::DecodeHandle::Encode';
451     }
452     }
453     }
454    
455     $obj->{onerror}->(undef, 'charset-not-supported-error',
456     charset_uri => $obj->{charset});
457     return undef;
458     } # create_decode_handle
459    
460     sub name_to_uri ($$$) {
461     my $domain = $_[1];
462     my $name = lc $_[2];
463    
464     if ($domain eq 'ietf') {
465     return $IANA_CHARSET . $name;
466     } elsif ($domain eq 'xml') {
467     if ({
468     'utf-8' => 1,
469     'utf-16' => 1,
470     'iso-10646-ucs-2' => 1,
471     'iso-10646-ucs-4' => 1,
472     'iso-8859-1' => 1,
473     'iso-8859-2' => 1,
474     'iso-8859-3' => 1,
475     'iso-8859-4' => 1,
476     'iso-8859-5' => 1,
477     'iso-8859-6' => 1,
478     'iso-8859-7' => 1,
479     'iso-8859-8' => 1,
480     'iso-8859-9' => 1,
481     'iso-8859-10' => 1,
482     'iso-8859-11' => 1,
483     'iso-8859-13' => 1,
484     'iso-8859-14' => 1,
485     'iso-8859-15' => 1,
486     'iso-8859-16' => 1,
487     'iso-2022-jp' => 1,
488     'shift_jis' => 1,
489     'euc-jp' => 1,
490     }->{$name}) {
491     return $XML_CHARSET . $name;
492     }
493    
494     my $uri = $IANA_CHARSET . $name;
495     return $uri if $Whatpm::Charset::CharsetDef->{$uri};
496    
497     return $XML_CHARSET . $name;
498     } else {
499     return undef;
500     }
501     } # name_to_uri
502    
503     sub uri_to_name ($$$) {
504     my (undef, $domain, $uri) = @_;
505 wakaba 1.2
506     if ($domain eq 'xml') {
507     my $v = $Whatpm::Charset::CharsetDef->{$uri}->{xml_name};
508     return $v if defined $v;
509    
510     if (substr ($uri, 0, length $XML_CHARSET) eq $XML_CHARSET) {
511     return substr ($uri, length $XML_CHARSET);
512     }
513    
514     $domain = 'ietf'; ## TODO: XML encoding name has smaller range
515     }
516    
517     if ($domain eq 'ietf') {
518     my $v = $Whatpm::Charset::CharsetDef->{$uri}->{iana_name};
519     return $v->[0] if defined $v;
520    
521     if (substr ($uri, 0, length $IANA_CHARSET) eq $IANA_CHARSET) {
522 wakaba 1.1 return substr ($uri, length $IANA_CHARSET);
523     }
524     }
525    
526     return undef;
527     } # uri_to_name
528    
529 wakaba 1.3 require IO::Handle;
530    
531     package Whatpm::Charset::DecodeHandle::ByteBuffer;
532    
533     sub new ($$) {
534     my $self = bless {
535     buffer => '',
536     }, shift;
537     $self->{filehandle} = shift;
538     return $self;
539     } # new
540    
541     sub read {
542     my $self = shift;
543     my $pos = length $self->{buffer};
544     my $r = $self->{filehandle}->read ($self->{buffer}, $_[1], $pos);
545     substr ($_[0], $_[2]) = substr ($self->{buffer}, $pos);
546     return $r;
547     } # read
548    
549     sub close { $_[0]->{filehandle}->close }
550    
551 wakaba 1.1 package Whatpm::Charset::DecodeHandle::Encode;
552    
553     sub charset ($) { $_[0]->{charset} }
554    
555 wakaba 1.3 sub close ($) { $_[0]->{filehandle}->close }
556 wakaba 1.1
557     sub getc ($) {
558     my $self = $_[0];
559     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
560    
561     my $error;
562     if ($self->{continue}) {
563 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
564     length $self->{byte_buffer})) {
565 wakaba 1.1 #
566     } else {
567     $error = 1;
568     }
569     $self->{continue} = 0;
570     } elsif (512 > length $self->{byte_buffer}) {
571 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
572     length $self->{byte_buffer});
573 wakaba 1.1 }
574    
575     my $r;
576     unless ($error) {
577 wakaba 1.4 if (not $self->{bom_checked}) {
578     if (defined $self->{bom_pattern}) {
579     if ($self->{byte_buffer} =~ s/^$self->{bom_pattern}//) {
580     $self->{has_bom} = 1;
581     }
582     }
583     $self->{bom_checked} = 1;
584     }
585    
586 wakaba 1.1 my $string = Encode::decode ($self->{perl_encoding_name},
587     $self->{byte_buffer},
588     Encode::FB_QUIET ());
589     if (length $string) {
590     push @{$self->{character_queue}}, split //, $string;
591     $r = shift @{$self->{character_queue}};
592     if (length $self->{byte_buffer}) {
593     $self->{continue} = 1;
594     }
595     } else {
596     if (length $self->{byte_buffer}) {
597     $error = 1;
598     } else {
599     $r = undef;
600     }
601     }
602     }
603    
604     if ($error) {
605     $r = substr $self->{byte_buffer}, 0, 1, '';
606 wakaba 1.6 my $fallback = $self->{fallback}->{$r};
607     if (defined $fallback) {
608     ## NOTE: This is an HTML5 parse error. Applied to Web ISO-8859-1
609     ## and Web ISO-8859-11 encodings.
610     $self->{onerror}->($self, 'fallback-char-error', octets => \$r,
611     char => \$fallback,
612     level => $self->{must_level});
613     return $fallback;
614     } else {
615     $self->{onerror}->($self, 'illegal-octets-error', octets => \$r);
616     }
617 wakaba 1.1 }
618    
619     return $r;
620     } # getc
621    
622     sub has_bom ($) { $_[0]->{has_bom} }
623    
624 wakaba 1.2 sub input_encoding ($) {
625     my $v = $_[0]->{input_encoding};
626     return $v if defined $v;
627    
628     my $uri = $_[0]->{charset};
629     if (defined $uri) {
630     return Whatpm::Charset::DecodeHandle->uri_to_name (xml => $uri);
631     }
632    
633     return undef;
634     } # input_encoding
635 wakaba 1.1
636     sub onerror ($;$) {
637     if (@_ > 1) {
638     $_[0]->{onerror} = $_[1];
639     }
640    
641     return $_[0]->{onerror};
642     } # onerror
643    
644     sub ungetc ($$) {
645     unshift @{$_[0]->{character_queue}}, chr int ($_[1] or 0);
646     } # ungetc
647    
648     package Whatpm::Charset::DecodeHandle::EUCJP;
649     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
650    
651     sub getc ($) {
652     my $self = $_[0];
653     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
654    
655     my $error;
656     if ($self->{continue}) {
657 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
658     length $self->{byte_buffer})) {
659 wakaba 1.1 #
660     } else {
661     $error = 1;
662     }
663     $self->{continue} = 0;
664     } elsif (512 > length $self->{byte_buffer}) {
665 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
666     length $self->{byte_buffer});
667 wakaba 1.1 }
668    
669     my $r;
670     unless ($error) {
671     my $string = Encode::decode ($self->{perl_encoding_name},
672     $self->{byte_buffer},
673     Encode::FB_QUIET ());
674     if (length $string) {
675     push @{$self->{character_queue}}, split //, $string;
676     $r = shift @{$self->{character_queue}};
677     if (length $self->{byte_buffer}) {
678     $self->{continue} = 1;
679     }
680     } else {
681     if (length $self->{byte_buffer}) {
682     $error = 1;
683     } else {
684     $r = undef;
685     }
686     }
687     }
688    
689     if ($error) {
690     $r = substr $self->{byte_buffer}, 0, 1, '';
691     my $etype = 'illegal-octets-error';
692     if ($r =~ /^[\xA1-\xFE]/) {
693     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
694     $r .= $1;
695     $etype = 'unassigned-code-point-error';
696     }
697     } elsif ($r eq "\x8F") {
698     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE][\xA1-\xFE]?)//) {
699     $r .= $1;
700     $etype = 'unassigned-code-point-error' if length $1 == 2;
701     }
702     } elsif ($r eq "\x8E") {
703     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
704     $r .= $1;
705     $etype = 'unassigned-code-point-error';
706     }
707     } elsif ($r eq "\xA0" or $r eq "\xFF") {
708     $etype = 'unassigned-code-point-error';
709     }
710     $self->{onerror}->($self, $etype, octets => \$r);
711     }
712    
713     return $r;
714     } # getc
715    
716     package Whatpm::Charset::DecodeHandle::ISO2022JP;
717     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
718    
719     sub getc ($) {
720     my $self = $_[0];
721     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
722    
723     my $r;
724     A: {
725     my $error;
726     if ($self->{continue}) {
727 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
728     length $self->{byte_buffer})) {
729 wakaba 1.1 #
730     } else {
731     $error = 1;
732     }
733     $self->{continue} = 0;
734     } elsif (512 > length $self->{byte_buffer}) {
735 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
736     length $self->{byte_buffer});
737 wakaba 1.1 }
738    
739     unless ($error) {
740     if ($self->{byte_buffer} =~ s/^\x1B(\x24[\x40\x42]|\x28[\x42\x4A])//) {
741     $self->{state} = {
742     "\x24\x40" => 'state_2440',
743     "\x24\x42" => 'state_2442',
744     "\x28\x42" => 'state_2842',
745     "\x28\x4A" => 'state_284A',
746     }->{$1};
747     redo A;
748     } elsif ($self->{state} eq 'state_2842') { # IRV
749     if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
750     push @{$self->{character_queue}}, split //, $1;
751     $r = shift @{$self->{character_queue}};
752     } else {
753     if (length $self->{byte_buffer}) {
754     $error = 1;
755     } else {
756     $r = undef;
757     }
758     }
759     } elsif ($self->{state} eq 'state_284A') { # 0201
760     if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
761     my $v = $1;
762     $v =~ tr/\x5C\x7E/\xA5\x{203E}/;
763     push @{$self->{character_queue}}, split //, $v;
764     $r = shift @{$self->{character_queue}};
765     } else {
766     if (length $self->{byte_buffer}) {
767     $error = 1;
768     } else {
769     $r = undef;
770     $self->{onerror}->($self, 'invalid-state-error',
771     state => $self->{state});
772     }
773     }
774     } elsif ($self->{state} eq 'state_2442') { # 1983
775     my $v = Encode::decode ($self->{state_2442},
776     $self->{byte_buffer},
777     Encode::FB_QUIET ());
778     if (length $v) {
779     push @{$self->{character_queue}}, split //, $v;
780     $r = shift @{$self->{character_queue}};
781     } else {
782     if (length $self->{byte_buffer}) {
783     $error = 1;
784     } else {
785     $r = undef;
786     $self->{onerror}->($self, 'invalid-state-error',
787     state => $self->{state});
788     }
789     }
790     } elsif ($self->{state} eq 'state_2440') { # 1978
791     my $v = Encode::decode ($self->{state_2440},
792     $self->{byte_buffer},
793     Encode::FB_QUIET ());
794     if (length $v) {
795     push @{$self->{character_queue}}, split //, $v;
796     $r = shift @{$self->{character_queue}};
797     } else {
798     if (length $self->{byte_buffer}) {
799     $error = 1;
800     } else {
801     $r = undef;
802     $self->{onerror}->($self, 'invalid-state-error',
803     state => $self->{state});
804     }
805     }
806     } else {
807     $error = 1;
808     }
809     }
810    
811     if ($error) {
812     $r = substr $self->{byte_buffer}, 0, 1, '';
813     my $etype = 'illegal-octets-error';
814     if (($self->{state} eq 'state_2442' or
815     $self->{state} eq 'state_2440') and
816     $r =~ /^[\x21-\x7E]/ and
817     $self->{byte_buffer} =~ s/^([\x21-\x7E])//) {
818     $r .= $1;
819     $etype = 'unassigned-code-point-error';
820     } elsif ($r eq "\x1B" and
821     $self->{byte_buffer} =~ s/^\(H//) { # Old 0201
822     $r .= "(H";
823     $self->{state} = 'state_284A';
824     }
825     $self->{onerror}->($self, $etype, octets => \$r);
826     }
827     } # A
828    
829     return $r;
830     } # getc
831    
832     package Whatpm::Charset::DecodeHandle::ShiftJIS;
833     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
834    
835     sub getc ($) {
836     my $self = $_[0];
837     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
838    
839     my $error;
840     if ($self->{continue}) {
841 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
842     length $self->{byte_buffer})) {
843 wakaba 1.1 #
844     } else {
845     $error = 1;
846     }
847     $self->{continue} = 0;
848     } elsif (512 > length $self->{byte_buffer}) {
849 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
850     length $self->{byte_buffer});
851 wakaba 1.1 }
852    
853     my $r;
854     unless ($error) {
855     my $string = Encode::decode ($self->{perl_encoding_name},
856     $self->{byte_buffer},
857     Encode::FB_QUIET ());
858     if (length $string) {
859     push @{$self->{character_queue}}, split //, $string;
860     $r = shift @{$self->{character_queue}};
861     if (length $self->{byte_buffer}) {
862     $self->{continue} = 1;
863     }
864     } else {
865     if (length $self->{byte_buffer}) {
866     $error = 1;
867     } else {
868     $r = undef;
869     }
870     }
871     }
872    
873     if ($error) {
874     $r = substr $self->{byte_buffer}, 0, 1, '';
875     my $etype = 'illegal-octets-error';
876 wakaba 1.5 if ($r =~ /^[\x81-\x9F\xE0-\xFC]/) {
877 wakaba 1.1 if ($self->{byte_buffer} =~ s/(.)//s) {
878     $r .= $1; # not limited to \x40-\xFC - \x7F
879     $etype = 'unassigned-code-point-error';
880     }
881 wakaba 1.5 ## NOTE: Range [\xF0-\xFC] is unassigned and may be used as a single-byte
882     ## character or as the first-byte of a double-byte character according
883     ## to JIS X 0208:1997 Appendix 1. However, the current practice is
884     ## use the range as the first-byte of double-byte characters.
885     } elsif ($r =~ /^[\x80\xA0\xFD-\xFF]/) {
886 wakaba 1.1 $etype = 'unassigned-code-point-error';
887     }
888     $self->{onerror}->($self, $etype, octets => \$r);
889     }
890    
891     return $r;
892     } # getc
893    
894     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us-ascii'} =
895     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us'} =
896     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso646-us'} =
897     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:cp367'} =
898     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ibm367'} =
899     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1986'} =
900     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1968'} =
901     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-ir-6'} =
902     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:csascii'} =
903     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_646.irv:1991'} =
904     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ascii'} = {ascii8 =>
905     '1',
906     is_block_safe =>
907     '1',
908     ietf_name =>
909     ['ansi_x3.4-1968',
910     'ansi_x3.4-1986',
911     'ascii',
912     'cp367',
913     'csascii',
914     'ibm367',
915     'iso-ir-6',
916     'iso646-us',
917     'iso_646.irv:1991',
918     'us',
919     'us-ascii',
920     'us-ascii'],
921     mime_name =>
922     'us-ascii',
923     perl_name =>
924     ['ascii',
925     'iso-646-us',
926     'us-ascii'],
927     utf8_encoding_scheme =>
928     '1',
929     'uri',
930     {'urn:x-suika-fam-cx:charset:ansi_x3.4-1968',
931     '1',
932     'urn:x-suika-fam-cx:charset:ansi_x3.4-1986',
933     '1',
934     'urn:x-suika-fam-cx:charset:ascii',
935     '1',
936     'urn:x-suika-fam-cx:charset:cp367',
937     '1',
938     'urn:x-suika-fam-cx:charset:csascii',
939     '1',
940     'urn:x-suika-fam-cx:charset:ibm367',
941     '1',
942     'urn:x-suika-fam-cx:charset:iso-ir-6',
943     '1',
944     'urn:x-suika-fam-cx:charset:iso646-us',
945     '1',
946     'urn:x-suika-fam-cx:charset:iso_646.irv:1991',
947     '1',
948     'urn:x-suika-fam-cx:charset:us',
949     '1',
950     'urn:x-suika-fam-cx:charset:us-ascii',
951     '1'},
952 wakaba 1.2 };
953    
954 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl'} = {perl_name =>
955     ['ascii-ctrl'],
956     'uri',
957     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl',
958     '1'}};
959     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null'} = {perl_name =>
960     ['null'],
961     'uri',
962     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null',
963     '1'}};
964     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8'} = {ascii8 =>
965     '1',
966     bom_allowed =>
967     '1',
968     no_bom_variant =>
969     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
970     utf8_encoding_scheme =>
971     '1',
972     'uri',
973     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8',
974     '1'},
975 wakaba 1.2 xml_name => 'UTF-8',
976     };
977    
978 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279'} = {ascii8 =>
979     '1',
980     bom_allowed =>
981     '1',
982     no_bom_variant =>
983     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
984     utf8_encoding_scheme =>
985     '1',
986     'uri',
987     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279',
988     '1'}};
989     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8'} = {
990     ascii8 => 1,
991     is_block_safe =>
992     '1',
993     perl_name =>
994     ['utf-8'],
995     utf8_encoding_scheme =>
996     '1',
997     'uri',
998     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
999     '1'}};
1000     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-8'} = {
1001     ascii8 => 1,
1002     bom_allowed =>
1003     '1',
1004     no_bom_variant =>
1005     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
1006     ietf_name =>
1007     ['utf-8'],
1008     mime_name =>
1009     'utf-8',
1010     utf8_encoding_scheme =>
1011     '1',
1012     'uri',
1013     {'urn:x-suika-fam-cx:charset:utf-8',
1014     '1'},
1015 wakaba 1.2 };
1016    
1017 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8'} = {ascii8 =>
1018     '1',
1019     is_block_safe =>
1020     '1',
1021     perl_name =>
1022     ['utf8'],
1023     utf8_encoding_scheme =>
1024     '1',
1025     'uri',
1026     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1027     '1'}};
1028     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16'} = {
1029     ascii16 => 1,
1030     bom_allowed =>
1031     '1',
1032     bom_required =>
1033     '1',
1034     no_bom_variant =>
1035     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1036     no_bom_variant16be =>
1037     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1038     no_bom_variant16le =>
1039     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1040     perl_name =>
1041     ['utf-16'],
1042     'uri',
1043     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16',
1044     '1'},
1045 wakaba 1.2 xml_name => 'UTF-16',
1046     };
1047    
1048 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16'} = {
1049     ascii16 => 1,
1050     bom_allowed =>
1051     '1',
1052     no_bom_variant =>
1053     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1054     no_bom_variant16be =>
1055     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1056     no_bom_variant16le =>
1057     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1058     ietf_name =>
1059     ['utf-16'],
1060     mime_name =>
1061     'utf-16',
1062     'uri',
1063     {'urn:x-suika-fam-cx:charset:utf-16',
1064     '1'},
1065 wakaba 1.2 };
1066    
1067 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16be'} = {
1068     ascii16 => 1,
1069     ascii16be => 1,
1070     bom_allowed =>
1071     '1',
1072     no_bom_variant =>
1073     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1074     no_bom_variant16be =>
1075     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1076     ietf_name =>
1077     ['utf-16be'],
1078     mime_name =>
1079     'utf-16be',
1080     'uri',
1081     {'urn:x-suika-fam-cx:charset:utf-16be',
1082     '1'},
1083 wakaba 1.2 };
1084    
1085 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16le'} = {
1086     ascii16 => 1,
1087     ascii16le => 1,
1088     bom_allowed =>
1089     '1',
1090     no_bom_variant =>
1091     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1092     no_bom_variant16le =>
1093     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1094     ietf_name =>
1095     ['utf-16le'],
1096     mime_name =>
1097     'utf-16le',
1098     'uri',
1099     {'urn:x-suika-fam-cx:charset:utf-16le',
1100     '1'},
1101 wakaba 1.2 };
1102    
1103 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be'} = {
1104     ascii16 => 1,
1105     ascii16be => 1,
1106     is_block_safe =>
1107     '1',
1108     perl_name =>
1109     ['utf-16be'],
1110     'uri',
1111     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1112     '1'}};
1113     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le'} = {
1114     ascii16 => 1,
1115     ascii16le => 1,
1116     is_block_safe =>
1117     '1',
1118     perl_name =>
1119     ['utf-16le'],
1120     'uri',
1121     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1122     '1'}};
1123     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-2'} = {
1124     ascii16 => 1,
1125     bom_allowed =>
1126     '1',
1127     no_bom_variant =>
1128     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1129     no_bom_variant16be =>
1130     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1131     no_bom_variant16le =>
1132     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1133     ietf_name =>
1134     ['csunicode',
1135     'iso-10646-ucs-2'],
1136     mime_name =>
1137     'iso-10646-ucs-2',
1138     'uri',
1139     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2',
1140     '1',
1141     'urn:x-suika-fam-cx:charset:iso-10646-ucs-2',
1142     '1'},
1143 wakaba 1.2 xml_name => 'ISO-10646-UCS-2',
1144     };
1145    
1146 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be'} = {
1147     ascii16 => 1,
1148     ascii16be => 1,
1149     is_block_safe =>
1150     '1',
1151     perl_name =>
1152     ['ucs-2be'],
1153     'uri',
1154     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1155     '1'}};
1156     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le'} = {
1157     ascii16 => 1,
1158     ascii16le => 1,
1159     is_block_safe =>
1160     '1',
1161     perl_name =>
1162     ['ucs-2le'],
1163     'uri',
1164     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1165     '1'}};
1166     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-4'} = {
1167     ascii32 => 1,
1168     bom_allowed =>
1169     '1',
1170     no_bom_variant =>
1171     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1172     no_bom_variant32endian1234 =>
1173     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1174     no_bom_variant32endian4321 =>
1175     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1176     ietf_name =>
1177     ['csucs4',
1178     'iso-10646-ucs-4'],
1179     mime_name =>
1180     'iso-10646-ucs-4',
1181     'uri',
1182     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4',
1183     '1',
1184     'urn:x-suika-fam-cx:charset:iso-10646-ucs-4',
1185     '1'},
1186 wakaba 1.2 xml_name => 'ISO-10646-UCS-4',
1187     };
1188    
1189 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be'} = {
1190     ascii32 => 1,
1191     ascii32endian1234 => 1,
1192     is_block_safe =>
1193     '1',
1194     perl_name =>
1195     ['ucs-4be',
1196     'utf-32be'],
1197     'uri',
1198     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1199     '1'}};
1200     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le'} = {
1201     ascii32 => 1,
1202     ascii32endian4321 => 1,
1203     is_block_safe =>
1204     '1',
1205     perl_name =>
1206     ['ucs-4le',
1207     'utf-32le'],
1208     'uri',
1209     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1210     '1'}};
1211     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_8859-1:1987'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1'} = {ascii8 =>
1212     '1',
1213     is_block_safe =>
1214     '1',
1215     ietf_name =>
1216     ['cp819',
1217     'csisolatin1',
1218     'ibm819',
1219     'iso-8859-1',
1220     'iso-8859-1',
1221     'iso-ir-100',
1222     'iso_8859-1',
1223     'iso_8859-1:1987',
1224     'l1',
1225     'latin1'],
1226     mime_name =>
1227     'iso-8859-1',
1228     perl_name =>
1229     ['iso-8859-1',
1230     'latin1'],
1231     'uri',
1232     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1',
1233     '1',
1234     'urn:x-suika-fam-cx:charset:iso_8859-1:1987',
1235     '1'},
1236 wakaba 1.2 xml_name => 'ISO-8859-1',
1237     };
1238    
1239 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2'} = {ascii8 =>
1240     '1',
1241     is_block_safe =>
1242     '1',
1243     'uri',
1244     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2',
1245     '1'},
1246 wakaba 1.2 xml_name => 'ISO-8859-2',
1247     };
1248    
1249 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3'} = {ascii8 =>
1250     '1',
1251     is_block_safe =>
1252     '1',
1253     'uri',
1254     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3',
1255     '1'},
1256 wakaba 1.2 xml_name => 'ISO-8859-3',
1257     };
1258    
1259 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4'} = {ascii8 =>
1260     '1',
1261     is_block_safe =>
1262     '1',
1263     'uri',
1264     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4',
1265     '1'},
1266 wakaba 1.2 xml_name => 'ISO-8859-4',
1267     };
1268    
1269 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5'} = {ascii8 =>
1270     '1',
1271     is_block_safe =>
1272     '1',
1273     'uri',
1274     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5',
1275     '1'},
1276 wakaba 1.2 xml_name => 'ISO-8859-5',
1277     };
1278    
1279 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6'} = {ascii8 =>
1280     '1',
1281     is_block_safe =>
1282     '1',
1283     'uri',
1284     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6',
1285     '1'},
1286 wakaba 1.2 xml_name => 'ISO-8859-6',
1287     };
1288    
1289 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7'} = {ascii8 =>
1290     '1',
1291     is_block_safe =>
1292     '1',
1293     'uri',
1294     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7',
1295     '1'},
1296 wakaba 1.2 xml_name => 'ISO-8859-7',
1297     };
1298    
1299 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8'} = {ascii8 =>
1300     '1',
1301     is_block_safe =>
1302     '1',
1303     'uri',
1304     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8',
1305     '1'},
1306 wakaba 1.2 xml_name => 'ISO-8859-8',
1307     };
1308    
1309 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9'} = {ascii8 =>
1310     '1',
1311     is_block_safe =>
1312     '1',
1313     'uri',
1314     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9',
1315     '1'},
1316 wakaba 1.2 xml_name => 'ISO-8859-9',
1317     };
1318    
1319 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10'} = {ascii8 =>
1320     '1',
1321     is_block_safe =>
1322     '1',
1323     'uri',
1324     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10',
1325     '1'},
1326 wakaba 1.2 xml_name => 'ISO-8859-10',
1327     };
1328    
1329 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11'} = {ascii8 =>
1330     '1',
1331     is_block_safe =>
1332     '1',
1333     'uri',
1334     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11',
1335     '1'},
1336 wakaba 1.2 xml_name => 'ISO-8859-11',
1337     };
1338    
1339 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13'} = {ascii8 =>
1340     '1',
1341     is_block_safe =>
1342     '1',
1343     'uri',
1344     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13',
1345     '1'},
1346 wakaba 1.2 xml_name => 'ISO-8859-13',
1347     };
1348    
1349 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14'} = {ascii8 =>
1350     '1',
1351     is_block_safe =>
1352     '1',
1353     'uri',
1354     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14',
1355     '1'},
1356 wakaba 1.2 xml_name => 'ISO-8859-14',
1357     };
1358    
1359 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15'} = {ascii8 =>
1360     '1',
1361     is_block_safe =>
1362     '1',
1363     'uri',
1364     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15',
1365     '1'},
1366 wakaba 1.2 xml_name => 'ISO-8859-15',
1367     };
1368    
1369 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16'} = {ascii8 =>
1370     '1',
1371     is_block_safe =>
1372     '1',
1373     'uri',
1374     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16',
1375     '1'},
1376 wakaba 1.2 xml_name => 'ISO-8859-16',
1377     };
1378    
1379 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp'} = {ascii8 =>
1380     '1',
1381     'uri',
1382     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp',
1383     '1'},
1384 wakaba 1.2 xml_name => 'ISO-2022-JP',
1385     };
1386    
1387 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-2022-jp'} = {ascii8 =>
1388     '1',
1389     ietf_name =>
1390     ['csiso2022jp',
1391     'iso-2022-jp',
1392     'iso-2022-jp'],
1393     mime_name =>
1394     'iso-2022-jp',
1395     'uri',
1396     {'urn:x-suika-fam-cx:charset:iso-2022-jp',
1397     '1'},
1398 wakaba 1.2 };
1399    
1400 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp'} = {ascii8 =>
1401     '1',
1402     perl_name =>
1403     ['iso-2022-jp'],
1404     'uri',
1405     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp',
1406     '1'}};
1407     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:shift_jis'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis'} = {ascii8 =>
1408     '1',
1409     is_block_safe =>
1410     '1',
1411     ietf_name =>
1412     ['csshiftjis',
1413     'ms_kanji',
1414     'shift_jis',
1415     'shift_jis'],
1416     mime_name =>
1417     'shift_jis',
1418     perl_name =>
1419     ['shift-jis-1997'],
1420     'uri',
1421     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis',
1422     '1',
1423     'urn:x-suika-fam-cx:charset:shift_jis',
1424     '1'},
1425 wakaba 1.2 xml_name => 'Shift_JIS',
1426     };
1427    
1428 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis'} = {ascii8 =>
1429     '1',
1430     is_block_safe =>
1431     '1',
1432     perl_name =>
1433     ['shiftjis',
1434     'sjis'],
1435     'uri',
1436     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis',
1437     '1'}};
1438     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:euc-jp'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese'} = {ascii8 =>
1439     '1',
1440     is_block_safe =>
1441     '1',
1442     ietf_name =>
1443     ['cseucpkdfmtjapanese',
1444     'euc-jp',
1445     'euc-jp',
1446     'extended_unix_code_packed_format_for_japanese'],
1447     mime_name =>
1448     'euc-jp',
1449     perl_name =>
1450     ['euc-jp-1997'],
1451     'uri',
1452     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp',
1453     '1',
1454     'urn:x-suika-fam-cx:charset:euc-jp',
1455     '1',
1456     'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese',
1457     '1'},
1458 wakaba 1.2 xml_name => 'EUC-JP',
1459     };
1460    
1461 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp'} = {ascii8 =>
1462     '1',
1463     is_block_safe =>
1464     '1',
1465     perl_name =>
1466     ['euc-jp',
1467     'ujis'],
1468     'uri',
1469     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp',
1470     '1'}};
1471    
1472     1;
1473 wakaba 1.6 ## $Date: 2008/05/18 04:15:52 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24