/[suikacvs]/markup/html/whatpm/Whatpm/Charset/DecodeHandle.pm
Suika

Contents of /markup/html/whatpm/Whatpm/Charset/DecodeHandle.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (hide annotations) (download)
Sun Jul 15 16:51:14 2007 UTC (17 years, 3 months ago) by wakaba
Branch: MAIN
Changes since 1.1: +138 -139 lines
++ whatpm/t/ChangeLog	15 Jul 2007 16:51:05 -0000
	* Charset-DecodeHandler.t: Cases for charset names has
	been changed to match with the module's implementation.

2007-07-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/Charset/ChangeLog	15 Jul 2007 16:49:53 -0000
	* DecodeHandle.pm (create_decode_handle): Set canonical
	or specified name to |input_encoding| attribute.
	(uri_to_name): Reimplemented.
	(input_encoding): Return charset name returned
	by |uri_to_name| if available.
	($CharsetDef): Property |xml_name| is now contain
	only name defined in XML specifications.

2007-07-15  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::Charset::DecodeHandle;
2     use strict;
3    
4     my $XML_AUTO_CHARSET = q<http://suika.fam.cx/www/2006/03/xml-entity/>;
5     my $IANA_CHARSET = q<urn:x-suika-fam-cx:charset:>;
6     my $PERL_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.>;
7     my $XML_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.>;
8    
9     ## ->create_decode_handle ($charset_uri, $byte_stream, $onerror)
10     sub create_decode_handle ($$$;$) {
11     my $csdef = $Whatpm::Charset::CharsetDef->{$_[1]};
12     my $obj = {
13     character_queue => [],
14     filehandle => $_[2],
15     charset => $_[1],
16     byte_buffer => '',
17     onerror => $_[3] || sub {},
18     };
19     if ($csdef->{uri}->{$XML_AUTO_CHARSET} or
20     $obj->{charset} eq $XML_AUTO_CHARSET) {
21     my $b = ''; # UTF-8 w/o BOM
22     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
23 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
24 wakaba 1.1 if (read $obj->{filehandle}, $b, 256) {
25     no warnings "substr";
26     no warnings "uninitialized";
27     if (substr ($b, 0, 1) eq "<") {
28     if (substr ($b, 1, 1) eq "?") { # ASCII8
29     if ($b =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
30     encoding\s*=\s*["']([^"']*)/x) {
31 wakaba 1.2 $obj->{input_encoding} = $1;
32 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
33     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
34     if (not $csdef->{ascii8} or $csdef->{bom_required}) {
35     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
36     charset_uri => $uri,
37     charset_name => $obj->{input_encoding});
38     }
39     } else {
40     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
41 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
42 wakaba 1.1 }
43     if (defined $csdef->{no_bom_variant}) {
44     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
45     }
46     } elsif (substr ($b, 1, 1) eq "\x00") {
47     if (substr ($b, 2, 2) eq "?\x00") { # ASCII16LE
48     my $c = $b; $c =~ tr/\x00//d;
49     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
50     encoding\s*=\s*["']([^"']*)/x) {
51 wakaba 1.2 $obj->{input_encoding} = $1;
52 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
53     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
54     if (not $csdef->{ascii16} or $csdef->{ascii16be} or
55     $csdef->{bom_required}) {
56     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
57     charset_uri => $uri,
58     charset_name => $obj->{input_encoding});
59     }
60     } else {
61     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
62 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
63 wakaba 1.1 }
64     if (defined $csdef->{no_bom_variant16le}) {
65     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
66     }
67     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
68     my $c = $b; $c =~ tr/\x00//d;
69     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
70     encoding\s*=\s*["']([^"']*)/x) {
71 wakaba 1.2 $obj->{input_encoding} = $1;
72 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
73     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
74     if (not $csdef->{ascii32} or
75     $csdef->{ascii32endian1234} or
76     $csdef->{ascii32endian2143} or
77     $csdef->{ascii32endian3412} or
78     $csdef->{bom_required}) {
79     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
80     charset_uri => $uri,
81     charset_name => $obj->{input_encoding});
82     }
83     } else {
84     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
85 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
86 wakaba 1.1 }
87     if (defined $csdef->{no_bom_variant32endian4321}) {
88     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
89     }
90     }
91     }
92     } elsif (substr ($b, 0, 3) eq "\xEF\xBB\xBF") { # UTF8
93     $obj->{has_bom} = 1;
94     substr ($b, 0, 3) = '';
95     my $c = $b;
96     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
97     encoding\s*=\s*["']([^"']*)/x) {
98 wakaba 1.2 $obj->{input_encoding} = $1;
99 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
100     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
101     if (not $csdef->{utf8_encoding_scheme} or
102     not $csdef->{bom_allowed}) {
103     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
104     charset_uri => $uri,
105     charset_name => $obj->{input_encoding});
106     }
107     } else {
108     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
109 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
110 wakaba 1.1 }
111     if (defined $csdef->{no_bom_variant}) {
112     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
113     }
114     } elsif (substr ($b, 0, 2) eq "\x00<") {
115     if (substr ($b, 2, 2) eq "\x00?") { # ASCII16BE
116     my $c = $b; $c =~ tr/\x00//d;
117     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
118     encoding\s*=\s*["']([^"']*)/x) {
119 wakaba 1.2 $obj->{input_encoding} = $1;
120 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
121     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
122     if (not $csdef->{ascii16} or $csdef->{ascii16le} or
123     $csdef->{bom_required}) {
124     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
125     charset_uri => $uri,
126     charset_name => $obj->{input_encoding});
127     }
128     } else {
129     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
130 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
131 wakaba 1.1 }
132     if (defined $csdef->{no_bom_variant16be}) {
133     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
134     }
135     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
136     my $c = $b; $c =~ tr/\x00//d;
137     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
138     encoding\s*=\s*["']([^"']*)/x) {
139 wakaba 1.2 $obj->{input_encoding} = $1;
140 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
141     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
142     if (not $csdef->{ascii32} or
143     $csdef->{ascii32endian1234} or
144     $csdef->{ascii32endian2143} or
145     $csdef->{ascii32endian4321} or
146     $csdef->{bom_required}) {
147     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
148     charset_uri => $uri,
149     charset_name => $obj->{input_encoding});
150     }
151     } else {
152     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
153 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
154 wakaba 1.1 }
155     if (defined $csdef->{no_bom_variant32endian3412}) {
156     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
157     }
158     }
159     } elsif (substr ($b, 0, 2) eq "\xFE\xFF") {
160     if (substr ($b, 2, 2) eq "\x00<") { # ASCII16BE
161     $obj->{has_bom} = 1;
162     substr ($b, 0, 2) = '';
163     my $c = $b; $c =~ tr/\x00//d;
164     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
165     encoding\s*=\s*["']([^"']*)/x) {
166 wakaba 1.2 $obj->{input_encoding} = $1;
167 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
168     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
169     if (not $csdef->{ascii16} or
170     $csdef->{ascii16le} or
171     not $csdef->{bom_allowed}) {
172     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
173     charset_uri => $uri,
174     charset_name => $obj->{input_encoding});
175     }
176     } else {
177     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
178 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
179 wakaba 1.1 }
180     if (defined $csdef->{no_bom_variant16be}) {
181     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
182     }
183     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
184     $obj->{has_bom} = 1;
185     substr ($b, 0, 4) = '';
186     my $c = $b; $c =~ tr/\x00//d;
187     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
188     encoding\s*=\s*["']([^"']*)/x) {
189 wakaba 1.2 $obj->{input_encoding} = $1;
190 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
191     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
192     if (not $csdef->{ascii32} or
193     $csdef->{ascii32endian1234} or
194     $csdef->{ascii32endian2143} or
195     $csdef->{ascii32endian4321} or
196     not $csdef->{bom_allowed}) {
197     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
198     charset_uri => $uri,
199     charset_name => $obj->{input_encoding});
200     }
201     } else {
202     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
203 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
204 wakaba 1.1 $obj->{byte_buffer} .= "\x00\x00";
205     }
206     if (defined $csdef->{no_bom_variant32endian3412}) {
207     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
208     }
209     } else {
210     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
211 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
212 wakaba 1.1 substr ($b, 0, 2) = '';
213     $obj->{has_bom} = 1;
214     }
215     } elsif (substr ($b, 0, 2) eq "\xFF\xFE") {
216     if (substr ($b, 2, 2) eq "<\x00") { # ASCII16LE
217     $obj->{has_bom} = 1;
218     substr ($b, 0, 2) = '';
219     my $c = $b; $c =~ tr/\x00//d;
220     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
221     encoding\s*=\s*["']([^"']*)/x) {
222 wakaba 1.2 $obj->{input_encoding} = $1;
223 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
224     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
225     if (not $csdef->{ascii16} or
226     $csdef->{ascii16be} or
227     not $csdef->{bom_allowed}) {
228     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
229     charset_uri => $uri,
230     charset_name => $obj->{input_encoding});
231     }
232     } else {
233     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
234 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
235 wakaba 1.1 }
236     if (defined $csdef->{no_bom_variant16le}) {
237     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
238     }
239     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
240     $obj->{has_bom} = 1;
241     substr ($b, 0, 4) = '';
242     my $c = $b; $c =~ tr/\x00//d;
243     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
244     encoding\s*=\s*["']([^"']*)/x) {
245 wakaba 1.2 $obj->{input_encoding} = $1;
246 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
247     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
248     if (not $csdef->{ascii32} or
249     $csdef->{ascii32endian1234} or
250     $csdef->{ascii32endian2143} or
251     $csdef->{ascii32endian3412} or
252     not $csdef->{bom_allowed}) {
253     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
254     charset_uri => $uri,
255     charset_name => $obj->{input_encoding});
256     }
257     } else {
258     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
259 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
260 wakaba 1.1 $obj->{byte_buffer} .= "\x00\x00";
261     }
262     if (defined $csdef->{no_bom_variant32endian4321}) {
263     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
264     }
265     } else {
266     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
267 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
268 wakaba 1.1 substr ($b, 0, 2) = '';
269     $obj->{has_bom} = 1;
270     }
271     } elsif (substr ($b, 0, 2) eq "\x00\x00") {
272     if (substr ($b, 2, 2) eq "\x00<") { # ASCII32Endian1234
273     my $c = $b; $c =~ tr/\x00//d;
274     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
275     encoding\s*=\s*["']([^"']*)/x) {
276 wakaba 1.2 $obj->{input_encoding} = $1;
277 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
278     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
279     if (not $csdef->{ascii32} or
280     $csdef->{ascii32endian2143} or
281     $csdef->{ascii32endian3412} or
282     $csdef->{ascii32endian4321} or
283     $csdef->{bom_required}) {
284     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
285     charset_uri => $uri,
286     charset_name => $obj->{input_encoding});
287     }
288     } else {
289     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
290 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
291 wakaba 1.1 }
292     if (defined $csdef->{no_bom_variant32endian1234}) {
293     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
294     }
295     } elsif (substr ($b, 2, 2) eq "<\x00") { # ASCII32Endian2143
296     my $c = $b; $c =~ tr/\x00//d;
297     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
298     encoding\s*=\s*["']([^"']*)/x) {
299 wakaba 1.2 $obj->{input_encoding} = $1;
300 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
301     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
302     if (not $csdef->{ascii32} or
303     $csdef->{ascii32endian1234} or
304     $csdef->{ascii32endian3412} or
305     $csdef->{ascii32endian4321} or
306     $csdef->{bom_required}) {
307     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
308     charset_uri => $uri,
309     charset_name => $obj->{input_encoding});
310     }
311     } else {
312     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
313 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
314 wakaba 1.1 }
315     if (defined $csdef->{no_bom_variant32endian2143}) {
316     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
317     }
318     } elsif (substr ($b, 2, 2) eq "\xFE\xFF") { # ASCII32Endian1234
319     $obj->{has_bom} = 1;
320     substr ($b, 0, 4) = '';
321     my $c = $b; $c =~ tr/\x00//d;
322     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
323     encoding\s*=\s*["']([^"']*)/x) {
324 wakaba 1.2 $obj->{input_encoding} = $1;
325 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
326     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
327     if (not $csdef->{ascii32} or
328     $csdef->{ascii32endian2143} or
329     $csdef->{ascii32endian3412} or
330     $csdef->{ascii32endian4321} or
331     $csdef->{bom_required}) {
332     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
333     charset_uri => $uri,
334     charset_name => $obj->{input_encoding});
335     }
336     } else {
337     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
338 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
339 wakaba 1.1 $obj->{has_bom} = 0;
340     $obj->{byte_buffer} .= "\x00\x00\xFE\xFF";
341     }
342     if (defined $csdef->{no_bom_variant32endian1234}) {
343     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
344     }
345     } elsif (substr ($b, 2, 2) eq "\xFF\xFE") { # ASCII32Endian2143
346     $obj->{has_bom} = 1;
347     substr ($b, 0, 4) = '';
348     my $c = $b; $c =~ tr/\x00//d;
349     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
350     encoding\s*=\s*["']([^"']*)/x) {
351 wakaba 1.2 $obj->{input_encoding} = $1;
352 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
353     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
354     if (not $csdef->{ascii32} or
355     $csdef->{ascii32endian1234} or
356     $csdef->{ascii32endian3412} or
357     $csdef->{ascii32endian4321} or
358     $csdef->{bom_required}) {
359     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
360     charset_uri => $uri,
361     charset_name => $obj->{input_encoding});
362     }
363     } else {
364     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
365 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
366 wakaba 1.1 $obj->{has_bom} = 0;
367     $obj->{byte_buffer} .= "\x00\x00\xFF\xFE";
368     }
369     if (defined $csdef->{no_bom_variant32endian2143}) {
370     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
371     }
372     }
373     # \x4C\x6F\xA7\x94 EBCDIC
374     } # buffer
375     $obj->{byte_buffer} .= $b;
376     } # read
377     } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-8'}) {
378     ## BOM is optional.
379     my $b = '';
380     if (read $obj->{filehandle}, $b, 3) {
381     if ($b eq "\xEF\xBB\xBF") {
382     $obj->{has_bom} = 1;
383     } else {
384     $obj->{byte_buffer} .= $b;
385     }
386     }
387     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; # UTF-8 w/o BOM
388     } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-16'}) {
389     ## BOM is mandated.
390     my $b = '';
391     if (read $obj->{filehandle}, $b, 2) {
392     if ($b eq "\xFE\xFF") {
393     $obj->{has_bom} = 1; # UTF-16BE w/o BOM
394     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
395     } elsif ($b eq "\xFF\xFE") {
396     $obj->{has_bom} = 1; # UTF-16LE w/o BOM
397     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
398     } else {
399     $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
400     $obj->{has_bom} = 0;
401     $obj->{byte_buffer} .= $b; # UTF-16BE w/o BOM
402     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
403     }
404     } else {
405     $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
406     $obj->{has_bom} = 0; # UTF-16BE w/o BOM
407     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
408     }
409     }
410    
411     if ($csdef->{uri}->{$XML_CHARSET.'iso-2022-jp'}) {
412     $obj->{state_2440} = 'gl-jis-1997-swapped';
413     $obj->{state_2442} = 'gl-jis-1997';
414     $obj->{state} = 'state_2842';
415     require Encode::GLJIS1997Swapped;
416     require Encode::GLJIS1997;
417     if (Encode::find_encoding ($obj->{state_2440}) and
418     Encode::find_encoding ($obj->{state_2442})) {
419     return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
420     }
421     } elsif ($csdef->{uri}->{$IANA_CHARSET.'iso-2022-jp'}) {
422     $obj->{state_2440} = 'gl-jis-1978';
423     $obj->{state_2442} = 'gl-jis-1983';
424     $obj->{state} = 'state_2842';
425     require Encode::GLJIS1978;
426     require Encode::GLJIS1983;
427     if (Encode::find_encoding ($obj->{state_2440}) and
428     Encode::find_encoding ($obj->{state_2442})) {
429     return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
430     }
431     } elsif (defined $csdef->{perl_name}->[0]) {
432     if ($csdef->{uri}->{$XML_CHARSET.'euc-jp'} or
433     $csdef->{uri}->{$IANA_CHARSET.'euc-jp'}) {
434     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
435     require Encode::EUCJP1997;
436     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
437     return bless $obj, 'Whatpm::Charset::DecodeHandle::EUCJP';
438     }
439     } elsif ($csdef->{uri}->{$XML_CHARSET.'shift_jis'} or
440     $csdef->{uri}->{$IANA_CHARSET.'shift_jis'}) {
441     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
442     require Encode::ShiftJIS1997;
443     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
444     return bless $obj, 'Whatpm::Charset::DecodeHandle::ShiftJIS';
445     }
446     } elsif ($csdef->{is_block_safe}) {
447     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
448     require Encode;
449     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
450     return bless $obj, 'Whatpm::Charset::DecodeHandle::Encode';
451     }
452     }
453     }
454    
455     $obj->{onerror}->(undef, 'charset-not-supported-error',
456     charset_uri => $obj->{charset});
457     return undef;
458     } # create_decode_handle
459    
460     sub name_to_uri ($$$) {
461     my $domain = $_[1];
462     my $name = lc $_[2];
463    
464     if ($domain eq 'ietf') {
465     return $IANA_CHARSET . $name;
466     } elsif ($domain eq 'xml') {
467     if ({
468     'utf-8' => 1,
469     'utf-16' => 1,
470     'iso-10646-ucs-2' => 1,
471     'iso-10646-ucs-4' => 1,
472     'iso-8859-1' => 1,
473     'iso-8859-2' => 1,
474     'iso-8859-3' => 1,
475     'iso-8859-4' => 1,
476     'iso-8859-5' => 1,
477     'iso-8859-6' => 1,
478     'iso-8859-7' => 1,
479     'iso-8859-8' => 1,
480     'iso-8859-9' => 1,
481     'iso-8859-10' => 1,
482     'iso-8859-11' => 1,
483     'iso-8859-13' => 1,
484     'iso-8859-14' => 1,
485     'iso-8859-15' => 1,
486     'iso-8859-16' => 1,
487     'iso-2022-jp' => 1,
488     'shift_jis' => 1,
489     'euc-jp' => 1,
490     }->{$name}) {
491     return $XML_CHARSET . $name;
492     }
493    
494     my $uri = $IANA_CHARSET . $name;
495     return $uri if $Whatpm::Charset::CharsetDef->{$uri};
496    
497     return $XML_CHARSET . $name;
498     } else {
499     return undef;
500     }
501     } # name_to_uri
502    
503     sub uri_to_name ($$$) {
504     my (undef, $domain, $uri) = @_;
505 wakaba 1.2
506     if ($domain eq 'xml') {
507     my $v = $Whatpm::Charset::CharsetDef->{$uri}->{xml_name};
508     return $v if defined $v;
509    
510     if (substr ($uri, 0, length $XML_CHARSET) eq $XML_CHARSET) {
511     return substr ($uri, length $XML_CHARSET);
512     }
513    
514     $domain = 'ietf'; ## TODO: XML encoding name has smaller range
515     }
516    
517     if ($domain eq 'ietf') {
518     my $v = $Whatpm::Charset::CharsetDef->{$uri}->{iana_name};
519     return $v->[0] if defined $v;
520    
521     if (substr ($uri, 0, length $IANA_CHARSET) eq $IANA_CHARSET) {
522 wakaba 1.1 return substr ($uri, length $IANA_CHARSET);
523     }
524     }
525    
526     return undef;
527     } # uri_to_name
528    
529     package Whatpm::Charset::DecodeHandle::Encode;
530    
531     sub charset ($) { $_[0]->{charset} }
532    
533     sub close ($) { CORE::close $_[0]->{filehandle} }
534    
535     sub getc ($) {
536     my $self = $_[0];
537     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
538    
539     my $error;
540     if ($self->{continue}) {
541     if (read $self->{filehandle}, $self->{byte_buffer}, 256,
542     length $self->{byte_buffer}) {
543     #
544     } else {
545     $error = 1;
546     }
547     $self->{continue} = 0;
548     } elsif (512 > length $self->{byte_buffer}) {
549     read $self->{filehandle}, $self->{byte_buffer}, 256,
550     length $self->{byte_buffer};
551     }
552    
553     my $r;
554     unless ($error) {
555     my $string = Encode::decode ($self->{perl_encoding_name},
556     $self->{byte_buffer},
557     Encode::FB_QUIET ());
558     if (length $string) {
559     push @{$self->{character_queue}}, split //, $string;
560     $r = shift @{$self->{character_queue}};
561     if (length $self->{byte_buffer}) {
562     $self->{continue} = 1;
563     }
564     } else {
565     if (length $self->{byte_buffer}) {
566     $error = 1;
567     } else {
568     $r = undef;
569     }
570     }
571     }
572    
573     if ($error) {
574     $r = substr $self->{byte_buffer}, 0, 1, '';
575     $self->{onerror}->($self, 'illegal-octets-error', octets => \$r);
576     }
577    
578     return $r;
579     } # getc
580    
581     sub has_bom ($) { $_[0]->{has_bom} }
582    
583 wakaba 1.2 sub input_encoding ($) {
584     my $v = $_[0]->{input_encoding};
585     return $v if defined $v;
586    
587     my $uri = $_[0]->{charset};
588     if (defined $uri) {
589     return Whatpm::Charset::DecodeHandle->uri_to_name (xml => $uri);
590     }
591    
592     return undef;
593     } # input_encoding
594 wakaba 1.1
595     sub onerror ($;$) {
596     if (@_ > 1) {
597     $_[0]->{onerror} = $_[1];
598     }
599    
600     return $_[0]->{onerror};
601     } # onerror
602    
603     sub ungetc ($$) {
604     unshift @{$_[0]->{character_queue}}, chr int ($_[1] or 0);
605     } # ungetc
606    
607     package Whatpm::Charset::DecodeHandle::EUCJP;
608     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
609    
610     sub getc ($) {
611     my $self = $_[0];
612     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
613    
614     my $error;
615     if ($self->{continue}) {
616     if (read $self->{filehandle}, $self->{byte_buffer}, 256,
617     length $self->{byte_buffer}) {
618     #
619     } else {
620     $error = 1;
621     }
622     $self->{continue} = 0;
623     } elsif (512 > length $self->{byte_buffer}) {
624     read $self->{filehandle}, $self->{byte_buffer}, 256,
625     length $self->{byte_buffer};
626     }
627    
628     my $r;
629     unless ($error) {
630     my $string = Encode::decode ($self->{perl_encoding_name},
631     $self->{byte_buffer},
632     Encode::FB_QUIET ());
633     if (length $string) {
634     push @{$self->{character_queue}}, split //, $string;
635     $r = shift @{$self->{character_queue}};
636     if (length $self->{byte_buffer}) {
637     $self->{continue} = 1;
638     }
639     } else {
640     if (length $self->{byte_buffer}) {
641     $error = 1;
642     } else {
643     $r = undef;
644     }
645     }
646     }
647    
648     if ($error) {
649     $r = substr $self->{byte_buffer}, 0, 1, '';
650     my $etype = 'illegal-octets-error';
651     if ($r =~ /^[\xA1-\xFE]/) {
652     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
653     $r .= $1;
654     $etype = 'unassigned-code-point-error';
655     }
656     } elsif ($r eq "\x8F") {
657     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE][\xA1-\xFE]?)//) {
658     $r .= $1;
659     $etype = 'unassigned-code-point-error' if length $1 == 2;
660     }
661     } elsif ($r eq "\x8E") {
662     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
663     $r .= $1;
664     $etype = 'unassigned-code-point-error';
665     }
666     } elsif ($r eq "\xA0" or $r eq "\xFF") {
667     $etype = 'unassigned-code-point-error';
668     }
669     $self->{onerror}->($self, $etype, octets => \$r);
670     }
671    
672     return $r;
673     } # getc
674    
675     package Whatpm::Charset::DecodeHandle::ISO2022JP;
676     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
677    
678     sub getc ($) {
679     my $self = $_[0];
680     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
681    
682     my $r;
683     A: {
684     my $error;
685     if ($self->{continue}) {
686     if (read $self->{filehandle}, $self->{byte_buffer}, 256,
687     length $self->{byte_buffer}) {
688     #
689     } else {
690     $error = 1;
691     }
692     $self->{continue} = 0;
693     } elsif (512 > length $self->{byte_buffer}) {
694     read $self->{filehandle}, $self->{byte_buffer}, 256,
695     length $self->{byte_buffer};
696     }
697    
698     unless ($error) {
699     if ($self->{byte_buffer} =~ s/^\x1B(\x24[\x40\x42]|\x28[\x42\x4A])//) {
700     $self->{state} = {
701     "\x24\x40" => 'state_2440',
702     "\x24\x42" => 'state_2442',
703     "\x28\x42" => 'state_2842',
704     "\x28\x4A" => 'state_284A',
705     }->{$1};
706     redo A;
707     } elsif ($self->{state} eq 'state_2842') { # IRV
708     if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
709     push @{$self->{character_queue}}, split //, $1;
710     $r = shift @{$self->{character_queue}};
711     } else {
712     if (length $self->{byte_buffer}) {
713     $error = 1;
714     } else {
715     $r = undef;
716     }
717     }
718     } elsif ($self->{state} eq 'state_284A') { # 0201
719     if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
720     my $v = $1;
721     $v =~ tr/\x5C\x7E/\xA5\x{203E}/;
722     push @{$self->{character_queue}}, split //, $v;
723     $r = shift @{$self->{character_queue}};
724     } else {
725     if (length $self->{byte_buffer}) {
726     $error = 1;
727     } else {
728     $r = undef;
729     $self->{onerror}->($self, 'invalid-state-error',
730     state => $self->{state});
731     }
732     }
733     } elsif ($self->{state} eq 'state_2442') { # 1983
734     my $v = Encode::decode ($self->{state_2442},
735     $self->{byte_buffer},
736     Encode::FB_QUIET ());
737     if (length $v) {
738     push @{$self->{character_queue}}, split //, $v;
739     $r = shift @{$self->{character_queue}};
740     } else {
741     if (length $self->{byte_buffer}) {
742     $error = 1;
743     } else {
744     $r = undef;
745     $self->{onerror}->($self, 'invalid-state-error',
746     state => $self->{state});
747     }
748     }
749     } elsif ($self->{state} eq 'state_2440') { # 1978
750     my $v = Encode::decode ($self->{state_2440},
751     $self->{byte_buffer},
752     Encode::FB_QUIET ());
753     if (length $v) {
754     push @{$self->{character_queue}}, split //, $v;
755     $r = shift @{$self->{character_queue}};
756     } else {
757     if (length $self->{byte_buffer}) {
758     $error = 1;
759     } else {
760     $r = undef;
761     $self->{onerror}->($self, 'invalid-state-error',
762     state => $self->{state});
763     }
764     }
765     } else {
766     $error = 1;
767     }
768     }
769    
770     if ($error) {
771     $r = substr $self->{byte_buffer}, 0, 1, '';
772     my $etype = 'illegal-octets-error';
773     if (($self->{state} eq 'state_2442' or
774     $self->{state} eq 'state_2440') and
775     $r =~ /^[\x21-\x7E]/ and
776     $self->{byte_buffer} =~ s/^([\x21-\x7E])//) {
777     $r .= $1;
778     $etype = 'unassigned-code-point-error';
779     } elsif ($r eq "\x1B" and
780     $self->{byte_buffer} =~ s/^\(H//) { # Old 0201
781     $r .= "(H";
782     $self->{state} = 'state_284A';
783     }
784     $self->{onerror}->($self, $etype, octets => \$r);
785     }
786     } # A
787    
788     return $r;
789     } # getc
790    
791     package Whatpm::Charset::DecodeHandle::ShiftJIS;
792     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
793    
794     sub getc ($) {
795     my $self = $_[0];
796     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
797    
798     my $error;
799     if ($self->{continue}) {
800     if (read $self->{filehandle}, $self->{byte_buffer}, 256,
801     length $self->{byte_buffer}) {
802     #
803     } else {
804     $error = 1;
805     }
806     $self->{continue} = 0;
807     } elsif (512 > length $self->{byte_buffer}) {
808     read $self->{filehandle}, $self->{byte_buffer}, 256,
809     length $self->{byte_buffer};
810     }
811    
812     my $r;
813     unless ($error) {
814     my $string = Encode::decode ($self->{perl_encoding_name},
815     $self->{byte_buffer},
816     Encode::FB_QUIET ());
817     if (length $string) {
818     push @{$self->{character_queue}}, split //, $string;
819     $r = shift @{$self->{character_queue}};
820     if (length $self->{byte_buffer}) {
821     $self->{continue} = 1;
822     }
823     } else {
824     if (length $self->{byte_buffer}) {
825     $error = 1;
826     } else {
827     $r = undef;
828     }
829     }
830     }
831    
832     if ($error) {
833     $r = substr $self->{byte_buffer}, 0, 1, '';
834     my $etype = 'illegal-octets-error';
835     if ($r =~ /^[\x81-\x9F\xE0-\xEF]/) {
836     if ($self->{byte_buffer} =~ s/(.)//s) {
837     $r .= $1; # not limited to \x40-\xFC - \x7F
838     $etype = 'unassigned-code-point-error';
839     }
840     } elsif ($r =~ /^[\x80\xA0\xF0-\xFF]/) {
841     $etype = 'unassigned-code-point-error';
842     }
843     $self->{onerror}->($self, $etype, octets => \$r);
844     }
845    
846     return $r;
847     } # getc
848    
849     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us-ascii'} =
850     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us'} =
851     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso646-us'} =
852     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:cp367'} =
853     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ibm367'} =
854     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1986'} =
855     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1968'} =
856     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-ir-6'} =
857     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:csascii'} =
858     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_646.irv:1991'} =
859     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ascii'} = {ascii8 =>
860     '1',
861     is_block_safe =>
862     '1',
863     ietf_name =>
864     ['ansi_x3.4-1968',
865     'ansi_x3.4-1986',
866     'ascii',
867     'cp367',
868     'csascii',
869     'ibm367',
870     'iso-ir-6',
871     'iso646-us',
872     'iso_646.irv:1991',
873     'us',
874     'us-ascii',
875     'us-ascii'],
876     mime_name =>
877     'us-ascii',
878     perl_name =>
879     ['ascii',
880     'iso-646-us',
881     'us-ascii'],
882     utf8_encoding_scheme =>
883     '1',
884     'uri',
885     {'urn:x-suika-fam-cx:charset:ansi_x3.4-1968',
886     '1',
887     'urn:x-suika-fam-cx:charset:ansi_x3.4-1986',
888     '1',
889     'urn:x-suika-fam-cx:charset:ascii',
890     '1',
891     'urn:x-suika-fam-cx:charset:cp367',
892     '1',
893     'urn:x-suika-fam-cx:charset:csascii',
894     '1',
895     'urn:x-suika-fam-cx:charset:ibm367',
896     '1',
897     'urn:x-suika-fam-cx:charset:iso-ir-6',
898     '1',
899     'urn:x-suika-fam-cx:charset:iso646-us',
900     '1',
901     'urn:x-suika-fam-cx:charset:iso_646.irv:1991',
902     '1',
903     'urn:x-suika-fam-cx:charset:us',
904     '1',
905     'urn:x-suika-fam-cx:charset:us-ascii',
906     '1'},
907 wakaba 1.2 };
908    
909 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl'} = {perl_name =>
910     ['ascii-ctrl'],
911     'uri',
912     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl',
913     '1'}};
914     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null'} = {perl_name =>
915     ['null'],
916     'uri',
917     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null',
918     '1'}};
919     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8'} = {ascii8 =>
920     '1',
921     bom_allowed =>
922     '1',
923     no_bom_variant =>
924     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
925     utf8_encoding_scheme =>
926     '1',
927     'uri',
928     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8',
929     '1'},
930 wakaba 1.2 xml_name => 'UTF-8',
931     };
932    
933 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279'} = {ascii8 =>
934     '1',
935     bom_allowed =>
936     '1',
937     no_bom_variant =>
938     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
939     utf8_encoding_scheme =>
940     '1',
941     'uri',
942     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279',
943     '1'}};
944     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8'} = {
945     ascii8 => 1,
946     is_block_safe =>
947     '1',
948     perl_name =>
949     ['utf-8'],
950     utf8_encoding_scheme =>
951     '1',
952     'uri',
953     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
954     '1'}};
955     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-8'} = {
956     ascii8 => 1,
957     bom_allowed =>
958     '1',
959     no_bom_variant =>
960     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
961     ietf_name =>
962     ['utf-8'],
963     mime_name =>
964     'utf-8',
965     utf8_encoding_scheme =>
966     '1',
967     'uri',
968     {'urn:x-suika-fam-cx:charset:utf-8',
969     '1'},
970 wakaba 1.2 };
971    
972 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8'} = {ascii8 =>
973     '1',
974     is_block_safe =>
975     '1',
976     perl_name =>
977     ['utf8'],
978     utf8_encoding_scheme =>
979     '1',
980     'uri',
981     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
982     '1'}};
983     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16'} = {
984     ascii16 => 1,
985     bom_allowed =>
986     '1',
987     bom_required =>
988     '1',
989     no_bom_variant =>
990     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
991     no_bom_variant16be =>
992     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
993     no_bom_variant16le =>
994     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
995     perl_name =>
996     ['utf-16'],
997     'uri',
998     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16',
999     '1'},
1000 wakaba 1.2 xml_name => 'UTF-16',
1001     };
1002    
1003 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16'} = {
1004     ascii16 => 1,
1005     bom_allowed =>
1006     '1',
1007     no_bom_variant =>
1008     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1009     no_bom_variant16be =>
1010     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1011     no_bom_variant16le =>
1012     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1013     ietf_name =>
1014     ['utf-16'],
1015     mime_name =>
1016     'utf-16',
1017     'uri',
1018     {'urn:x-suika-fam-cx:charset:utf-16',
1019     '1'},
1020 wakaba 1.2 };
1021    
1022 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16be'} = {
1023     ascii16 => 1,
1024     ascii16be => 1,
1025     bom_allowed =>
1026     '1',
1027     no_bom_variant =>
1028     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1029     no_bom_variant16be =>
1030     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1031     ietf_name =>
1032     ['utf-16be'],
1033     mime_name =>
1034     'utf-16be',
1035     'uri',
1036     {'urn:x-suika-fam-cx:charset:utf-16be',
1037     '1'},
1038 wakaba 1.2 };
1039    
1040 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16le'} = {
1041     ascii16 => 1,
1042     ascii16le => 1,
1043     bom_allowed =>
1044     '1',
1045     no_bom_variant =>
1046     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1047     no_bom_variant16le =>
1048     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1049     ietf_name =>
1050     ['utf-16le'],
1051     mime_name =>
1052     'utf-16le',
1053     'uri',
1054     {'urn:x-suika-fam-cx:charset:utf-16le',
1055     '1'},
1056 wakaba 1.2 };
1057    
1058 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be'} = {
1059     ascii16 => 1,
1060     ascii16be => 1,
1061     is_block_safe =>
1062     '1',
1063     perl_name =>
1064     ['utf-16be'],
1065     'uri',
1066     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1067     '1'}};
1068     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le'} = {
1069     ascii16 => 1,
1070     ascii16le => 1,
1071     is_block_safe =>
1072     '1',
1073     perl_name =>
1074     ['utf-16le'],
1075     'uri',
1076     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1077     '1'}};
1078     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-2'} = {
1079     ascii16 => 1,
1080     bom_allowed =>
1081     '1',
1082     no_bom_variant =>
1083     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1084     no_bom_variant16be =>
1085     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1086     no_bom_variant16le =>
1087     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1088     ietf_name =>
1089     ['csunicode',
1090     'iso-10646-ucs-2'],
1091     mime_name =>
1092     'iso-10646-ucs-2',
1093     'uri',
1094     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2',
1095     '1',
1096     'urn:x-suika-fam-cx:charset:iso-10646-ucs-2',
1097     '1'},
1098 wakaba 1.2 xml_name => 'ISO-10646-UCS-2',
1099     };
1100    
1101 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be'} = {
1102     ascii16 => 1,
1103     ascii16be => 1,
1104     is_block_safe =>
1105     '1',
1106     perl_name =>
1107     ['ucs-2be'],
1108     'uri',
1109     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1110     '1'}};
1111     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le'} = {
1112     ascii16 => 1,
1113     ascii16le => 1,
1114     is_block_safe =>
1115     '1',
1116     perl_name =>
1117     ['ucs-2le'],
1118     'uri',
1119     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1120     '1'}};
1121     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-4'} = {
1122     ascii32 => 1,
1123     bom_allowed =>
1124     '1',
1125     no_bom_variant =>
1126     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1127     no_bom_variant32endian1234 =>
1128     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1129     no_bom_variant32endian4321 =>
1130     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1131     ietf_name =>
1132     ['csucs4',
1133     'iso-10646-ucs-4'],
1134     mime_name =>
1135     'iso-10646-ucs-4',
1136     'uri',
1137     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4',
1138     '1',
1139     'urn:x-suika-fam-cx:charset:iso-10646-ucs-4',
1140     '1'},
1141 wakaba 1.2 xml_name => 'ISO-10646-UCS-4',
1142     };
1143    
1144 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be'} = {
1145     ascii32 => 1,
1146     ascii32endian1234 => 1,
1147     is_block_safe =>
1148     '1',
1149     perl_name =>
1150     ['ucs-4be',
1151     'utf-32be'],
1152     'uri',
1153     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1154     '1'}};
1155     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le'} = {
1156     ascii32 => 1,
1157     ascii32endian4321 => 1,
1158     is_block_safe =>
1159     '1',
1160     perl_name =>
1161     ['ucs-4le',
1162     'utf-32le'],
1163     'uri',
1164     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1165     '1'}};
1166     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_8859-1:1987'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1'} = {ascii8 =>
1167     '1',
1168     is_block_safe =>
1169     '1',
1170     ietf_name =>
1171     ['cp819',
1172     'csisolatin1',
1173     'ibm819',
1174     'iso-8859-1',
1175     'iso-8859-1',
1176     'iso-ir-100',
1177     'iso_8859-1',
1178     'iso_8859-1:1987',
1179     'l1',
1180     'latin1'],
1181     mime_name =>
1182     'iso-8859-1',
1183     perl_name =>
1184     ['iso-8859-1',
1185     'latin1'],
1186     'uri',
1187     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1',
1188     '1',
1189     'urn:x-suika-fam-cx:charset:iso_8859-1:1987',
1190     '1'},
1191 wakaba 1.2 xml_name => 'ISO-8859-1',
1192     };
1193    
1194 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2'} = {ascii8 =>
1195     '1',
1196     is_block_safe =>
1197     '1',
1198     'uri',
1199     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2',
1200     '1'},
1201 wakaba 1.2 xml_name => 'ISO-8859-2',
1202     };
1203    
1204 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3'} = {ascii8 =>
1205     '1',
1206     is_block_safe =>
1207     '1',
1208     'uri',
1209     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3',
1210     '1'},
1211 wakaba 1.2 xml_name => 'ISO-8859-3',
1212     };
1213    
1214 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4'} = {ascii8 =>
1215     '1',
1216     is_block_safe =>
1217     '1',
1218     'uri',
1219     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4',
1220     '1'},
1221 wakaba 1.2 xml_name => 'ISO-8859-4',
1222     };
1223    
1224 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5'} = {ascii8 =>
1225     '1',
1226     is_block_safe =>
1227     '1',
1228     'uri',
1229     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5',
1230     '1'},
1231 wakaba 1.2 xml_name => 'ISO-8859-5',
1232     };
1233    
1234 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6'} = {ascii8 =>
1235     '1',
1236     is_block_safe =>
1237     '1',
1238     'uri',
1239     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6',
1240     '1'},
1241 wakaba 1.2 xml_name => 'ISO-8859-6',
1242     };
1243    
1244 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7'} = {ascii8 =>
1245     '1',
1246     is_block_safe =>
1247     '1',
1248     'uri',
1249     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7',
1250     '1'},
1251 wakaba 1.2 xml_name => 'ISO-8859-7',
1252     };
1253    
1254 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8'} = {ascii8 =>
1255     '1',
1256     is_block_safe =>
1257     '1',
1258     'uri',
1259     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8',
1260     '1'},
1261 wakaba 1.2 xml_name => 'ISO-8859-8',
1262     };
1263    
1264 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9'} = {ascii8 =>
1265     '1',
1266     is_block_safe =>
1267     '1',
1268     'uri',
1269     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9',
1270     '1'},
1271 wakaba 1.2 xml_name => 'ISO-8859-9',
1272     };
1273    
1274 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10'} = {ascii8 =>
1275     '1',
1276     is_block_safe =>
1277     '1',
1278     'uri',
1279     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10',
1280     '1'},
1281 wakaba 1.2 xml_name => 'ISO-8859-10',
1282     };
1283    
1284 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11'} = {ascii8 =>
1285     '1',
1286     is_block_safe =>
1287     '1',
1288     'uri',
1289     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11',
1290     '1'},
1291 wakaba 1.2 xml_name => 'ISO-8859-11',
1292     };
1293    
1294 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13'} = {ascii8 =>
1295     '1',
1296     is_block_safe =>
1297     '1',
1298     'uri',
1299     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13',
1300     '1'},
1301 wakaba 1.2 xml_name => 'ISO-8859-13',
1302     };
1303    
1304 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14'} = {ascii8 =>
1305     '1',
1306     is_block_safe =>
1307     '1',
1308     'uri',
1309     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14',
1310     '1'},
1311 wakaba 1.2 xml_name => 'ISO-8859-14',
1312     };
1313    
1314 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15'} = {ascii8 =>
1315     '1',
1316     is_block_safe =>
1317     '1',
1318     'uri',
1319     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15',
1320     '1'},
1321 wakaba 1.2 xml_name => 'ISO-8859-15',
1322     };
1323    
1324 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16'} = {ascii8 =>
1325     '1',
1326     is_block_safe =>
1327     '1',
1328     'uri',
1329     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16',
1330     '1'},
1331 wakaba 1.2 xml_name => 'ISO-8859-16',
1332     };
1333    
1334 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp'} = {ascii8 =>
1335     '1',
1336     'uri',
1337     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp',
1338     '1'},
1339 wakaba 1.2 xml_name => 'ISO-2022-JP',
1340     };
1341    
1342 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-2022-jp'} = {ascii8 =>
1343     '1',
1344     ietf_name =>
1345     ['csiso2022jp',
1346     'iso-2022-jp',
1347     'iso-2022-jp'],
1348     mime_name =>
1349     'iso-2022-jp',
1350     'uri',
1351     {'urn:x-suika-fam-cx:charset:iso-2022-jp',
1352     '1'},
1353 wakaba 1.2 };
1354    
1355 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp'} = {ascii8 =>
1356     '1',
1357     perl_name =>
1358     ['iso-2022-jp'],
1359     'uri',
1360     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp',
1361     '1'}};
1362     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:shift_jis'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis'} = {ascii8 =>
1363     '1',
1364     is_block_safe =>
1365     '1',
1366     ietf_name =>
1367     ['csshiftjis',
1368     'ms_kanji',
1369     'shift_jis',
1370     'shift_jis'],
1371     mime_name =>
1372     'shift_jis',
1373     perl_name =>
1374     ['shift-jis-1997'],
1375     'uri',
1376     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis',
1377     '1',
1378     'urn:x-suika-fam-cx:charset:shift_jis',
1379     '1'},
1380 wakaba 1.2 xml_name => 'Shift_JIS',
1381     };
1382    
1383 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis'} = {ascii8 =>
1384     '1',
1385     is_block_safe =>
1386     '1',
1387     perl_name =>
1388     ['shiftjis',
1389     'sjis'],
1390     'uri',
1391     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis',
1392     '1'}};
1393     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:euc-jp'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese'} = {ascii8 =>
1394     '1',
1395     is_block_safe =>
1396     '1',
1397     ietf_name =>
1398     ['cseucpkdfmtjapanese',
1399     'euc-jp',
1400     'euc-jp',
1401     'extended_unix_code_packed_format_for_japanese'],
1402     mime_name =>
1403     'euc-jp',
1404     perl_name =>
1405     ['euc-jp-1997'],
1406     'uri',
1407     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp',
1408     '1',
1409     'urn:x-suika-fam-cx:charset:euc-jp',
1410     '1',
1411     'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese',
1412     '1'},
1413 wakaba 1.2 xml_name => 'EUC-JP',
1414     };
1415    
1416 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp'} = {ascii8 =>
1417     '1',
1418     is_block_safe =>
1419     '1',
1420     perl_name =>
1421     ['euc-jp',
1422     'ujis'],
1423     'uri',
1424     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp',
1425     '1'}};
1426    
1427     1;
1428 wakaba 1.2 ## $Date: 2007/07/15 12:41:51 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24