/[suikacvs]/markup/html/whatpm/Whatpm/Charset/DecodeHandle.pm
Suika

Contents of /markup/html/whatpm/Whatpm/Charset/DecodeHandle.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (hide annotations) (download)
Thu Sep 11 09:55:56 2008 UTC (16 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.7: +21 -1 lines
++ whatpm/Whatpm/Charset/ChangeLog	11 Sep 2008 09:55:54 -0000
	* UnicodeChecker.pm, DecodeHandle.pm: Tentative support
	for |read| method.

2008-09-11  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::Charset::DecodeHandle;
2     use strict;
3    
4     my $XML_AUTO_CHARSET = q<http://suika.fam.cx/www/2006/03/xml-entity/>;
5     my $IANA_CHARSET = q<urn:x-suika-fam-cx:charset:>;
6     my $PERL_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.>;
7     my $XML_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.>;
8    
9     ## ->create_decode_handle ($charset_uri, $byte_stream, $onerror)
10     sub create_decode_handle ($$$;$) {
11     my $csdef = $Whatpm::Charset::CharsetDef->{$_[1]};
12     my $obj = {
13     character_queue => [],
14     filehandle => $_[2],
15     charset => $_[1],
16     byte_buffer => '',
17     onerror => $_[3] || sub {},
18     };
19     if ($csdef->{uri}->{$XML_AUTO_CHARSET} or
20     $obj->{charset} eq $XML_AUTO_CHARSET) {
21     my $b = ''; # UTF-8 w/o BOM
22     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
23 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
24 wakaba 1.1 if (read $obj->{filehandle}, $b, 256) {
25     no warnings "substr";
26     no warnings "uninitialized";
27     if (substr ($b, 0, 1) eq "<") {
28     if (substr ($b, 1, 1) eq "?") { # ASCII8
29     if ($b =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
30     encoding\s*=\s*["']([^"']*)/x) {
31 wakaba 1.2 $obj->{input_encoding} = $1;
32 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
33     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
34     if (not $csdef->{ascii8} or $csdef->{bom_required}) {
35     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
36     charset_uri => $uri,
37     charset_name => $obj->{input_encoding});
38     }
39     } else {
40     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
41 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
42 wakaba 1.1 }
43     if (defined $csdef->{no_bom_variant}) {
44     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
45     }
46     } elsif (substr ($b, 1, 1) eq "\x00") {
47     if (substr ($b, 2, 2) eq "?\x00") { # ASCII16LE
48     my $c = $b; $c =~ tr/\x00//d;
49     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
50     encoding\s*=\s*["']([^"']*)/x) {
51 wakaba 1.2 $obj->{input_encoding} = $1;
52 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
53     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
54     if (not $csdef->{ascii16} or $csdef->{ascii16be} or
55     $csdef->{bom_required}) {
56     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
57     charset_uri => $uri,
58     charset_name => $obj->{input_encoding});
59     }
60     } else {
61     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
62 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
63 wakaba 1.1 }
64     if (defined $csdef->{no_bom_variant16le}) {
65     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
66     }
67     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
68     my $c = $b; $c =~ tr/\x00//d;
69     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
70     encoding\s*=\s*["']([^"']*)/x) {
71 wakaba 1.2 $obj->{input_encoding} = $1;
72 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
73     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
74     if (not $csdef->{ascii32} or
75     $csdef->{ascii32endian1234} or
76     $csdef->{ascii32endian2143} or
77     $csdef->{ascii32endian3412} or
78     $csdef->{bom_required}) {
79     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
80     charset_uri => $uri,
81     charset_name => $obj->{input_encoding});
82     }
83     } else {
84     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
85 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
86 wakaba 1.1 }
87     if (defined $csdef->{no_bom_variant32endian4321}) {
88     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
89     }
90     }
91     }
92     } elsif (substr ($b, 0, 3) eq "\xEF\xBB\xBF") { # UTF8
93     $obj->{has_bom} = 1;
94     substr ($b, 0, 3) = '';
95     my $c = $b;
96     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
97     encoding\s*=\s*["']([^"']*)/x) {
98 wakaba 1.2 $obj->{input_encoding} = $1;
99 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
100     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
101     if (not $csdef->{utf8_encoding_scheme} or
102     not $csdef->{bom_allowed}) {
103     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
104     charset_uri => $uri,
105     charset_name => $obj->{input_encoding});
106     }
107     } else {
108     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
109 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
110 wakaba 1.1 }
111     if (defined $csdef->{no_bom_variant}) {
112     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
113     }
114     } elsif (substr ($b, 0, 2) eq "\x00<") {
115     if (substr ($b, 2, 2) eq "\x00?") { # ASCII16BE
116     my $c = $b; $c =~ tr/\x00//d;
117     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
118     encoding\s*=\s*["']([^"']*)/x) {
119 wakaba 1.2 $obj->{input_encoding} = $1;
120 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
121     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
122     if (not $csdef->{ascii16} or $csdef->{ascii16le} or
123     $csdef->{bom_required}) {
124     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
125     charset_uri => $uri,
126     charset_name => $obj->{input_encoding});
127     }
128     } else {
129     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
130 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
131 wakaba 1.1 }
132     if (defined $csdef->{no_bom_variant16be}) {
133     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
134     }
135     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
136     my $c = $b; $c =~ tr/\x00//d;
137     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
138     encoding\s*=\s*["']([^"']*)/x) {
139 wakaba 1.2 $obj->{input_encoding} = $1;
140 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
141     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
142     if (not $csdef->{ascii32} or
143     $csdef->{ascii32endian1234} or
144     $csdef->{ascii32endian2143} or
145     $csdef->{ascii32endian4321} or
146     $csdef->{bom_required}) {
147     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
148     charset_uri => $uri,
149     charset_name => $obj->{input_encoding});
150     }
151     } else {
152     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
153 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
154 wakaba 1.1 }
155     if (defined $csdef->{no_bom_variant32endian3412}) {
156     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
157     }
158     }
159     } elsif (substr ($b, 0, 2) eq "\xFE\xFF") {
160     if (substr ($b, 2, 2) eq "\x00<") { # ASCII16BE
161     $obj->{has_bom} = 1;
162     substr ($b, 0, 2) = '';
163     my $c = $b; $c =~ tr/\x00//d;
164     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
165     encoding\s*=\s*["']([^"']*)/x) {
166 wakaba 1.2 $obj->{input_encoding} = $1;
167 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
168     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
169     if (not $csdef->{ascii16} or
170     $csdef->{ascii16le} or
171     not $csdef->{bom_allowed}) {
172     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
173     charset_uri => $uri,
174     charset_name => $obj->{input_encoding});
175     }
176     } else {
177     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
178 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
179 wakaba 1.1 }
180     if (defined $csdef->{no_bom_variant16be}) {
181     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
182     }
183     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
184     $obj->{has_bom} = 1;
185     substr ($b, 0, 4) = '';
186     my $c = $b; $c =~ tr/\x00//d;
187     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
188     encoding\s*=\s*["']([^"']*)/x) {
189 wakaba 1.2 $obj->{input_encoding} = $1;
190 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
191     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
192     if (not $csdef->{ascii32} or
193     $csdef->{ascii32endian1234} or
194     $csdef->{ascii32endian2143} or
195     $csdef->{ascii32endian4321} or
196     not $csdef->{bom_allowed}) {
197     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
198     charset_uri => $uri,
199     charset_name => $obj->{input_encoding});
200     }
201     } else {
202     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
203 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
204 wakaba 1.1 $obj->{byte_buffer} .= "\x00\x00";
205     }
206     if (defined $csdef->{no_bom_variant32endian3412}) {
207     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
208     }
209     } else {
210     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
211 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
212 wakaba 1.1 substr ($b, 0, 2) = '';
213     $obj->{has_bom} = 1;
214     }
215     } elsif (substr ($b, 0, 2) eq "\xFF\xFE") {
216     if (substr ($b, 2, 2) eq "<\x00") { # ASCII16LE
217     $obj->{has_bom} = 1;
218     substr ($b, 0, 2) = '';
219     my $c = $b; $c =~ tr/\x00//d;
220     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
221     encoding\s*=\s*["']([^"']*)/x) {
222 wakaba 1.2 $obj->{input_encoding} = $1;
223 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
224     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
225     if (not $csdef->{ascii16} or
226     $csdef->{ascii16be} or
227     not $csdef->{bom_allowed}) {
228     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
229     charset_uri => $uri,
230     charset_name => $obj->{input_encoding});
231     }
232     } else {
233     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
234 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
235 wakaba 1.1 }
236     if (defined $csdef->{no_bom_variant16le}) {
237     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
238     }
239     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
240     $obj->{has_bom} = 1;
241     substr ($b, 0, 4) = '';
242     my $c = $b; $c =~ tr/\x00//d;
243     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
244     encoding\s*=\s*["']([^"']*)/x) {
245 wakaba 1.2 $obj->{input_encoding} = $1;
246 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
247     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
248     if (not $csdef->{ascii32} or
249     $csdef->{ascii32endian1234} or
250     $csdef->{ascii32endian2143} or
251     $csdef->{ascii32endian3412} or
252     not $csdef->{bom_allowed}) {
253     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
254     charset_uri => $uri,
255     charset_name => $obj->{input_encoding});
256     }
257     } else {
258     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
259 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
260 wakaba 1.1 $obj->{byte_buffer} .= "\x00\x00";
261     }
262     if (defined $csdef->{no_bom_variant32endian4321}) {
263     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
264     }
265     } else {
266     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
267 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
268 wakaba 1.1 substr ($b, 0, 2) = '';
269     $obj->{has_bom} = 1;
270     }
271     } elsif (substr ($b, 0, 2) eq "\x00\x00") {
272     if (substr ($b, 2, 2) eq "\x00<") { # ASCII32Endian1234
273     my $c = $b; $c =~ tr/\x00//d;
274     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
275     encoding\s*=\s*["']([^"']*)/x) {
276 wakaba 1.2 $obj->{input_encoding} = $1;
277 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
278     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
279     if (not $csdef->{ascii32} or
280     $csdef->{ascii32endian2143} or
281     $csdef->{ascii32endian3412} or
282     $csdef->{ascii32endian4321} or
283     $csdef->{bom_required}) {
284     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
285     charset_uri => $uri,
286     charset_name => $obj->{input_encoding});
287     }
288     } else {
289     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
290 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
291 wakaba 1.1 }
292     if (defined $csdef->{no_bom_variant32endian1234}) {
293     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
294     }
295     } elsif (substr ($b, 2, 2) eq "<\x00") { # ASCII32Endian2143
296     my $c = $b; $c =~ tr/\x00//d;
297     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
298     encoding\s*=\s*["']([^"']*)/x) {
299 wakaba 1.2 $obj->{input_encoding} = $1;
300 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
301     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
302     if (not $csdef->{ascii32} or
303     $csdef->{ascii32endian1234} or
304     $csdef->{ascii32endian3412} or
305     $csdef->{ascii32endian4321} or
306     $csdef->{bom_required}) {
307     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
308     charset_uri => $uri,
309     charset_name => $obj->{input_encoding});
310     }
311     } else {
312     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
313 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
314 wakaba 1.1 }
315     if (defined $csdef->{no_bom_variant32endian2143}) {
316     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
317     }
318     } elsif (substr ($b, 2, 2) eq "\xFE\xFF") { # ASCII32Endian1234
319     $obj->{has_bom} = 1;
320     substr ($b, 0, 4) = '';
321     my $c = $b; $c =~ tr/\x00//d;
322     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
323     encoding\s*=\s*["']([^"']*)/x) {
324 wakaba 1.2 $obj->{input_encoding} = $1;
325 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
326     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
327     if (not $csdef->{ascii32} or
328     $csdef->{ascii32endian2143} or
329     $csdef->{ascii32endian3412} or
330     $csdef->{ascii32endian4321} or
331     $csdef->{bom_required}) {
332     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
333     charset_uri => $uri,
334     charset_name => $obj->{input_encoding});
335     }
336     } else {
337     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
338 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
339 wakaba 1.1 $obj->{has_bom} = 0;
340     $obj->{byte_buffer} .= "\x00\x00\xFE\xFF";
341     }
342     if (defined $csdef->{no_bom_variant32endian1234}) {
343     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
344     }
345     } elsif (substr ($b, 2, 2) eq "\xFF\xFE") { # ASCII32Endian2143
346     $obj->{has_bom} = 1;
347     substr ($b, 0, 4) = '';
348     my $c = $b; $c =~ tr/\x00//d;
349     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
350     encoding\s*=\s*["']([^"']*)/x) {
351 wakaba 1.2 $obj->{input_encoding} = $1;
352 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
353     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
354     if (not $csdef->{ascii32} or
355     $csdef->{ascii32endian1234} or
356     $csdef->{ascii32endian3412} or
357     $csdef->{ascii32endian4321} or
358     $csdef->{bom_required}) {
359     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
360     charset_uri => $uri,
361     charset_name => $obj->{input_encoding});
362     }
363     } else {
364     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
365 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
366 wakaba 1.1 $obj->{has_bom} = 0;
367     $obj->{byte_buffer} .= "\x00\x00\xFF\xFE";
368     }
369     if (defined $csdef->{no_bom_variant32endian2143}) {
370     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
371     }
372     }
373     # \x4C\x6F\xA7\x94 EBCDIC
374     } # buffer
375     $obj->{byte_buffer} .= $b;
376     } # read
377     } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-8'}) {
378     ## BOM is optional.
379     my $b = '';
380     if (read $obj->{filehandle}, $b, 3) {
381     if ($b eq "\xEF\xBB\xBF") {
382     $obj->{has_bom} = 1;
383     } else {
384     $obj->{byte_buffer} .= $b;
385     }
386     }
387     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; # UTF-8 w/o BOM
388     } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-16'}) {
389     ## BOM is mandated.
390     my $b = '';
391     if (read $obj->{filehandle}, $b, 2) {
392     if ($b eq "\xFE\xFF") {
393     $obj->{has_bom} = 1; # UTF-16BE w/o BOM
394     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
395     } elsif ($b eq "\xFF\xFE") {
396     $obj->{has_bom} = 1; # UTF-16LE w/o BOM
397     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
398     } else {
399     $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
400     $obj->{has_bom} = 0;
401     $obj->{byte_buffer} .= $b; # UTF-16BE w/o BOM
402     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
403     }
404     } else {
405     $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
406     $obj->{has_bom} = 0; # UTF-16BE w/o BOM
407     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
408     }
409     }
410    
411     if ($csdef->{uri}->{$XML_CHARSET.'iso-2022-jp'}) {
412     $obj->{state_2440} = 'gl-jis-1997-swapped';
413     $obj->{state_2442} = 'gl-jis-1997';
414     $obj->{state} = 'state_2842';
415     require Encode::GLJIS1997Swapped;
416     require Encode::GLJIS1997;
417     if (Encode::find_encoding ($obj->{state_2440}) and
418     Encode::find_encoding ($obj->{state_2442})) {
419     return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
420     }
421     } elsif ($csdef->{uri}->{$IANA_CHARSET.'iso-2022-jp'}) {
422     $obj->{state_2440} = 'gl-jis-1978';
423     $obj->{state_2442} = 'gl-jis-1983';
424     $obj->{state} = 'state_2842';
425     require Encode::GLJIS1978;
426     require Encode::GLJIS1983;
427     if (Encode::find_encoding ($obj->{state_2440}) and
428     Encode::find_encoding ($obj->{state_2442})) {
429     return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
430     }
431     } elsif (defined $csdef->{perl_name}->[0]) {
432     if ($csdef->{uri}->{$XML_CHARSET.'euc-jp'} or
433     $csdef->{uri}->{$IANA_CHARSET.'euc-jp'}) {
434     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
435     require Encode::EUCJP1997;
436     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
437     return bless $obj, 'Whatpm::Charset::DecodeHandle::EUCJP';
438     }
439     } elsif ($csdef->{uri}->{$XML_CHARSET.'shift_jis'} or
440     $csdef->{uri}->{$IANA_CHARSET.'shift_jis'}) {
441     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
442     require Encode::ShiftJIS1997;
443     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
444     return bless $obj, 'Whatpm::Charset::DecodeHandle::ShiftJIS';
445     }
446     } elsif ($csdef->{is_block_safe}) {
447     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
448     require Encode;
449     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
450     return bless $obj, 'Whatpm::Charset::DecodeHandle::Encode';
451     }
452     }
453     }
454    
455     $obj->{onerror}->(undef, 'charset-not-supported-error',
456     charset_uri => $obj->{charset});
457     return undef;
458     } # create_decode_handle
459    
460     sub name_to_uri ($$$) {
461     my $domain = $_[1];
462     my $name = lc $_[2];
463    
464     if ($domain eq 'ietf') {
465     return $IANA_CHARSET . $name;
466     } elsif ($domain eq 'xml') {
467     if ({
468     'utf-8' => 1,
469     'utf-16' => 1,
470     'iso-10646-ucs-2' => 1,
471     'iso-10646-ucs-4' => 1,
472     'iso-8859-1' => 1,
473     'iso-8859-2' => 1,
474     'iso-8859-3' => 1,
475     'iso-8859-4' => 1,
476     'iso-8859-5' => 1,
477     'iso-8859-6' => 1,
478     'iso-8859-7' => 1,
479     'iso-8859-8' => 1,
480     'iso-8859-9' => 1,
481     'iso-8859-10' => 1,
482     'iso-8859-11' => 1,
483     'iso-8859-13' => 1,
484     'iso-8859-14' => 1,
485     'iso-8859-15' => 1,
486     'iso-8859-16' => 1,
487     'iso-2022-jp' => 1,
488     'shift_jis' => 1,
489     'euc-jp' => 1,
490     }->{$name}) {
491     return $XML_CHARSET . $name;
492     }
493    
494     my $uri = $IANA_CHARSET . $name;
495     return $uri if $Whatpm::Charset::CharsetDef->{$uri};
496    
497     return $XML_CHARSET . $name;
498     } else {
499     return undef;
500     }
501     } # name_to_uri
502    
503     sub uri_to_name ($$$) {
504     my (undef, $domain, $uri) = @_;
505 wakaba 1.2
506     if ($domain eq 'xml') {
507     my $v = $Whatpm::Charset::CharsetDef->{$uri}->{xml_name};
508     return $v if defined $v;
509    
510     if (substr ($uri, 0, length $XML_CHARSET) eq $XML_CHARSET) {
511     return substr ($uri, length $XML_CHARSET);
512     }
513    
514     $domain = 'ietf'; ## TODO: XML encoding name has smaller range
515     }
516    
517     if ($domain eq 'ietf') {
518     my $v = $Whatpm::Charset::CharsetDef->{$uri}->{iana_name};
519     return $v->[0] if defined $v;
520    
521     if (substr ($uri, 0, length $IANA_CHARSET) eq $IANA_CHARSET) {
522 wakaba 1.1 return substr ($uri, length $IANA_CHARSET);
523     }
524     }
525    
526     return undef;
527     } # uri_to_name
528    
529 wakaba 1.3 require IO::Handle;
530    
531     package Whatpm::Charset::DecodeHandle::ByteBuffer;
532    
533 wakaba 1.7 ## NOTE: Provides a byte buffer wrapper object.
534    
535 wakaba 1.3 sub new ($$) {
536     my $self = bless {
537     buffer => '',
538     }, shift;
539     $self->{filehandle} = shift;
540     return $self;
541     } # new
542    
543     sub read {
544     my $self = shift;
545     my $pos = length $self->{buffer};
546     my $r = $self->{filehandle}->read ($self->{buffer}, $_[1], $pos);
547     substr ($_[0], $_[2]) = substr ($self->{buffer}, $pos);
548 wakaba 1.8 ## NOTE: This would do different behavior from Perl's standard
549     ## |read| when $pos points beyond the end of the string.
550 wakaba 1.3 return $r;
551     } # read
552    
553     sub close { $_[0]->{filehandle}->close }
554    
555 wakaba 1.1 package Whatpm::Charset::DecodeHandle::Encode;
556    
557 wakaba 1.7 ## NOTE: Provides a Perl |Encode| module wrapper object.
558    
559 wakaba 1.1 sub charset ($) { $_[0]->{charset} }
560    
561 wakaba 1.3 sub close ($) { $_[0]->{filehandle}->close }
562 wakaba 1.1
563     sub getc ($) {
564     my $self = $_[0];
565     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
566    
567     my $error;
568     if ($self->{continue}) {
569 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
570     length $self->{byte_buffer})) {
571 wakaba 1.1 #
572     } else {
573     $error = 1;
574     }
575     $self->{continue} = 0;
576     } elsif (512 > length $self->{byte_buffer}) {
577 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
578     length $self->{byte_buffer});
579 wakaba 1.1 }
580    
581     my $r;
582     unless ($error) {
583 wakaba 1.4 if (not $self->{bom_checked}) {
584     if (defined $self->{bom_pattern}) {
585     if ($self->{byte_buffer} =~ s/^$self->{bom_pattern}//) {
586     $self->{has_bom} = 1;
587     }
588     }
589     $self->{bom_checked} = 1;
590     }
591    
592 wakaba 1.1 my $string = Encode::decode ($self->{perl_encoding_name},
593     $self->{byte_buffer},
594     Encode::FB_QUIET ());
595     if (length $string) {
596     push @{$self->{character_queue}}, split //, $string;
597     $r = shift @{$self->{character_queue}};
598     if (length $self->{byte_buffer}) {
599     $self->{continue} = 1;
600     }
601     } else {
602     if (length $self->{byte_buffer}) {
603     $error = 1;
604     } else {
605     $r = undef;
606     }
607     }
608     }
609    
610     if ($error) {
611     $r = substr $self->{byte_buffer}, 0, 1, '';
612 wakaba 1.6 my $fallback = $self->{fallback}->{$r};
613     if (defined $fallback) {
614 wakaba 1.7 ## NOTE: This is an HTML5 parse error.
615 wakaba 1.6 $self->{onerror}->($self, 'fallback-char-error', octets => \$r,
616     char => \$fallback,
617 wakaba 1.7 level => $self->{level}->{$self->{error_level}->{'fallback-char-error'}});
618 wakaba 1.6 return $fallback;
619 wakaba 1.7 } elsif (exists $self->{fallback}->{$r}) {
620     ## NOTE: This is an HTML5 parse error. In addition, the octet
621     ## is not assigned with a character.
622     $self->{onerror}->($self, 'fallback-unassigned-error', octets => \$r,
623     level => $self->{level}->{$self->{error_level}->{'fallback-unassigned-error'}});
624 wakaba 1.6 } else {
625 wakaba 1.7 $self->{onerror}->($self, 'illegal-octets-error', octets => \$r,
626     level => $self->{level}->{$self->{error_level}->{'illegal-octets-error'}});
627 wakaba 1.6 }
628 wakaba 1.1 }
629    
630     return $r;
631     } # getc
632    
633     sub has_bom ($) { $_[0]->{has_bom} }
634    
635 wakaba 1.2 sub input_encoding ($) {
636     my $v = $_[0]->{input_encoding};
637     return $v if defined $v;
638    
639     my $uri = $_[0]->{charset};
640     if (defined $uri) {
641     return Whatpm::Charset::DecodeHandle->uri_to_name (xml => $uri);
642     }
643    
644     return undef;
645     } # input_encoding
646 wakaba 1.1
647     sub onerror ($;$) {
648     if (@_ > 1) {
649     $_[0]->{onerror} = $_[1];
650     }
651    
652     return $_[0]->{onerror};
653     } # onerror
654    
655     sub ungetc ($$) {
656     unshift @{$_[0]->{character_queue}}, chr int ($_[1] or 0);
657     } # ungetc
658    
659 wakaba 1.8 ## TODO: This is not good for performance. Should be replaced
660     ## by read-centric implementation.
661     sub read ($$$;$) {
662     #my ($self, $scalar, $length, $offset) = @_;
663     my $length = $_[2];
664     my $r = '';
665     while ($length > 0) {
666     my $c = $_[0]->getc;
667     last unless defined $c;
668     $r .= $c;
669     $length--;
670     }
671     substr ($_[1], $_[3]) = $r;
672     ## NOTE: This would do different thing from what Perl's |read| do
673     ## if $offset points beyond the end of the $scalar.
674     return length $r;
675     } # read
676    
677 wakaba 1.1 package Whatpm::Charset::DecodeHandle::EUCJP;
678     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
679    
680     sub getc ($) {
681     my $self = $_[0];
682     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
683    
684     my $error;
685     if ($self->{continue}) {
686 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
687     length $self->{byte_buffer})) {
688 wakaba 1.1 #
689     } else {
690     $error = 1;
691     }
692     $self->{continue} = 0;
693     } elsif (512 > length $self->{byte_buffer}) {
694 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
695     length $self->{byte_buffer});
696 wakaba 1.1 }
697    
698     my $r;
699     unless ($error) {
700     my $string = Encode::decode ($self->{perl_encoding_name},
701     $self->{byte_buffer},
702     Encode::FB_QUIET ());
703     if (length $string) {
704     push @{$self->{character_queue}}, split //, $string;
705     $r = shift @{$self->{character_queue}};
706     if (length $self->{byte_buffer}) {
707     $self->{continue} = 1;
708     }
709     } else {
710     if (length $self->{byte_buffer}) {
711     $error = 1;
712     } else {
713     $r = undef;
714     }
715     }
716     }
717    
718     if ($error) {
719     $r = substr $self->{byte_buffer}, 0, 1, '';
720     my $etype = 'illegal-octets-error';
721     if ($r =~ /^[\xA1-\xFE]/) {
722     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
723     $r .= $1;
724     $etype = 'unassigned-code-point-error';
725     }
726     } elsif ($r eq "\x8F") {
727     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE][\xA1-\xFE]?)//) {
728     $r .= $1;
729     $etype = 'unassigned-code-point-error' if length $1 == 2;
730     }
731     } elsif ($r eq "\x8E") {
732     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
733     $r .= $1;
734     $etype = 'unassigned-code-point-error';
735     }
736     } elsif ($r eq "\xA0" or $r eq "\xFF") {
737     $etype = 'unassigned-code-point-error';
738     }
739 wakaba 1.7 $self->{onerror}->($self, $etype, octets => \$r,
740     level => $self->{level}->{$self->{error_level}->{$etype}});
741 wakaba 1.1 }
742    
743     return $r;
744     } # getc
745    
746     package Whatpm::Charset::DecodeHandle::ISO2022JP;
747     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
748    
749     sub getc ($) {
750     my $self = $_[0];
751     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
752    
753     my $r;
754     A: {
755     my $error;
756     if ($self->{continue}) {
757 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
758     length $self->{byte_buffer})) {
759 wakaba 1.1 #
760     } else {
761     $error = 1;
762     }
763     $self->{continue} = 0;
764     } elsif (512 > length $self->{byte_buffer}) {
765 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
766     length $self->{byte_buffer});
767 wakaba 1.1 }
768    
769     unless ($error) {
770     if ($self->{byte_buffer} =~ s/^\x1B(\x24[\x40\x42]|\x28[\x42\x4A])//) {
771     $self->{state} = {
772     "\x24\x40" => 'state_2440',
773     "\x24\x42" => 'state_2442',
774     "\x28\x42" => 'state_2842',
775     "\x28\x4A" => 'state_284A',
776     }->{$1};
777     redo A;
778     } elsif ($self->{state} eq 'state_2842') { # IRV
779     if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
780     push @{$self->{character_queue}}, split //, $1;
781     $r = shift @{$self->{character_queue}};
782     } else {
783     if (length $self->{byte_buffer}) {
784     $error = 1;
785     } else {
786     $r = undef;
787     }
788     }
789     } elsif ($self->{state} eq 'state_284A') { # 0201
790     if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
791     my $v = $1;
792     $v =~ tr/\x5C\x7E/\xA5\x{203E}/;
793     push @{$self->{character_queue}}, split //, $v;
794     $r = shift @{$self->{character_queue}};
795     } else {
796     if (length $self->{byte_buffer}) {
797     $error = 1;
798     } else {
799     $r = undef;
800     $self->{onerror}->($self, 'invalid-state-error',
801 wakaba 1.7 state => $self->{state},
802     level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}});
803 wakaba 1.1 }
804     }
805     } elsif ($self->{state} eq 'state_2442') { # 1983
806     my $v = Encode::decode ($self->{state_2442},
807     $self->{byte_buffer},
808     Encode::FB_QUIET ());
809     if (length $v) {
810     push @{$self->{character_queue}}, split //, $v;
811     $r = shift @{$self->{character_queue}};
812     } else {
813     if (length $self->{byte_buffer}) {
814     $error = 1;
815     } else {
816     $r = undef;
817     $self->{onerror}->($self, 'invalid-state-error',
818 wakaba 1.7 state => $self->{state},
819     level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}});
820 wakaba 1.1 }
821     }
822     } elsif ($self->{state} eq 'state_2440') { # 1978
823     my $v = Encode::decode ($self->{state_2440},
824     $self->{byte_buffer},
825     Encode::FB_QUIET ());
826     if (length $v) {
827     push @{$self->{character_queue}}, split //, $v;
828     $r = shift @{$self->{character_queue}};
829     } else {
830     if (length $self->{byte_buffer}) {
831     $error = 1;
832     } else {
833     $r = undef;
834     $self->{onerror}->($self, 'invalid-state-error',
835 wakaba 1.7 state => $self->{state},
836     level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}});
837 wakaba 1.1 }
838     }
839     } else {
840     $error = 1;
841     }
842     }
843    
844     if ($error) {
845     $r = substr $self->{byte_buffer}, 0, 1, '';
846     my $etype = 'illegal-octets-error';
847     if (($self->{state} eq 'state_2442' or
848     $self->{state} eq 'state_2440') and
849     $r =~ /^[\x21-\x7E]/ and
850     $self->{byte_buffer} =~ s/^([\x21-\x7E])//) {
851     $r .= $1;
852     $etype = 'unassigned-code-point-error';
853     } elsif ($r eq "\x1B" and
854     $self->{byte_buffer} =~ s/^\(H//) { # Old 0201
855     $r .= "(H";
856     $self->{state} = 'state_284A';
857     }
858 wakaba 1.7 $self->{onerror}->($self, $etype, octets => \$r,
859     level => $self->{level}->{$self->{error_level}->{$etype}});
860 wakaba 1.1 }
861     } # A
862    
863     return $r;
864     } # getc
865    
866     package Whatpm::Charset::DecodeHandle::ShiftJIS;
867     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
868    
869     sub getc ($) {
870     my $self = $_[0];
871     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
872    
873     my $error;
874     if ($self->{continue}) {
875 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
876     length $self->{byte_buffer})) {
877 wakaba 1.1 #
878     } else {
879     $error = 1;
880     }
881     $self->{continue} = 0;
882     } elsif (512 > length $self->{byte_buffer}) {
883 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
884     length $self->{byte_buffer});
885 wakaba 1.1 }
886    
887     my $r;
888     unless ($error) {
889     my $string = Encode::decode ($self->{perl_encoding_name},
890     $self->{byte_buffer},
891     Encode::FB_QUIET ());
892     if (length $string) {
893     push @{$self->{character_queue}}, split //, $string;
894     $r = shift @{$self->{character_queue}};
895     if (length $self->{byte_buffer}) {
896     $self->{continue} = 1;
897     }
898     } else {
899     if (length $self->{byte_buffer}) {
900     $error = 1;
901     } else {
902     $r = undef;
903     }
904     }
905     }
906    
907     if ($error) {
908     $r = substr $self->{byte_buffer}, 0, 1, '';
909     my $etype = 'illegal-octets-error';
910 wakaba 1.5 if ($r =~ /^[\x81-\x9F\xE0-\xFC]/) {
911 wakaba 1.1 if ($self->{byte_buffer} =~ s/(.)//s) {
912     $r .= $1; # not limited to \x40-\xFC - \x7F
913     $etype = 'unassigned-code-point-error';
914     }
915 wakaba 1.5 ## NOTE: Range [\xF0-\xFC] is unassigned and may be used as a single-byte
916     ## character or as the first-byte of a double-byte character according
917     ## to JIS X 0208:1997 Appendix 1. However, the current practice is
918     ## use the range as the first-byte of double-byte characters.
919     } elsif ($r =~ /^[\x80\xA0\xFD-\xFF]/) {
920 wakaba 1.1 $etype = 'unassigned-code-point-error';
921     }
922 wakaba 1.7 $self->{onerror}->($self, $etype, octets => \$r,
923     level => $self->{level}->{$self->{error_level}->{$etype}});
924 wakaba 1.1 }
925    
926     return $r;
927     } # getc
928    
929     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us-ascii'} =
930     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us'} =
931     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso646-us'} =
932     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:cp367'} =
933     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ibm367'} =
934     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1986'} =
935     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1968'} =
936     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-ir-6'} =
937     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:csascii'} =
938     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_646.irv:1991'} =
939     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ascii'} = {ascii8 =>
940     '1',
941     is_block_safe =>
942     '1',
943     ietf_name =>
944     ['ansi_x3.4-1968',
945     'ansi_x3.4-1986',
946     'ascii',
947     'cp367',
948     'csascii',
949     'ibm367',
950     'iso-ir-6',
951     'iso646-us',
952     'iso_646.irv:1991',
953     'us',
954     'us-ascii',
955     'us-ascii'],
956     mime_name =>
957     'us-ascii',
958     perl_name =>
959     ['ascii',
960     'iso-646-us',
961     'us-ascii'],
962     utf8_encoding_scheme =>
963     '1',
964     'uri',
965     {'urn:x-suika-fam-cx:charset:ansi_x3.4-1968',
966     '1',
967     'urn:x-suika-fam-cx:charset:ansi_x3.4-1986',
968     '1',
969     'urn:x-suika-fam-cx:charset:ascii',
970     '1',
971     'urn:x-suika-fam-cx:charset:cp367',
972     '1',
973     'urn:x-suika-fam-cx:charset:csascii',
974     '1',
975     'urn:x-suika-fam-cx:charset:ibm367',
976     '1',
977     'urn:x-suika-fam-cx:charset:iso-ir-6',
978     '1',
979     'urn:x-suika-fam-cx:charset:iso646-us',
980     '1',
981     'urn:x-suika-fam-cx:charset:iso_646.irv:1991',
982     '1',
983     'urn:x-suika-fam-cx:charset:us',
984     '1',
985     'urn:x-suika-fam-cx:charset:us-ascii',
986     '1'},
987 wakaba 1.2 };
988    
989 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl'} = {perl_name =>
990     ['ascii-ctrl'],
991     'uri',
992     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl',
993     '1'}};
994     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null'} = {perl_name =>
995     ['null'],
996     'uri',
997     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null',
998     '1'}};
999     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8'} = {ascii8 =>
1000     '1',
1001     bom_allowed =>
1002     '1',
1003     no_bom_variant =>
1004     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1005     utf8_encoding_scheme =>
1006     '1',
1007     'uri',
1008     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8',
1009     '1'},
1010 wakaba 1.2 xml_name => 'UTF-8',
1011     };
1012    
1013 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279'} = {ascii8 =>
1014     '1',
1015     bom_allowed =>
1016     '1',
1017     no_bom_variant =>
1018     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1019     utf8_encoding_scheme =>
1020     '1',
1021     'uri',
1022     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279',
1023     '1'}};
1024     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8'} = {
1025     ascii8 => 1,
1026     is_block_safe =>
1027     '1',
1028     perl_name =>
1029     ['utf-8'],
1030     utf8_encoding_scheme =>
1031     '1',
1032     'uri',
1033     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
1034     '1'}};
1035     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-8'} = {
1036     ascii8 => 1,
1037     bom_allowed =>
1038     '1',
1039     no_bom_variant =>
1040     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
1041     ietf_name =>
1042     ['utf-8'],
1043     mime_name =>
1044     'utf-8',
1045     utf8_encoding_scheme =>
1046     '1',
1047     'uri',
1048     {'urn:x-suika-fam-cx:charset:utf-8',
1049     '1'},
1050 wakaba 1.2 };
1051    
1052 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8'} = {ascii8 =>
1053     '1',
1054     is_block_safe =>
1055     '1',
1056     perl_name =>
1057     ['utf8'],
1058     utf8_encoding_scheme =>
1059     '1',
1060     'uri',
1061     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1062     '1'}};
1063     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16'} = {
1064     ascii16 => 1,
1065     bom_allowed =>
1066     '1',
1067     bom_required =>
1068     '1',
1069     no_bom_variant =>
1070     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1071     no_bom_variant16be =>
1072     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1073     no_bom_variant16le =>
1074     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1075     perl_name =>
1076     ['utf-16'],
1077     'uri',
1078     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16',
1079     '1'},
1080 wakaba 1.2 xml_name => 'UTF-16',
1081     };
1082    
1083 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16'} = {
1084     ascii16 => 1,
1085     bom_allowed =>
1086     '1',
1087     no_bom_variant =>
1088     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1089     no_bom_variant16be =>
1090     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1091     no_bom_variant16le =>
1092     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1093     ietf_name =>
1094     ['utf-16'],
1095     mime_name =>
1096     'utf-16',
1097     'uri',
1098     {'urn:x-suika-fam-cx:charset:utf-16',
1099     '1'},
1100 wakaba 1.2 };
1101    
1102 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16be'} = {
1103     ascii16 => 1,
1104     ascii16be => 1,
1105     bom_allowed =>
1106     '1',
1107     no_bom_variant =>
1108     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1109     no_bom_variant16be =>
1110     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1111     ietf_name =>
1112     ['utf-16be'],
1113     mime_name =>
1114     'utf-16be',
1115     'uri',
1116     {'urn:x-suika-fam-cx:charset:utf-16be',
1117     '1'},
1118 wakaba 1.2 };
1119    
1120 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16le'} = {
1121     ascii16 => 1,
1122     ascii16le => 1,
1123     bom_allowed =>
1124     '1',
1125     no_bom_variant =>
1126     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1127     no_bom_variant16le =>
1128     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1129     ietf_name =>
1130     ['utf-16le'],
1131     mime_name =>
1132     'utf-16le',
1133     'uri',
1134     {'urn:x-suika-fam-cx:charset:utf-16le',
1135     '1'},
1136 wakaba 1.2 };
1137    
1138 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be'} = {
1139     ascii16 => 1,
1140     ascii16be => 1,
1141     is_block_safe =>
1142     '1',
1143     perl_name =>
1144     ['utf-16be'],
1145     'uri',
1146     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1147     '1'}};
1148     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le'} = {
1149     ascii16 => 1,
1150     ascii16le => 1,
1151     is_block_safe =>
1152     '1',
1153     perl_name =>
1154     ['utf-16le'],
1155     'uri',
1156     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1157     '1'}};
1158     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-2'} = {
1159     ascii16 => 1,
1160     bom_allowed =>
1161     '1',
1162     no_bom_variant =>
1163     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1164     no_bom_variant16be =>
1165     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1166     no_bom_variant16le =>
1167     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1168     ietf_name =>
1169     ['csunicode',
1170     'iso-10646-ucs-2'],
1171     mime_name =>
1172     'iso-10646-ucs-2',
1173     'uri',
1174     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2',
1175     '1',
1176     'urn:x-suika-fam-cx:charset:iso-10646-ucs-2',
1177     '1'},
1178 wakaba 1.2 xml_name => 'ISO-10646-UCS-2',
1179     };
1180    
1181 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be'} = {
1182     ascii16 => 1,
1183     ascii16be => 1,
1184     is_block_safe =>
1185     '1',
1186     perl_name =>
1187     ['ucs-2be'],
1188     'uri',
1189     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1190     '1'}};
1191     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le'} = {
1192     ascii16 => 1,
1193     ascii16le => 1,
1194     is_block_safe =>
1195     '1',
1196     perl_name =>
1197     ['ucs-2le'],
1198     'uri',
1199     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1200     '1'}};
1201     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-4'} = {
1202     ascii32 => 1,
1203     bom_allowed =>
1204     '1',
1205     no_bom_variant =>
1206     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1207     no_bom_variant32endian1234 =>
1208     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1209     no_bom_variant32endian4321 =>
1210     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1211     ietf_name =>
1212     ['csucs4',
1213     'iso-10646-ucs-4'],
1214     mime_name =>
1215     'iso-10646-ucs-4',
1216     'uri',
1217     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4',
1218     '1',
1219     'urn:x-suika-fam-cx:charset:iso-10646-ucs-4',
1220     '1'},
1221 wakaba 1.2 xml_name => 'ISO-10646-UCS-4',
1222     };
1223    
1224 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be'} = {
1225     ascii32 => 1,
1226     ascii32endian1234 => 1,
1227     is_block_safe =>
1228     '1',
1229     perl_name =>
1230     ['ucs-4be',
1231     'utf-32be'],
1232     'uri',
1233     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1234     '1'}};
1235     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le'} = {
1236     ascii32 => 1,
1237     ascii32endian4321 => 1,
1238     is_block_safe =>
1239     '1',
1240     perl_name =>
1241     ['ucs-4le',
1242     'utf-32le'],
1243     'uri',
1244     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1245     '1'}};
1246     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_8859-1:1987'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1'} = {ascii8 =>
1247     '1',
1248     is_block_safe =>
1249     '1',
1250     ietf_name =>
1251     ['cp819',
1252     'csisolatin1',
1253     'ibm819',
1254     'iso-8859-1',
1255     'iso-8859-1',
1256     'iso-ir-100',
1257     'iso_8859-1',
1258     'iso_8859-1:1987',
1259     'l1',
1260     'latin1'],
1261     mime_name =>
1262     'iso-8859-1',
1263     perl_name =>
1264     ['iso-8859-1',
1265     'latin1'],
1266     'uri',
1267     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1',
1268     '1',
1269     'urn:x-suika-fam-cx:charset:iso_8859-1:1987',
1270     '1'},
1271 wakaba 1.2 xml_name => 'ISO-8859-1',
1272     };
1273    
1274 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2'} = {ascii8 =>
1275     '1',
1276     is_block_safe =>
1277     '1',
1278     'uri',
1279     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2',
1280     '1'},
1281 wakaba 1.2 xml_name => 'ISO-8859-2',
1282     };
1283    
1284 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3'} = {ascii8 =>
1285     '1',
1286     is_block_safe =>
1287     '1',
1288     'uri',
1289     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3',
1290     '1'},
1291 wakaba 1.2 xml_name => 'ISO-8859-3',
1292     };
1293    
1294 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4'} = {ascii8 =>
1295     '1',
1296     is_block_safe =>
1297     '1',
1298     'uri',
1299     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4',
1300     '1'},
1301 wakaba 1.2 xml_name => 'ISO-8859-4',
1302     };
1303    
1304 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5'} = {ascii8 =>
1305     '1',
1306     is_block_safe =>
1307     '1',
1308     'uri',
1309     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5',
1310     '1'},
1311 wakaba 1.2 xml_name => 'ISO-8859-5',
1312     };
1313    
1314 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6'} = {ascii8 =>
1315     '1',
1316     is_block_safe =>
1317     '1',
1318     'uri',
1319     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6',
1320     '1'},
1321 wakaba 1.2 xml_name => 'ISO-8859-6',
1322     };
1323    
1324 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7'} = {ascii8 =>
1325     '1',
1326     is_block_safe =>
1327     '1',
1328     'uri',
1329     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7',
1330     '1'},
1331 wakaba 1.2 xml_name => 'ISO-8859-7',
1332     };
1333    
1334 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8'} = {ascii8 =>
1335     '1',
1336     is_block_safe =>
1337     '1',
1338     'uri',
1339     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8',
1340     '1'},
1341 wakaba 1.2 xml_name => 'ISO-8859-8',
1342     };
1343    
1344 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9'} = {ascii8 =>
1345     '1',
1346     is_block_safe =>
1347     '1',
1348     'uri',
1349     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9',
1350     '1'},
1351 wakaba 1.2 xml_name => 'ISO-8859-9',
1352     };
1353    
1354 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10'} = {ascii8 =>
1355     '1',
1356     is_block_safe =>
1357     '1',
1358     'uri',
1359     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10',
1360     '1'},
1361 wakaba 1.2 xml_name => 'ISO-8859-10',
1362     };
1363    
1364 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11'} = {ascii8 =>
1365     '1',
1366     is_block_safe =>
1367     '1',
1368     'uri',
1369     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11',
1370     '1'},
1371 wakaba 1.2 xml_name => 'ISO-8859-11',
1372     };
1373    
1374 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13'} = {ascii8 =>
1375     '1',
1376     is_block_safe =>
1377     '1',
1378     'uri',
1379     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13',
1380     '1'},
1381 wakaba 1.2 xml_name => 'ISO-8859-13',
1382     };
1383    
1384 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14'} = {ascii8 =>
1385     '1',
1386     is_block_safe =>
1387     '1',
1388     'uri',
1389     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14',
1390     '1'},
1391 wakaba 1.2 xml_name => 'ISO-8859-14',
1392     };
1393    
1394 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15'} = {ascii8 =>
1395     '1',
1396     is_block_safe =>
1397     '1',
1398     'uri',
1399     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15',
1400     '1'},
1401 wakaba 1.2 xml_name => 'ISO-8859-15',
1402     };
1403    
1404 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16'} = {ascii8 =>
1405     '1',
1406     is_block_safe =>
1407     '1',
1408     'uri',
1409     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16',
1410     '1'},
1411 wakaba 1.2 xml_name => 'ISO-8859-16',
1412     };
1413    
1414 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp'} = {ascii8 =>
1415     '1',
1416     'uri',
1417     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp',
1418     '1'},
1419 wakaba 1.2 xml_name => 'ISO-2022-JP',
1420     };
1421    
1422 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-2022-jp'} = {ascii8 =>
1423     '1',
1424     ietf_name =>
1425     ['csiso2022jp',
1426     'iso-2022-jp',
1427     'iso-2022-jp'],
1428     mime_name =>
1429     'iso-2022-jp',
1430     'uri',
1431     {'urn:x-suika-fam-cx:charset:iso-2022-jp',
1432     '1'},
1433 wakaba 1.2 };
1434    
1435 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp'} = {ascii8 =>
1436     '1',
1437     perl_name =>
1438     ['iso-2022-jp'],
1439     'uri',
1440     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp',
1441     '1'}};
1442     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:shift_jis'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis'} = {ascii8 =>
1443     '1',
1444     is_block_safe =>
1445     '1',
1446     ietf_name =>
1447     ['csshiftjis',
1448     'ms_kanji',
1449     'shift_jis',
1450     'shift_jis'],
1451     mime_name =>
1452     'shift_jis',
1453     perl_name =>
1454     ['shift-jis-1997'],
1455     'uri',
1456     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis',
1457     '1',
1458     'urn:x-suika-fam-cx:charset:shift_jis',
1459     '1'},
1460 wakaba 1.2 xml_name => 'Shift_JIS',
1461     };
1462    
1463 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis'} = {ascii8 =>
1464     '1',
1465     is_block_safe =>
1466     '1',
1467     perl_name =>
1468     ['shiftjis',
1469     'sjis'],
1470     'uri',
1471     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis',
1472     '1'}};
1473     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:euc-jp'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese'} = {ascii8 =>
1474     '1',
1475     is_block_safe =>
1476     '1',
1477     ietf_name =>
1478     ['cseucpkdfmtjapanese',
1479     'euc-jp',
1480     'euc-jp',
1481     'extended_unix_code_packed_format_for_japanese'],
1482     mime_name =>
1483     'euc-jp',
1484     perl_name =>
1485     ['euc-jp-1997'],
1486     'uri',
1487     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp',
1488     '1',
1489     'urn:x-suika-fam-cx:charset:euc-jp',
1490     '1',
1491     'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese',
1492     '1'},
1493 wakaba 1.2 xml_name => 'EUC-JP',
1494     };
1495    
1496 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp'} = {ascii8 =>
1497     '1',
1498     is_block_safe =>
1499     '1',
1500     perl_name =>
1501     ['euc-jp',
1502     'ujis'],
1503     'uri',
1504     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp',
1505     '1'}};
1506    
1507     1;
1508 wakaba 1.8 ## $Date: 2008/09/10 10:27:09 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24