/[suikacvs]/markup/html/whatpm/Whatpm/Charset/DecodeHandle.pm
Suika

Contents of /markup/html/whatpm/Whatpm/Charset/DecodeHandle.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (hide annotations) (download)
Sun May 18 04:15:52 2008 UTC (17 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.4: +7 -3 lines
++ whatpm/Whatpm/ChangeLog	18 May 2008 04:15:00 -0000
	* HTML.pm.src (parse_byte_string): Redefined to invoke
	|parse_byte_stream|.
	(parse_byte_stream): New method.

2008-05-18  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/Charset/ChangeLog	18 May 2008 04:15:42 -0000
	* DecodeHandle.pm (ShiftJIS): \xF0-\xFC should be considered
	as part of double-byte characters for more user-friendly error
	recovery.

2008-05-18  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::Charset::DecodeHandle;
2     use strict;
3    
4     my $XML_AUTO_CHARSET = q<http://suika.fam.cx/www/2006/03/xml-entity/>;
5     my $IANA_CHARSET = q<urn:x-suika-fam-cx:charset:>;
6     my $PERL_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.>;
7     my $XML_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.>;
8    
9     ## ->create_decode_handle ($charset_uri, $byte_stream, $onerror)
10     sub create_decode_handle ($$$;$) {
11     my $csdef = $Whatpm::Charset::CharsetDef->{$_[1]};
12     my $obj = {
13     character_queue => [],
14     filehandle => $_[2],
15     charset => $_[1],
16     byte_buffer => '',
17     onerror => $_[3] || sub {},
18     };
19     if ($csdef->{uri}->{$XML_AUTO_CHARSET} or
20     $obj->{charset} eq $XML_AUTO_CHARSET) {
21     my $b = ''; # UTF-8 w/o BOM
22     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
23 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
24 wakaba 1.1 if (read $obj->{filehandle}, $b, 256) {
25     no warnings "substr";
26     no warnings "uninitialized";
27     if (substr ($b, 0, 1) eq "<") {
28     if (substr ($b, 1, 1) eq "?") { # ASCII8
29     if ($b =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
30     encoding\s*=\s*["']([^"']*)/x) {
31 wakaba 1.2 $obj->{input_encoding} = $1;
32 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
33     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
34     if (not $csdef->{ascii8} or $csdef->{bom_required}) {
35     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
36     charset_uri => $uri,
37     charset_name => $obj->{input_encoding});
38     }
39     } else {
40     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
41 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
42 wakaba 1.1 }
43     if (defined $csdef->{no_bom_variant}) {
44     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
45     }
46     } elsif (substr ($b, 1, 1) eq "\x00") {
47     if (substr ($b, 2, 2) eq "?\x00") { # ASCII16LE
48     my $c = $b; $c =~ tr/\x00//d;
49     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
50     encoding\s*=\s*["']([^"']*)/x) {
51 wakaba 1.2 $obj->{input_encoding} = $1;
52 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
53     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
54     if (not $csdef->{ascii16} or $csdef->{ascii16be} or
55     $csdef->{bom_required}) {
56     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
57     charset_uri => $uri,
58     charset_name => $obj->{input_encoding});
59     }
60     } else {
61     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
62 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
63 wakaba 1.1 }
64     if (defined $csdef->{no_bom_variant16le}) {
65     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
66     }
67     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
68     my $c = $b; $c =~ tr/\x00//d;
69     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
70     encoding\s*=\s*["']([^"']*)/x) {
71 wakaba 1.2 $obj->{input_encoding} = $1;
72 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
73     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
74     if (not $csdef->{ascii32} or
75     $csdef->{ascii32endian1234} or
76     $csdef->{ascii32endian2143} or
77     $csdef->{ascii32endian3412} or
78     $csdef->{bom_required}) {
79     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
80     charset_uri => $uri,
81     charset_name => $obj->{input_encoding});
82     }
83     } else {
84     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
85 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
86 wakaba 1.1 }
87     if (defined $csdef->{no_bom_variant32endian4321}) {
88     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
89     }
90     }
91     }
92     } elsif (substr ($b, 0, 3) eq "\xEF\xBB\xBF") { # UTF8
93     $obj->{has_bom} = 1;
94     substr ($b, 0, 3) = '';
95     my $c = $b;
96     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
97     encoding\s*=\s*["']([^"']*)/x) {
98 wakaba 1.2 $obj->{input_encoding} = $1;
99 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
100     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
101     if (not $csdef->{utf8_encoding_scheme} or
102     not $csdef->{bom_allowed}) {
103     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
104     charset_uri => $uri,
105     charset_name => $obj->{input_encoding});
106     }
107     } else {
108     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
109 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
110 wakaba 1.1 }
111     if (defined $csdef->{no_bom_variant}) {
112     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
113     }
114     } elsif (substr ($b, 0, 2) eq "\x00<") {
115     if (substr ($b, 2, 2) eq "\x00?") { # ASCII16BE
116     my $c = $b; $c =~ tr/\x00//d;
117     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
118     encoding\s*=\s*["']([^"']*)/x) {
119 wakaba 1.2 $obj->{input_encoding} = $1;
120 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
121     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
122     if (not $csdef->{ascii16} or $csdef->{ascii16le} or
123     $csdef->{bom_required}) {
124     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
125     charset_uri => $uri,
126     charset_name => $obj->{input_encoding});
127     }
128     } else {
129     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
130 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
131 wakaba 1.1 }
132     if (defined $csdef->{no_bom_variant16be}) {
133     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
134     }
135     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
136     my $c = $b; $c =~ tr/\x00//d;
137     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
138     encoding\s*=\s*["']([^"']*)/x) {
139 wakaba 1.2 $obj->{input_encoding} = $1;
140 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
141     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
142     if (not $csdef->{ascii32} or
143     $csdef->{ascii32endian1234} or
144     $csdef->{ascii32endian2143} or
145     $csdef->{ascii32endian4321} or
146     $csdef->{bom_required}) {
147     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
148     charset_uri => $uri,
149     charset_name => $obj->{input_encoding});
150     }
151     } else {
152     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
153 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
154 wakaba 1.1 }
155     if (defined $csdef->{no_bom_variant32endian3412}) {
156     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
157     }
158     }
159     } elsif (substr ($b, 0, 2) eq "\xFE\xFF") {
160     if (substr ($b, 2, 2) eq "\x00<") { # ASCII16BE
161     $obj->{has_bom} = 1;
162     substr ($b, 0, 2) = '';
163     my $c = $b; $c =~ tr/\x00//d;
164     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
165     encoding\s*=\s*["']([^"']*)/x) {
166 wakaba 1.2 $obj->{input_encoding} = $1;
167 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
168     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
169     if (not $csdef->{ascii16} or
170     $csdef->{ascii16le} or
171     not $csdef->{bom_allowed}) {
172     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
173     charset_uri => $uri,
174     charset_name => $obj->{input_encoding});
175     }
176     } else {
177     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
178 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
179 wakaba 1.1 }
180     if (defined $csdef->{no_bom_variant16be}) {
181     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
182     }
183     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
184     $obj->{has_bom} = 1;
185     substr ($b, 0, 4) = '';
186     my $c = $b; $c =~ tr/\x00//d;
187     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
188     encoding\s*=\s*["']([^"']*)/x) {
189 wakaba 1.2 $obj->{input_encoding} = $1;
190 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
191     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
192     if (not $csdef->{ascii32} or
193     $csdef->{ascii32endian1234} or
194     $csdef->{ascii32endian2143} or
195     $csdef->{ascii32endian4321} or
196     not $csdef->{bom_allowed}) {
197     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
198     charset_uri => $uri,
199     charset_name => $obj->{input_encoding});
200     }
201     } else {
202     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
203 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
204 wakaba 1.1 $obj->{byte_buffer} .= "\x00\x00";
205     }
206     if (defined $csdef->{no_bom_variant32endian3412}) {
207     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
208     }
209     } else {
210     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
211 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
212 wakaba 1.1 substr ($b, 0, 2) = '';
213     $obj->{has_bom} = 1;
214     }
215     } elsif (substr ($b, 0, 2) eq "\xFF\xFE") {
216     if (substr ($b, 2, 2) eq "<\x00") { # ASCII16LE
217     $obj->{has_bom} = 1;
218     substr ($b, 0, 2) = '';
219     my $c = $b; $c =~ tr/\x00//d;
220     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
221     encoding\s*=\s*["']([^"']*)/x) {
222 wakaba 1.2 $obj->{input_encoding} = $1;
223 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
224     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
225     if (not $csdef->{ascii16} or
226     $csdef->{ascii16be} or
227     not $csdef->{bom_allowed}) {
228     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
229     charset_uri => $uri,
230     charset_name => $obj->{input_encoding});
231     }
232     } else {
233     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
234 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
235 wakaba 1.1 }
236     if (defined $csdef->{no_bom_variant16le}) {
237     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
238     }
239     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
240     $obj->{has_bom} = 1;
241     substr ($b, 0, 4) = '';
242     my $c = $b; $c =~ tr/\x00//d;
243     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
244     encoding\s*=\s*["']([^"']*)/x) {
245 wakaba 1.2 $obj->{input_encoding} = $1;
246 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
247     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
248     if (not $csdef->{ascii32} or
249     $csdef->{ascii32endian1234} or
250     $csdef->{ascii32endian2143} or
251     $csdef->{ascii32endian3412} or
252     not $csdef->{bom_allowed}) {
253     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
254     charset_uri => $uri,
255     charset_name => $obj->{input_encoding});
256     }
257     } else {
258     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
259 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
260 wakaba 1.1 $obj->{byte_buffer} .= "\x00\x00";
261     }
262     if (defined $csdef->{no_bom_variant32endian4321}) {
263     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
264     }
265     } else {
266     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
267 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
268 wakaba 1.1 substr ($b, 0, 2) = '';
269     $obj->{has_bom} = 1;
270     }
271     } elsif (substr ($b, 0, 2) eq "\x00\x00") {
272     if (substr ($b, 2, 2) eq "\x00<") { # ASCII32Endian1234
273     my $c = $b; $c =~ tr/\x00//d;
274     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
275     encoding\s*=\s*["']([^"']*)/x) {
276 wakaba 1.2 $obj->{input_encoding} = $1;
277 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
278     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
279     if (not $csdef->{ascii32} or
280     $csdef->{ascii32endian2143} or
281     $csdef->{ascii32endian3412} or
282     $csdef->{ascii32endian4321} or
283     $csdef->{bom_required}) {
284     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
285     charset_uri => $uri,
286     charset_name => $obj->{input_encoding});
287     }
288     } else {
289     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
290 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
291 wakaba 1.1 }
292     if (defined $csdef->{no_bom_variant32endian1234}) {
293     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
294     }
295     } elsif (substr ($b, 2, 2) eq "<\x00") { # ASCII32Endian2143
296     my $c = $b; $c =~ tr/\x00//d;
297     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
298     encoding\s*=\s*["']([^"']*)/x) {
299 wakaba 1.2 $obj->{input_encoding} = $1;
300 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
301     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
302     if (not $csdef->{ascii32} or
303     $csdef->{ascii32endian1234} or
304     $csdef->{ascii32endian3412} or
305     $csdef->{ascii32endian4321} or
306     $csdef->{bom_required}) {
307     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
308     charset_uri => $uri,
309     charset_name => $obj->{input_encoding});
310     }
311     } else {
312     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
313 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
314 wakaba 1.1 }
315     if (defined $csdef->{no_bom_variant32endian2143}) {
316     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
317     }
318     } elsif (substr ($b, 2, 2) eq "\xFE\xFF") { # ASCII32Endian1234
319     $obj->{has_bom} = 1;
320     substr ($b, 0, 4) = '';
321     my $c = $b; $c =~ tr/\x00//d;
322     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
323     encoding\s*=\s*["']([^"']*)/x) {
324 wakaba 1.2 $obj->{input_encoding} = $1;
325 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
326     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
327     if (not $csdef->{ascii32} or
328     $csdef->{ascii32endian2143} or
329     $csdef->{ascii32endian3412} or
330     $csdef->{ascii32endian4321} or
331     $csdef->{bom_required}) {
332     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
333     charset_uri => $uri,
334     charset_name => $obj->{input_encoding});
335     }
336     } else {
337     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
338 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
339 wakaba 1.1 $obj->{has_bom} = 0;
340     $obj->{byte_buffer} .= "\x00\x00\xFE\xFF";
341     }
342     if (defined $csdef->{no_bom_variant32endian1234}) {
343     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
344     }
345     } elsif (substr ($b, 2, 2) eq "\xFF\xFE") { # ASCII32Endian2143
346     $obj->{has_bom} = 1;
347     substr ($b, 0, 4) = '';
348     my $c = $b; $c =~ tr/\x00//d;
349     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
350     encoding\s*=\s*["']([^"']*)/x) {
351 wakaba 1.2 $obj->{input_encoding} = $1;
352 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
353     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
354     if (not $csdef->{ascii32} or
355     $csdef->{ascii32endian1234} or
356     $csdef->{ascii32endian3412} or
357     $csdef->{ascii32endian4321} or
358     $csdef->{bom_required}) {
359     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
360     charset_uri => $uri,
361     charset_name => $obj->{input_encoding});
362     }
363     } else {
364     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
365 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
366 wakaba 1.1 $obj->{has_bom} = 0;
367     $obj->{byte_buffer} .= "\x00\x00\xFF\xFE";
368     }
369     if (defined $csdef->{no_bom_variant32endian2143}) {
370     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
371     }
372     }
373     # \x4C\x6F\xA7\x94 EBCDIC
374     } # buffer
375     $obj->{byte_buffer} .= $b;
376     } # read
377     } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-8'}) {
378     ## BOM is optional.
379     my $b = '';
380     if (read $obj->{filehandle}, $b, 3) {
381     if ($b eq "\xEF\xBB\xBF") {
382     $obj->{has_bom} = 1;
383     } else {
384     $obj->{byte_buffer} .= $b;
385     }
386     }
387     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; # UTF-8 w/o BOM
388     } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-16'}) {
389     ## BOM is mandated.
390     my $b = '';
391     if (read $obj->{filehandle}, $b, 2) {
392     if ($b eq "\xFE\xFF") {
393     $obj->{has_bom} = 1; # UTF-16BE w/o BOM
394     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
395     } elsif ($b eq "\xFF\xFE") {
396     $obj->{has_bom} = 1; # UTF-16LE w/o BOM
397     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
398     } else {
399     $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
400     $obj->{has_bom} = 0;
401     $obj->{byte_buffer} .= $b; # UTF-16BE w/o BOM
402     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
403     }
404     } else {
405     $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
406     $obj->{has_bom} = 0; # UTF-16BE w/o BOM
407     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
408     }
409     }
410    
411     if ($csdef->{uri}->{$XML_CHARSET.'iso-2022-jp'}) {
412     $obj->{state_2440} = 'gl-jis-1997-swapped';
413     $obj->{state_2442} = 'gl-jis-1997';
414     $obj->{state} = 'state_2842';
415     require Encode::GLJIS1997Swapped;
416     require Encode::GLJIS1997;
417     if (Encode::find_encoding ($obj->{state_2440}) and
418     Encode::find_encoding ($obj->{state_2442})) {
419     return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
420     }
421     } elsif ($csdef->{uri}->{$IANA_CHARSET.'iso-2022-jp'}) {
422     $obj->{state_2440} = 'gl-jis-1978';
423     $obj->{state_2442} = 'gl-jis-1983';
424     $obj->{state} = 'state_2842';
425     require Encode::GLJIS1978;
426     require Encode::GLJIS1983;
427     if (Encode::find_encoding ($obj->{state_2440}) and
428     Encode::find_encoding ($obj->{state_2442})) {
429     return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
430     }
431     } elsif (defined $csdef->{perl_name}->[0]) {
432     if ($csdef->{uri}->{$XML_CHARSET.'euc-jp'} or
433     $csdef->{uri}->{$IANA_CHARSET.'euc-jp'}) {
434     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
435     require Encode::EUCJP1997;
436     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
437     return bless $obj, 'Whatpm::Charset::DecodeHandle::EUCJP';
438     }
439     } elsif ($csdef->{uri}->{$XML_CHARSET.'shift_jis'} or
440     $csdef->{uri}->{$IANA_CHARSET.'shift_jis'}) {
441     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
442     require Encode::ShiftJIS1997;
443     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
444     return bless $obj, 'Whatpm::Charset::DecodeHandle::ShiftJIS';
445     }
446     } elsif ($csdef->{is_block_safe}) {
447     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
448     require Encode;
449     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
450     return bless $obj, 'Whatpm::Charset::DecodeHandle::Encode';
451     }
452     }
453     }
454    
455     $obj->{onerror}->(undef, 'charset-not-supported-error',
456     charset_uri => $obj->{charset});
457     return undef;
458     } # create_decode_handle
459    
460     sub name_to_uri ($$$) {
461     my $domain = $_[1];
462     my $name = lc $_[2];
463    
464     if ($domain eq 'ietf') {
465     return $IANA_CHARSET . $name;
466     } elsif ($domain eq 'xml') {
467     if ({
468     'utf-8' => 1,
469     'utf-16' => 1,
470     'iso-10646-ucs-2' => 1,
471     'iso-10646-ucs-4' => 1,
472     'iso-8859-1' => 1,
473     'iso-8859-2' => 1,
474     'iso-8859-3' => 1,
475     'iso-8859-4' => 1,
476     'iso-8859-5' => 1,
477     'iso-8859-6' => 1,
478     'iso-8859-7' => 1,
479     'iso-8859-8' => 1,
480     'iso-8859-9' => 1,
481     'iso-8859-10' => 1,
482     'iso-8859-11' => 1,
483     'iso-8859-13' => 1,
484     'iso-8859-14' => 1,
485     'iso-8859-15' => 1,
486     'iso-8859-16' => 1,
487     'iso-2022-jp' => 1,
488     'shift_jis' => 1,
489     'euc-jp' => 1,
490     }->{$name}) {
491     return $XML_CHARSET . $name;
492     }
493    
494     my $uri = $IANA_CHARSET . $name;
495     return $uri if $Whatpm::Charset::CharsetDef->{$uri};
496    
497     return $XML_CHARSET . $name;
498     } else {
499     return undef;
500     }
501     } # name_to_uri
502    
503     sub uri_to_name ($$$) {
504     my (undef, $domain, $uri) = @_;
505 wakaba 1.2
506     if ($domain eq 'xml') {
507     my $v = $Whatpm::Charset::CharsetDef->{$uri}->{xml_name};
508     return $v if defined $v;
509    
510     if (substr ($uri, 0, length $XML_CHARSET) eq $XML_CHARSET) {
511     return substr ($uri, length $XML_CHARSET);
512     }
513    
514     $domain = 'ietf'; ## TODO: XML encoding name has smaller range
515     }
516    
517     if ($domain eq 'ietf') {
518     my $v = $Whatpm::Charset::CharsetDef->{$uri}->{iana_name};
519     return $v->[0] if defined $v;
520    
521     if (substr ($uri, 0, length $IANA_CHARSET) eq $IANA_CHARSET) {
522 wakaba 1.1 return substr ($uri, length $IANA_CHARSET);
523     }
524     }
525    
526     return undef;
527     } # uri_to_name
528    
529 wakaba 1.3 require IO::Handle;
530    
531     package Whatpm::Charset::DecodeHandle::ByteBuffer;
532    
533     sub new ($$) {
534     my $self = bless {
535     buffer => '',
536     }, shift;
537     $self->{filehandle} = shift;
538     return $self;
539     } # new
540    
541     sub read {
542     my $self = shift;
543     my $pos = length $self->{buffer};
544     my $r = $self->{filehandle}->read ($self->{buffer}, $_[1], $pos);
545     substr ($_[0], $_[2]) = substr ($self->{buffer}, $pos);
546     return $r;
547     } # read
548    
549     sub close { $_[0]->{filehandle}->close }
550    
551 wakaba 1.1 package Whatpm::Charset::DecodeHandle::Encode;
552    
553     sub charset ($) { $_[0]->{charset} }
554    
555 wakaba 1.3 sub close ($) { $_[0]->{filehandle}->close }
556 wakaba 1.1
557     sub getc ($) {
558     my $self = $_[0];
559     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
560    
561     my $error;
562     if ($self->{continue}) {
563 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
564     length $self->{byte_buffer})) {
565 wakaba 1.1 #
566     } else {
567     $error = 1;
568     }
569     $self->{continue} = 0;
570     } elsif (512 > length $self->{byte_buffer}) {
571 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
572     length $self->{byte_buffer});
573 wakaba 1.1 }
574    
575     my $r;
576     unless ($error) {
577 wakaba 1.4 if (not $self->{bom_checked}) {
578     if (defined $self->{bom_pattern}) {
579     if ($self->{byte_buffer} =~ s/^$self->{bom_pattern}//) {
580     $self->{has_bom} = 1;
581     }
582     }
583     $self->{bom_checked} = 1;
584     }
585    
586 wakaba 1.1 my $string = Encode::decode ($self->{perl_encoding_name},
587     $self->{byte_buffer},
588     Encode::FB_QUIET ());
589     if (length $string) {
590     push @{$self->{character_queue}}, split //, $string;
591     $r = shift @{$self->{character_queue}};
592     if (length $self->{byte_buffer}) {
593     $self->{continue} = 1;
594     }
595     } else {
596     if (length $self->{byte_buffer}) {
597     $error = 1;
598     } else {
599     $r = undef;
600     }
601     }
602     }
603    
604     if ($error) {
605     $r = substr $self->{byte_buffer}, 0, 1, '';
606     $self->{onerror}->($self, 'illegal-octets-error', octets => \$r);
607     }
608    
609     return $r;
610     } # getc
611    
612     sub has_bom ($) { $_[0]->{has_bom} }
613    
614 wakaba 1.2 sub input_encoding ($) {
615     my $v = $_[0]->{input_encoding};
616     return $v if defined $v;
617    
618     my $uri = $_[0]->{charset};
619     if (defined $uri) {
620     return Whatpm::Charset::DecodeHandle->uri_to_name (xml => $uri);
621     }
622    
623     return undef;
624     } # input_encoding
625 wakaba 1.1
626     sub onerror ($;$) {
627     if (@_ > 1) {
628     $_[0]->{onerror} = $_[1];
629     }
630    
631     return $_[0]->{onerror};
632     } # onerror
633    
634     sub ungetc ($$) {
635     unshift @{$_[0]->{character_queue}}, chr int ($_[1] or 0);
636     } # ungetc
637    
638     package Whatpm::Charset::DecodeHandle::EUCJP;
639     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
640    
641     sub getc ($) {
642     my $self = $_[0];
643     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
644    
645     my $error;
646     if ($self->{continue}) {
647 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
648     length $self->{byte_buffer})) {
649 wakaba 1.1 #
650     } else {
651     $error = 1;
652     }
653     $self->{continue} = 0;
654     } elsif (512 > length $self->{byte_buffer}) {
655 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
656     length $self->{byte_buffer});
657 wakaba 1.1 }
658    
659     my $r;
660     unless ($error) {
661     my $string = Encode::decode ($self->{perl_encoding_name},
662     $self->{byte_buffer},
663     Encode::FB_QUIET ());
664     if (length $string) {
665     push @{$self->{character_queue}}, split //, $string;
666     $r = shift @{$self->{character_queue}};
667     if (length $self->{byte_buffer}) {
668     $self->{continue} = 1;
669     }
670     } else {
671     if (length $self->{byte_buffer}) {
672     $error = 1;
673     } else {
674     $r = undef;
675     }
676     }
677     }
678    
679     if ($error) {
680     $r = substr $self->{byte_buffer}, 0, 1, '';
681     my $etype = 'illegal-octets-error';
682     if ($r =~ /^[\xA1-\xFE]/) {
683     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
684     $r .= $1;
685     $etype = 'unassigned-code-point-error';
686     }
687     } elsif ($r eq "\x8F") {
688     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE][\xA1-\xFE]?)//) {
689     $r .= $1;
690     $etype = 'unassigned-code-point-error' if length $1 == 2;
691     }
692     } elsif ($r eq "\x8E") {
693     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
694     $r .= $1;
695     $etype = 'unassigned-code-point-error';
696     }
697     } elsif ($r eq "\xA0" or $r eq "\xFF") {
698     $etype = 'unassigned-code-point-error';
699     }
700     $self->{onerror}->($self, $etype, octets => \$r);
701     }
702    
703     return $r;
704     } # getc
705    
706     package Whatpm::Charset::DecodeHandle::ISO2022JP;
707     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
708    
709     sub getc ($) {
710     my $self = $_[0];
711     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
712    
713     my $r;
714     A: {
715     my $error;
716     if ($self->{continue}) {
717 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
718     length $self->{byte_buffer})) {
719 wakaba 1.1 #
720     } else {
721     $error = 1;
722     }
723     $self->{continue} = 0;
724     } elsif (512 > length $self->{byte_buffer}) {
725 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
726     length $self->{byte_buffer});
727 wakaba 1.1 }
728    
729     unless ($error) {
730     if ($self->{byte_buffer} =~ s/^\x1B(\x24[\x40\x42]|\x28[\x42\x4A])//) {
731     $self->{state} = {
732     "\x24\x40" => 'state_2440',
733     "\x24\x42" => 'state_2442',
734     "\x28\x42" => 'state_2842',
735     "\x28\x4A" => 'state_284A',
736     }->{$1};
737     redo A;
738     } elsif ($self->{state} eq 'state_2842') { # IRV
739     if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
740     push @{$self->{character_queue}}, split //, $1;
741     $r = shift @{$self->{character_queue}};
742     } else {
743     if (length $self->{byte_buffer}) {
744     $error = 1;
745     } else {
746     $r = undef;
747     }
748     }
749     } elsif ($self->{state} eq 'state_284A') { # 0201
750     if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
751     my $v = $1;
752     $v =~ tr/\x5C\x7E/\xA5\x{203E}/;
753     push @{$self->{character_queue}}, split //, $v;
754     $r = shift @{$self->{character_queue}};
755     } else {
756     if (length $self->{byte_buffer}) {
757     $error = 1;
758     } else {
759     $r = undef;
760     $self->{onerror}->($self, 'invalid-state-error',
761     state => $self->{state});
762     }
763     }
764     } elsif ($self->{state} eq 'state_2442') { # 1983
765     my $v = Encode::decode ($self->{state_2442},
766     $self->{byte_buffer},
767     Encode::FB_QUIET ());
768     if (length $v) {
769     push @{$self->{character_queue}}, split //, $v;
770     $r = shift @{$self->{character_queue}};
771     } else {
772     if (length $self->{byte_buffer}) {
773     $error = 1;
774     } else {
775     $r = undef;
776     $self->{onerror}->($self, 'invalid-state-error',
777     state => $self->{state});
778     }
779     }
780     } elsif ($self->{state} eq 'state_2440') { # 1978
781     my $v = Encode::decode ($self->{state_2440},
782     $self->{byte_buffer},
783     Encode::FB_QUIET ());
784     if (length $v) {
785     push @{$self->{character_queue}}, split //, $v;
786     $r = shift @{$self->{character_queue}};
787     } else {
788     if (length $self->{byte_buffer}) {
789     $error = 1;
790     } else {
791     $r = undef;
792     $self->{onerror}->($self, 'invalid-state-error',
793     state => $self->{state});
794     }
795     }
796     } else {
797     $error = 1;
798     }
799     }
800    
801     if ($error) {
802     $r = substr $self->{byte_buffer}, 0, 1, '';
803     my $etype = 'illegal-octets-error';
804     if (($self->{state} eq 'state_2442' or
805     $self->{state} eq 'state_2440') and
806     $r =~ /^[\x21-\x7E]/ and
807     $self->{byte_buffer} =~ s/^([\x21-\x7E])//) {
808     $r .= $1;
809     $etype = 'unassigned-code-point-error';
810     } elsif ($r eq "\x1B" and
811     $self->{byte_buffer} =~ s/^\(H//) { # Old 0201
812     $r .= "(H";
813     $self->{state} = 'state_284A';
814     }
815     $self->{onerror}->($self, $etype, octets => \$r);
816     }
817     } # A
818    
819     return $r;
820     } # getc
821    
822     package Whatpm::Charset::DecodeHandle::ShiftJIS;
823     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
824    
825     sub getc ($) {
826     my $self = $_[0];
827     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
828    
829     my $error;
830     if ($self->{continue}) {
831 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
832     length $self->{byte_buffer})) {
833 wakaba 1.1 #
834     } else {
835     $error = 1;
836     }
837     $self->{continue} = 0;
838     } elsif (512 > length $self->{byte_buffer}) {
839 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
840     length $self->{byte_buffer});
841 wakaba 1.1 }
842    
843     my $r;
844     unless ($error) {
845     my $string = Encode::decode ($self->{perl_encoding_name},
846     $self->{byte_buffer},
847     Encode::FB_QUIET ());
848     if (length $string) {
849     push @{$self->{character_queue}}, split //, $string;
850     $r = shift @{$self->{character_queue}};
851     if (length $self->{byte_buffer}) {
852     $self->{continue} = 1;
853     }
854     } else {
855     if (length $self->{byte_buffer}) {
856     $error = 1;
857     } else {
858     $r = undef;
859     }
860     }
861     }
862    
863     if ($error) {
864     $r = substr $self->{byte_buffer}, 0, 1, '';
865     my $etype = 'illegal-octets-error';
866 wakaba 1.5 if ($r =~ /^[\x81-\x9F\xE0-\xFC]/) {
867 wakaba 1.1 if ($self->{byte_buffer} =~ s/(.)//s) {
868     $r .= $1; # not limited to \x40-\xFC - \x7F
869     $etype = 'unassigned-code-point-error';
870     }
871 wakaba 1.5 ## NOTE: Range [\xF0-\xFC] is unassigned and may be used as a single-byte
872     ## character or as the first-byte of a double-byte character according
873     ## to JIS X 0208:1997 Appendix 1. However, the current practice is
874     ## use the range as the first-byte of double-byte characters.
875     } elsif ($r =~ /^[\x80\xA0\xFD-\xFF]/) {
876 wakaba 1.1 $etype = 'unassigned-code-point-error';
877     }
878     $self->{onerror}->($self, $etype, octets => \$r);
879     }
880    
881     return $r;
882     } # getc
883    
884     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us-ascii'} =
885     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us'} =
886     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso646-us'} =
887     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:cp367'} =
888     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ibm367'} =
889     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1986'} =
890     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1968'} =
891     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-ir-6'} =
892     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:csascii'} =
893     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_646.irv:1991'} =
894     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ascii'} = {ascii8 =>
895     '1',
896     is_block_safe =>
897     '1',
898     ietf_name =>
899     ['ansi_x3.4-1968',
900     'ansi_x3.4-1986',
901     'ascii',
902     'cp367',
903     'csascii',
904     'ibm367',
905     'iso-ir-6',
906     'iso646-us',
907     'iso_646.irv:1991',
908     'us',
909     'us-ascii',
910     'us-ascii'],
911     mime_name =>
912     'us-ascii',
913     perl_name =>
914     ['ascii',
915     'iso-646-us',
916     'us-ascii'],
917     utf8_encoding_scheme =>
918     '1',
919     'uri',
920     {'urn:x-suika-fam-cx:charset:ansi_x3.4-1968',
921     '1',
922     'urn:x-suika-fam-cx:charset:ansi_x3.4-1986',
923     '1',
924     'urn:x-suika-fam-cx:charset:ascii',
925     '1',
926     'urn:x-suika-fam-cx:charset:cp367',
927     '1',
928     'urn:x-suika-fam-cx:charset:csascii',
929     '1',
930     'urn:x-suika-fam-cx:charset:ibm367',
931     '1',
932     'urn:x-suika-fam-cx:charset:iso-ir-6',
933     '1',
934     'urn:x-suika-fam-cx:charset:iso646-us',
935     '1',
936     'urn:x-suika-fam-cx:charset:iso_646.irv:1991',
937     '1',
938     'urn:x-suika-fam-cx:charset:us',
939     '1',
940     'urn:x-suika-fam-cx:charset:us-ascii',
941     '1'},
942 wakaba 1.2 };
943    
944 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl'} = {perl_name =>
945     ['ascii-ctrl'],
946     'uri',
947     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl',
948     '1'}};
949     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null'} = {perl_name =>
950     ['null'],
951     'uri',
952     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null',
953     '1'}};
954     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8'} = {ascii8 =>
955     '1',
956     bom_allowed =>
957     '1',
958     no_bom_variant =>
959     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
960     utf8_encoding_scheme =>
961     '1',
962     'uri',
963     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8',
964     '1'},
965 wakaba 1.2 xml_name => 'UTF-8',
966     };
967    
968 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279'} = {ascii8 =>
969     '1',
970     bom_allowed =>
971     '1',
972     no_bom_variant =>
973     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
974     utf8_encoding_scheme =>
975     '1',
976     'uri',
977     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279',
978     '1'}};
979     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8'} = {
980     ascii8 => 1,
981     is_block_safe =>
982     '1',
983     perl_name =>
984     ['utf-8'],
985     utf8_encoding_scheme =>
986     '1',
987     'uri',
988     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
989     '1'}};
990     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-8'} = {
991     ascii8 => 1,
992     bom_allowed =>
993     '1',
994     no_bom_variant =>
995     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
996     ietf_name =>
997     ['utf-8'],
998     mime_name =>
999     'utf-8',
1000     utf8_encoding_scheme =>
1001     '1',
1002     'uri',
1003     {'urn:x-suika-fam-cx:charset:utf-8',
1004     '1'},
1005 wakaba 1.2 };
1006    
1007 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8'} = {ascii8 =>
1008     '1',
1009     is_block_safe =>
1010     '1',
1011     perl_name =>
1012     ['utf8'],
1013     utf8_encoding_scheme =>
1014     '1',
1015     'uri',
1016     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1017     '1'}};
1018     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16'} = {
1019     ascii16 => 1,
1020     bom_allowed =>
1021     '1',
1022     bom_required =>
1023     '1',
1024     no_bom_variant =>
1025     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1026     no_bom_variant16be =>
1027     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1028     no_bom_variant16le =>
1029     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1030     perl_name =>
1031     ['utf-16'],
1032     'uri',
1033     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16',
1034     '1'},
1035 wakaba 1.2 xml_name => 'UTF-16',
1036     };
1037    
1038 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16'} = {
1039     ascii16 => 1,
1040     bom_allowed =>
1041     '1',
1042     no_bom_variant =>
1043     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1044     no_bom_variant16be =>
1045     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1046     no_bom_variant16le =>
1047     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1048     ietf_name =>
1049     ['utf-16'],
1050     mime_name =>
1051     'utf-16',
1052     'uri',
1053     {'urn:x-suika-fam-cx:charset:utf-16',
1054     '1'},
1055 wakaba 1.2 };
1056    
1057 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16be'} = {
1058     ascii16 => 1,
1059     ascii16be => 1,
1060     bom_allowed =>
1061     '1',
1062     no_bom_variant =>
1063     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1064     no_bom_variant16be =>
1065     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1066     ietf_name =>
1067     ['utf-16be'],
1068     mime_name =>
1069     'utf-16be',
1070     'uri',
1071     {'urn:x-suika-fam-cx:charset:utf-16be',
1072     '1'},
1073 wakaba 1.2 };
1074    
1075 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16le'} = {
1076     ascii16 => 1,
1077     ascii16le => 1,
1078     bom_allowed =>
1079     '1',
1080     no_bom_variant =>
1081     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1082     no_bom_variant16le =>
1083     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1084     ietf_name =>
1085     ['utf-16le'],
1086     mime_name =>
1087     'utf-16le',
1088     'uri',
1089     {'urn:x-suika-fam-cx:charset:utf-16le',
1090     '1'},
1091 wakaba 1.2 };
1092    
1093 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be'} = {
1094     ascii16 => 1,
1095     ascii16be => 1,
1096     is_block_safe =>
1097     '1',
1098     perl_name =>
1099     ['utf-16be'],
1100     'uri',
1101     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1102     '1'}};
1103     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le'} = {
1104     ascii16 => 1,
1105     ascii16le => 1,
1106     is_block_safe =>
1107     '1',
1108     perl_name =>
1109     ['utf-16le'],
1110     'uri',
1111     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1112     '1'}};
1113     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-2'} = {
1114     ascii16 => 1,
1115     bom_allowed =>
1116     '1',
1117     no_bom_variant =>
1118     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1119     no_bom_variant16be =>
1120     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1121     no_bom_variant16le =>
1122     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1123     ietf_name =>
1124     ['csunicode',
1125     'iso-10646-ucs-2'],
1126     mime_name =>
1127     'iso-10646-ucs-2',
1128     'uri',
1129     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2',
1130     '1',
1131     'urn:x-suika-fam-cx:charset:iso-10646-ucs-2',
1132     '1'},
1133 wakaba 1.2 xml_name => 'ISO-10646-UCS-2',
1134     };
1135    
1136 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be'} = {
1137     ascii16 => 1,
1138     ascii16be => 1,
1139     is_block_safe =>
1140     '1',
1141     perl_name =>
1142     ['ucs-2be'],
1143     'uri',
1144     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1145     '1'}};
1146     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le'} = {
1147     ascii16 => 1,
1148     ascii16le => 1,
1149     is_block_safe =>
1150     '1',
1151     perl_name =>
1152     ['ucs-2le'],
1153     'uri',
1154     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1155     '1'}};
1156     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-4'} = {
1157     ascii32 => 1,
1158     bom_allowed =>
1159     '1',
1160     no_bom_variant =>
1161     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1162     no_bom_variant32endian1234 =>
1163     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1164     no_bom_variant32endian4321 =>
1165     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1166     ietf_name =>
1167     ['csucs4',
1168     'iso-10646-ucs-4'],
1169     mime_name =>
1170     'iso-10646-ucs-4',
1171     'uri',
1172     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4',
1173     '1',
1174     'urn:x-suika-fam-cx:charset:iso-10646-ucs-4',
1175     '1'},
1176 wakaba 1.2 xml_name => 'ISO-10646-UCS-4',
1177     };
1178    
1179 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be'} = {
1180     ascii32 => 1,
1181     ascii32endian1234 => 1,
1182     is_block_safe =>
1183     '1',
1184     perl_name =>
1185     ['ucs-4be',
1186     'utf-32be'],
1187     'uri',
1188     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1189     '1'}};
1190     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le'} = {
1191     ascii32 => 1,
1192     ascii32endian4321 => 1,
1193     is_block_safe =>
1194     '1',
1195     perl_name =>
1196     ['ucs-4le',
1197     'utf-32le'],
1198     'uri',
1199     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1200     '1'}};
1201     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_8859-1:1987'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1'} = {ascii8 =>
1202     '1',
1203     is_block_safe =>
1204     '1',
1205     ietf_name =>
1206     ['cp819',
1207     'csisolatin1',
1208     'ibm819',
1209     'iso-8859-1',
1210     'iso-8859-1',
1211     'iso-ir-100',
1212     'iso_8859-1',
1213     'iso_8859-1:1987',
1214     'l1',
1215     'latin1'],
1216     mime_name =>
1217     'iso-8859-1',
1218     perl_name =>
1219     ['iso-8859-1',
1220     'latin1'],
1221     'uri',
1222     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1',
1223     '1',
1224     'urn:x-suika-fam-cx:charset:iso_8859-1:1987',
1225     '1'},
1226 wakaba 1.2 xml_name => 'ISO-8859-1',
1227     };
1228    
1229 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2'} = {ascii8 =>
1230     '1',
1231     is_block_safe =>
1232     '1',
1233     'uri',
1234     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2',
1235     '1'},
1236 wakaba 1.2 xml_name => 'ISO-8859-2',
1237     };
1238    
1239 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3'} = {ascii8 =>
1240     '1',
1241     is_block_safe =>
1242     '1',
1243     'uri',
1244     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3',
1245     '1'},
1246 wakaba 1.2 xml_name => 'ISO-8859-3',
1247     };
1248    
1249 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4'} = {ascii8 =>
1250     '1',
1251     is_block_safe =>
1252     '1',
1253     'uri',
1254     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4',
1255     '1'},
1256 wakaba 1.2 xml_name => 'ISO-8859-4',
1257     };
1258    
1259 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5'} = {ascii8 =>
1260     '1',
1261     is_block_safe =>
1262     '1',
1263     'uri',
1264     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5',
1265     '1'},
1266 wakaba 1.2 xml_name => 'ISO-8859-5',
1267     };
1268    
1269 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6'} = {ascii8 =>
1270     '1',
1271     is_block_safe =>
1272     '1',
1273     'uri',
1274     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6',
1275     '1'},
1276 wakaba 1.2 xml_name => 'ISO-8859-6',
1277     };
1278    
1279 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7'} = {ascii8 =>
1280     '1',
1281     is_block_safe =>
1282     '1',
1283     'uri',
1284     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7',
1285     '1'},
1286 wakaba 1.2 xml_name => 'ISO-8859-7',
1287     };
1288    
1289 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8'} = {ascii8 =>
1290     '1',
1291     is_block_safe =>
1292     '1',
1293     'uri',
1294     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8',
1295     '1'},
1296 wakaba 1.2 xml_name => 'ISO-8859-8',
1297     };
1298    
1299 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9'} = {ascii8 =>
1300     '1',
1301     is_block_safe =>
1302     '1',
1303     'uri',
1304     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9',
1305     '1'},
1306 wakaba 1.2 xml_name => 'ISO-8859-9',
1307     };
1308    
1309 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10'} = {ascii8 =>
1310     '1',
1311     is_block_safe =>
1312     '1',
1313     'uri',
1314     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10',
1315     '1'},
1316 wakaba 1.2 xml_name => 'ISO-8859-10',
1317     };
1318    
1319 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11'} = {ascii8 =>
1320     '1',
1321     is_block_safe =>
1322     '1',
1323     'uri',
1324     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11',
1325     '1'},
1326 wakaba 1.2 xml_name => 'ISO-8859-11',
1327     };
1328    
1329 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13'} = {ascii8 =>
1330     '1',
1331     is_block_safe =>
1332     '1',
1333     'uri',
1334     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13',
1335     '1'},
1336 wakaba 1.2 xml_name => 'ISO-8859-13',
1337     };
1338    
1339 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14'} = {ascii8 =>
1340     '1',
1341     is_block_safe =>
1342     '1',
1343     'uri',
1344     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14',
1345     '1'},
1346 wakaba 1.2 xml_name => 'ISO-8859-14',
1347     };
1348    
1349 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15'} = {ascii8 =>
1350     '1',
1351     is_block_safe =>
1352     '1',
1353     'uri',
1354     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15',
1355     '1'},
1356 wakaba 1.2 xml_name => 'ISO-8859-15',
1357     };
1358    
1359 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16'} = {ascii8 =>
1360     '1',
1361     is_block_safe =>
1362     '1',
1363     'uri',
1364     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16',
1365     '1'},
1366 wakaba 1.2 xml_name => 'ISO-8859-16',
1367     };
1368    
1369 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp'} = {ascii8 =>
1370     '1',
1371     'uri',
1372     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp',
1373     '1'},
1374 wakaba 1.2 xml_name => 'ISO-2022-JP',
1375     };
1376    
1377 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-2022-jp'} = {ascii8 =>
1378     '1',
1379     ietf_name =>
1380     ['csiso2022jp',
1381     'iso-2022-jp',
1382     'iso-2022-jp'],
1383     mime_name =>
1384     'iso-2022-jp',
1385     'uri',
1386     {'urn:x-suika-fam-cx:charset:iso-2022-jp',
1387     '1'},
1388 wakaba 1.2 };
1389    
1390 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp'} = {ascii8 =>
1391     '1',
1392     perl_name =>
1393     ['iso-2022-jp'],
1394     'uri',
1395     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp',
1396     '1'}};
1397     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:shift_jis'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis'} = {ascii8 =>
1398     '1',
1399     is_block_safe =>
1400     '1',
1401     ietf_name =>
1402     ['csshiftjis',
1403     'ms_kanji',
1404     'shift_jis',
1405     'shift_jis'],
1406     mime_name =>
1407     'shift_jis',
1408     perl_name =>
1409     ['shift-jis-1997'],
1410     'uri',
1411     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis',
1412     '1',
1413     'urn:x-suika-fam-cx:charset:shift_jis',
1414     '1'},
1415 wakaba 1.2 xml_name => 'Shift_JIS',
1416     };
1417    
1418 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis'} = {ascii8 =>
1419     '1',
1420     is_block_safe =>
1421     '1',
1422     perl_name =>
1423     ['shiftjis',
1424     'sjis'],
1425     'uri',
1426     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis',
1427     '1'}};
1428     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:euc-jp'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese'} = {ascii8 =>
1429     '1',
1430     is_block_safe =>
1431     '1',
1432     ietf_name =>
1433     ['cseucpkdfmtjapanese',
1434     'euc-jp',
1435     'euc-jp',
1436     'extended_unix_code_packed_format_for_japanese'],
1437     mime_name =>
1438     'euc-jp',
1439     perl_name =>
1440     ['euc-jp-1997'],
1441     'uri',
1442     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp',
1443     '1',
1444     'urn:x-suika-fam-cx:charset:euc-jp',
1445     '1',
1446     'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese',
1447     '1'},
1448 wakaba 1.2 xml_name => 'EUC-JP',
1449     };
1450    
1451 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp'} = {ascii8 =>
1452     '1',
1453     is_block_safe =>
1454     '1',
1455     perl_name =>
1456     ['euc-jp',
1457     'ujis'],
1458     'uri',
1459     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp',
1460     '1'}};
1461    
1462     1;
1463 wakaba 1.5 ## $Date: 2008/05/18 03:46:30 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24