/[suikacvs]/markup/html/whatpm/Whatpm/Charset/DecodeHandle.pm
Suika

Contents of /markup/html/whatpm/Whatpm/Charset/DecodeHandle.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.14 - (hide annotations) (download)
Sun Sep 14 06:32:49 2008 UTC (16 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.13: +139 -84 lines
++ whatpm/Whatpm/ChangeLog	14 Sep 2008 06:32:02 -0000
	* HTML.pm.src ($char_onerror): Have character decoder's |line|
	and |column| a higher priority than the one set by the
	tokenizer's input handler.
	($self->{read_until}): Exclude U+FFFD (but this might
	not be necessary, since now we do line/column fixup in
	the character decode handle).

2008-09-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/Charset/ChangeLog	14 Sep 2008 06:32:40 -0000
	* DecodeHandle.pm: EUCJP class reimplemented using |read|-centric
	model.

2008-09-14  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::Charset::DecodeHandle;
2     use strict;
3    
4 wakaba 1.9 ## NOTE: |Message::Charset::Info| uses this module without calling
5     ## the constructor.
6    
7 wakaba 1.1 my $XML_AUTO_CHARSET = q<http://suika.fam.cx/www/2006/03/xml-entity/>;
8     my $IANA_CHARSET = q<urn:x-suika-fam-cx:charset:>;
9     my $PERL_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.>;
10     my $XML_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.>;
11    
12     ## ->create_decode_handle ($charset_uri, $byte_stream, $onerror)
13     sub create_decode_handle ($$$;$) {
14     my $csdef = $Whatpm::Charset::CharsetDef->{$_[1]};
15     my $obj = {
16 wakaba 1.9 char_buffer => \(my $s = ''),
17 wakaba 1.11 char_buffer_pos => 0,
18 wakaba 1.1 character_queue => [],
19     filehandle => $_[2],
20     charset => $_[1],
21     byte_buffer => '',
22     onerror => $_[3] || sub {},
23     };
24     if ($csdef->{uri}->{$XML_AUTO_CHARSET} or
25     $obj->{charset} eq $XML_AUTO_CHARSET) {
26     my $b = ''; # UTF-8 w/o BOM
27     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
28 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
29 wakaba 1.1 if (read $obj->{filehandle}, $b, 256) {
30     no warnings "substr";
31     no warnings "uninitialized";
32     if (substr ($b, 0, 1) eq "<") {
33     if (substr ($b, 1, 1) eq "?") { # ASCII8
34     if ($b =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
35     encoding\s*=\s*["']([^"']*)/x) {
36 wakaba 1.2 $obj->{input_encoding} = $1;
37 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
38     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
39     if (not $csdef->{ascii8} or $csdef->{bom_required}) {
40     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
41     charset_uri => $uri,
42     charset_name => $obj->{input_encoding});
43     }
44     } else {
45     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
46 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
47 wakaba 1.1 }
48     if (defined $csdef->{no_bom_variant}) {
49     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
50     }
51     } elsif (substr ($b, 1, 1) eq "\x00") {
52     if (substr ($b, 2, 2) eq "?\x00") { # ASCII16LE
53     my $c = $b; $c =~ tr/\x00//d;
54     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
55     encoding\s*=\s*["']([^"']*)/x) {
56 wakaba 1.2 $obj->{input_encoding} = $1;
57 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
58     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
59     if (not $csdef->{ascii16} or $csdef->{ascii16be} or
60     $csdef->{bom_required}) {
61     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
62     charset_uri => $uri,
63     charset_name => $obj->{input_encoding});
64     }
65     } else {
66     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
67 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
68 wakaba 1.1 }
69     if (defined $csdef->{no_bom_variant16le}) {
70     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
71     }
72     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
73     my $c = $b; $c =~ tr/\x00//d;
74     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
75     encoding\s*=\s*["']([^"']*)/x) {
76 wakaba 1.2 $obj->{input_encoding} = $1;
77 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
78     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
79     if (not $csdef->{ascii32} or
80     $csdef->{ascii32endian1234} or
81     $csdef->{ascii32endian2143} or
82     $csdef->{ascii32endian3412} or
83     $csdef->{bom_required}) {
84     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
85     charset_uri => $uri,
86     charset_name => $obj->{input_encoding});
87     }
88     } else {
89     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
90 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
91 wakaba 1.1 }
92     if (defined $csdef->{no_bom_variant32endian4321}) {
93     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
94     }
95     }
96     }
97     } elsif (substr ($b, 0, 3) eq "\xEF\xBB\xBF") { # UTF8
98     $obj->{has_bom} = 1;
99     substr ($b, 0, 3) = '';
100     my $c = $b;
101     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
102     encoding\s*=\s*["']([^"']*)/x) {
103 wakaba 1.2 $obj->{input_encoding} = $1;
104 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
105     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
106     if (not $csdef->{utf8_encoding_scheme} or
107     not $csdef->{bom_allowed}) {
108     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
109     charset_uri => $uri,
110     charset_name => $obj->{input_encoding});
111     }
112     } else {
113     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
114 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
115 wakaba 1.1 }
116     if (defined $csdef->{no_bom_variant}) {
117     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
118     }
119     } elsif (substr ($b, 0, 2) eq "\x00<") {
120     if (substr ($b, 2, 2) eq "\x00?") { # ASCII16BE
121     my $c = $b; $c =~ tr/\x00//d;
122     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
123     encoding\s*=\s*["']([^"']*)/x) {
124 wakaba 1.2 $obj->{input_encoding} = $1;
125 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
126     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
127     if (not $csdef->{ascii16} or $csdef->{ascii16le} or
128     $csdef->{bom_required}) {
129     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
130     charset_uri => $uri,
131     charset_name => $obj->{input_encoding});
132     }
133     } else {
134     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
135 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
136 wakaba 1.1 }
137     if (defined $csdef->{no_bom_variant16be}) {
138     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
139     }
140     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
141     my $c = $b; $c =~ tr/\x00//d;
142     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
143     encoding\s*=\s*["']([^"']*)/x) {
144 wakaba 1.2 $obj->{input_encoding} = $1;
145 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
146     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
147     if (not $csdef->{ascii32} or
148     $csdef->{ascii32endian1234} or
149     $csdef->{ascii32endian2143} or
150     $csdef->{ascii32endian4321} or
151     $csdef->{bom_required}) {
152     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
153     charset_uri => $uri,
154     charset_name => $obj->{input_encoding});
155     }
156     } else {
157     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
158 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
159 wakaba 1.1 }
160     if (defined $csdef->{no_bom_variant32endian3412}) {
161     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
162     }
163     }
164     } elsif (substr ($b, 0, 2) eq "\xFE\xFF") {
165     if (substr ($b, 2, 2) eq "\x00<") { # ASCII16BE
166     $obj->{has_bom} = 1;
167     substr ($b, 0, 2) = '';
168     my $c = $b; $c =~ tr/\x00//d;
169     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
170     encoding\s*=\s*["']([^"']*)/x) {
171 wakaba 1.2 $obj->{input_encoding} = $1;
172 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
173     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
174     if (not $csdef->{ascii16} or
175     $csdef->{ascii16le} or
176     not $csdef->{bom_allowed}) {
177     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
178     charset_uri => $uri,
179     charset_name => $obj->{input_encoding});
180     }
181     } else {
182     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
183 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
184 wakaba 1.1 }
185     if (defined $csdef->{no_bom_variant16be}) {
186     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
187     }
188     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
189     $obj->{has_bom} = 1;
190     substr ($b, 0, 4) = '';
191     my $c = $b; $c =~ tr/\x00//d;
192     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
193     encoding\s*=\s*["']([^"']*)/x) {
194 wakaba 1.2 $obj->{input_encoding} = $1;
195 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
196     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
197     if (not $csdef->{ascii32} or
198     $csdef->{ascii32endian1234} or
199     $csdef->{ascii32endian2143} or
200     $csdef->{ascii32endian4321} or
201     not $csdef->{bom_allowed}) {
202     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
203     charset_uri => $uri,
204     charset_name => $obj->{input_encoding});
205     }
206     } else {
207     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
208 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
209 wakaba 1.1 $obj->{byte_buffer} .= "\x00\x00";
210     }
211     if (defined $csdef->{no_bom_variant32endian3412}) {
212     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
213     }
214     } else {
215     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
216 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
217 wakaba 1.1 substr ($b, 0, 2) = '';
218     $obj->{has_bom} = 1;
219     }
220     } elsif (substr ($b, 0, 2) eq "\xFF\xFE") {
221     if (substr ($b, 2, 2) eq "<\x00") { # ASCII16LE
222     $obj->{has_bom} = 1;
223     substr ($b, 0, 2) = '';
224     my $c = $b; $c =~ tr/\x00//d;
225     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
226     encoding\s*=\s*["']([^"']*)/x) {
227 wakaba 1.2 $obj->{input_encoding} = $1;
228 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
229     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
230     if (not $csdef->{ascii16} or
231     $csdef->{ascii16be} or
232     not $csdef->{bom_allowed}) {
233     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
234     charset_uri => $uri,
235     charset_name => $obj->{input_encoding});
236     }
237     } else {
238     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
239 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
240 wakaba 1.1 }
241     if (defined $csdef->{no_bom_variant16le}) {
242     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
243     }
244     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
245     $obj->{has_bom} = 1;
246     substr ($b, 0, 4) = '';
247     my $c = $b; $c =~ tr/\x00//d;
248     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
249     encoding\s*=\s*["']([^"']*)/x) {
250 wakaba 1.2 $obj->{input_encoding} = $1;
251 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
252     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
253     if (not $csdef->{ascii32} or
254     $csdef->{ascii32endian1234} or
255     $csdef->{ascii32endian2143} or
256     $csdef->{ascii32endian3412} or
257     not $csdef->{bom_allowed}) {
258     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
259     charset_uri => $uri,
260     charset_name => $obj->{input_encoding});
261     }
262     } else {
263     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
264 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
265 wakaba 1.1 $obj->{byte_buffer} .= "\x00\x00";
266     }
267     if (defined $csdef->{no_bom_variant32endian4321}) {
268     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
269     }
270     } else {
271     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
272 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
273 wakaba 1.1 substr ($b, 0, 2) = '';
274     $obj->{has_bom} = 1;
275     }
276     } elsif (substr ($b, 0, 2) eq "\x00\x00") {
277     if (substr ($b, 2, 2) eq "\x00<") { # ASCII32Endian1234
278     my $c = $b; $c =~ tr/\x00//d;
279     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
280     encoding\s*=\s*["']([^"']*)/x) {
281 wakaba 1.2 $obj->{input_encoding} = $1;
282 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
283     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
284     if (not $csdef->{ascii32} or
285     $csdef->{ascii32endian2143} or
286     $csdef->{ascii32endian3412} or
287     $csdef->{ascii32endian4321} or
288     $csdef->{bom_required}) {
289     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
290     charset_uri => $uri,
291     charset_name => $obj->{input_encoding});
292     }
293     } else {
294     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
295 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
296 wakaba 1.1 }
297     if (defined $csdef->{no_bom_variant32endian1234}) {
298     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
299     }
300     } elsif (substr ($b, 2, 2) eq "<\x00") { # ASCII32Endian2143
301     my $c = $b; $c =~ tr/\x00//d;
302     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
303     encoding\s*=\s*["']([^"']*)/x) {
304 wakaba 1.2 $obj->{input_encoding} = $1;
305 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
306     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
307     if (not $csdef->{ascii32} or
308     $csdef->{ascii32endian1234} or
309     $csdef->{ascii32endian3412} or
310     $csdef->{ascii32endian4321} or
311     $csdef->{bom_required}) {
312     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
313     charset_uri => $uri,
314     charset_name => $obj->{input_encoding});
315     }
316     } else {
317     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
318 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
319 wakaba 1.1 }
320     if (defined $csdef->{no_bom_variant32endian2143}) {
321     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
322     }
323     } elsif (substr ($b, 2, 2) eq "\xFE\xFF") { # ASCII32Endian1234
324     $obj->{has_bom} = 1;
325     substr ($b, 0, 4) = '';
326     my $c = $b; $c =~ tr/\x00//d;
327     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
328     encoding\s*=\s*["']([^"']*)/x) {
329 wakaba 1.2 $obj->{input_encoding} = $1;
330 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
331     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
332     if (not $csdef->{ascii32} or
333     $csdef->{ascii32endian2143} or
334     $csdef->{ascii32endian3412} or
335     $csdef->{ascii32endian4321} or
336     $csdef->{bom_required}) {
337     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
338     charset_uri => $uri,
339     charset_name => $obj->{input_encoding});
340     }
341     } else {
342     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
343 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
344 wakaba 1.1 $obj->{has_bom} = 0;
345     $obj->{byte_buffer} .= "\x00\x00\xFE\xFF";
346     }
347     if (defined $csdef->{no_bom_variant32endian1234}) {
348     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
349     }
350     } elsif (substr ($b, 2, 2) eq "\xFF\xFE") { # ASCII32Endian2143
351     $obj->{has_bom} = 1;
352     substr ($b, 0, 4) = '';
353     my $c = $b; $c =~ tr/\x00//d;
354     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
355     encoding\s*=\s*["']([^"']*)/x) {
356 wakaba 1.2 $obj->{input_encoding} = $1;
357 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
358     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
359     if (not $csdef->{ascii32} or
360     $csdef->{ascii32endian1234} or
361     $csdef->{ascii32endian3412} or
362     $csdef->{ascii32endian4321} or
363     $csdef->{bom_required}) {
364     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
365     charset_uri => $uri,
366     charset_name => $obj->{input_encoding});
367     }
368     } else {
369     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
370 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
371 wakaba 1.1 $obj->{has_bom} = 0;
372     $obj->{byte_buffer} .= "\x00\x00\xFF\xFE";
373     }
374     if (defined $csdef->{no_bom_variant32endian2143}) {
375     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
376     }
377     }
378     # \x4C\x6F\xA7\x94 EBCDIC
379     } # buffer
380     $obj->{byte_buffer} .= $b;
381     } # read
382     } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-8'}) {
383     ## BOM is optional.
384     my $b = '';
385     if (read $obj->{filehandle}, $b, 3) {
386     if ($b eq "\xEF\xBB\xBF") {
387     $obj->{has_bom} = 1;
388     } else {
389     $obj->{byte_buffer} .= $b;
390     }
391     }
392     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; # UTF-8 w/o BOM
393     } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-16'}) {
394     ## BOM is mandated.
395     my $b = '';
396     if (read $obj->{filehandle}, $b, 2) {
397     if ($b eq "\xFE\xFF") {
398     $obj->{has_bom} = 1; # UTF-16BE w/o BOM
399     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
400     } elsif ($b eq "\xFF\xFE") {
401     $obj->{has_bom} = 1; # UTF-16LE w/o BOM
402     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
403     } else {
404     $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
405     $obj->{has_bom} = 0;
406     $obj->{byte_buffer} .= $b; # UTF-16BE w/o BOM
407     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
408     }
409     } else {
410     $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
411     $obj->{has_bom} = 0; # UTF-16BE w/o BOM
412     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
413     }
414     }
415    
416     if ($csdef->{uri}->{$XML_CHARSET.'iso-2022-jp'}) {
417     $obj->{state_2440} = 'gl-jis-1997-swapped';
418     $obj->{state_2442} = 'gl-jis-1997';
419     $obj->{state} = 'state_2842';
420     require Encode::GLJIS1997Swapped;
421     require Encode::GLJIS1997;
422     if (Encode::find_encoding ($obj->{state_2440}) and
423     Encode::find_encoding ($obj->{state_2442})) {
424     return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
425     }
426     } elsif ($csdef->{uri}->{$IANA_CHARSET.'iso-2022-jp'}) {
427     $obj->{state_2440} = 'gl-jis-1978';
428     $obj->{state_2442} = 'gl-jis-1983';
429     $obj->{state} = 'state_2842';
430     require Encode::GLJIS1978;
431     require Encode::GLJIS1983;
432     if (Encode::find_encoding ($obj->{state_2440}) and
433     Encode::find_encoding ($obj->{state_2442})) {
434     return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
435     }
436     } elsif (defined $csdef->{perl_name}->[0]) {
437     if ($csdef->{uri}->{$XML_CHARSET.'euc-jp'} or
438     $csdef->{uri}->{$IANA_CHARSET.'euc-jp'}) {
439     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
440     require Encode::EUCJP1997;
441     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
442     return bless $obj, 'Whatpm::Charset::DecodeHandle::EUCJP';
443     }
444     } elsif ($csdef->{uri}->{$XML_CHARSET.'shift_jis'} or
445     $csdef->{uri}->{$IANA_CHARSET.'shift_jis'}) {
446     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
447     require Encode::ShiftJIS1997;
448     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
449     return bless $obj, 'Whatpm::Charset::DecodeHandle::ShiftJIS';
450     }
451     } elsif ($csdef->{is_block_safe}) {
452     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
453     require Encode;
454     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
455     return bless $obj, 'Whatpm::Charset::DecodeHandle::Encode';
456     }
457     }
458     }
459    
460     $obj->{onerror}->(undef, 'charset-not-supported-error',
461     charset_uri => $obj->{charset});
462     return undef;
463     } # create_decode_handle
464    
465     sub name_to_uri ($$$) {
466     my $domain = $_[1];
467     my $name = lc $_[2];
468    
469     if ($domain eq 'ietf') {
470     return $IANA_CHARSET . $name;
471     } elsif ($domain eq 'xml') {
472     if ({
473     'utf-8' => 1,
474     'utf-16' => 1,
475     'iso-10646-ucs-2' => 1,
476     'iso-10646-ucs-4' => 1,
477     'iso-8859-1' => 1,
478     'iso-8859-2' => 1,
479     'iso-8859-3' => 1,
480     'iso-8859-4' => 1,
481     'iso-8859-5' => 1,
482     'iso-8859-6' => 1,
483     'iso-8859-7' => 1,
484     'iso-8859-8' => 1,
485     'iso-8859-9' => 1,
486     'iso-8859-10' => 1,
487     'iso-8859-11' => 1,
488     'iso-8859-13' => 1,
489     'iso-8859-14' => 1,
490     'iso-8859-15' => 1,
491     'iso-8859-16' => 1,
492     'iso-2022-jp' => 1,
493     'shift_jis' => 1,
494     'euc-jp' => 1,
495     }->{$name}) {
496     return $XML_CHARSET . $name;
497     }
498    
499     my $uri = $IANA_CHARSET . $name;
500     return $uri if $Whatpm::Charset::CharsetDef->{$uri};
501    
502     return $XML_CHARSET . $name;
503     } else {
504     return undef;
505     }
506     } # name_to_uri
507    
508     sub uri_to_name ($$$) {
509     my (undef, $domain, $uri) = @_;
510 wakaba 1.2
511     if ($domain eq 'xml') {
512     my $v = $Whatpm::Charset::CharsetDef->{$uri}->{xml_name};
513     return $v if defined $v;
514    
515     if (substr ($uri, 0, length $XML_CHARSET) eq $XML_CHARSET) {
516     return substr ($uri, length $XML_CHARSET);
517     }
518    
519     $domain = 'ietf'; ## TODO: XML encoding name has smaller range
520     }
521    
522     if ($domain eq 'ietf') {
523     my $v = $Whatpm::Charset::CharsetDef->{$uri}->{iana_name};
524     return $v->[0] if defined $v;
525    
526     if (substr ($uri, 0, length $IANA_CHARSET) eq $IANA_CHARSET) {
527 wakaba 1.1 return substr ($uri, length $IANA_CHARSET);
528     }
529     }
530    
531     return undef;
532     } # uri_to_name
533    
534 wakaba 1.3 require IO::Handle;
535    
536     package Whatpm::Charset::DecodeHandle::ByteBuffer;
537    
538 wakaba 1.7 ## NOTE: Provides a byte buffer wrapper object.
539    
540 wakaba 1.3 sub new ($$) {
541     my $self = bless {
542     buffer => '',
543     }, shift;
544     $self->{filehandle} = shift;
545     return $self;
546     } # new
547    
548     sub read {
549     my $self = shift;
550     my $pos = length $self->{buffer};
551     my $r = $self->{filehandle}->read ($self->{buffer}, $_[1], $pos);
552     substr ($_[0], $_[2]) = substr ($self->{buffer}, $pos);
553 wakaba 1.8 ## NOTE: This would do different behavior from Perl's standard
554     ## |read| when $pos points beyond the end of the string.
555 wakaba 1.3 return $r;
556     } # read
557    
558     sub close { $_[0]->{filehandle}->close }
559    
560 wakaba 1.11 package Whatpm::Charset::DecodeHandle::CharString;
561    
562     ## NOTE: Same as Perl's standard |open $handle, '<', \$char_string|,
563     ## but supports |ungetc| and other extensions.
564    
565     sub new ($$) {
566     my $self = bless {pos => 0}, shift;
567     $self->{string} = shift; # must be a scalar ref
568     return $self;
569     } # new
570    
571     sub getc ($) {
572     my $self = shift;
573     if ($self->{pos} < length ${$self->{string}}) {
574     return substr ${$self->{string}}, $self->{pos}++, 1;
575     } else {
576     return undef;
577     }
578     } # getc
579    
580     sub read ($$$$) {
581     #my ($self, $scalar, $length, $offset) = @_;
582     my $self = $_[0];
583     my $length = $_[2] || 0;
584     my $offset = $_[3];
585     ## NOTE: We don't support standard Perl semantics if $offset is
586     ## greater than the length of $scalar.
587     substr ($_[1], $offset) = substr (${$self->{string}}, $self->{pos}, $length);
588     my $count = (length $_[1]) - $offset;
589     $self->{pos} += $count;
590     return $count;
591     } # read
592    
593 wakaba 1.12 sub manakai_read_until ($$$;$) {
594     #my ($self, $scalar, $pattern, $offset) = @_;
595     my $self = $_[0];
596 wakaba 1.11 pos (${$self->{string}}) = $self->{pos};
597 wakaba 1.12 if (${$self->{string}} =~ /\G(?>$_[2])+/) {
598     substr ($_[1], $_[3]) = substr (${$self->{string}}, $-[0], $+[0] - $-[0]);
599 wakaba 1.11 $self->{pos} += $+[0] - $-[0];
600 wakaba 1.12 return $+[0] - $-[0];
601 wakaba 1.11 } else {
602 wakaba 1.12 return 0;
603 wakaba 1.11 }
604 wakaba 1.12 } # manakai_read_until
605 wakaba 1.11
606     sub ungetc ($$) {
607     my $self = shift;
608     ## Ignore second parameter.
609     $self->{pos}-- if $self->{pos} > 0;
610     } # ungetc
611    
612     sub close ($) { }
613    
614 wakaba 1.1 package Whatpm::Charset::DecodeHandle::Encode;
615    
616 wakaba 1.7 ## NOTE: Provides a Perl |Encode| module wrapper object.
617    
618 wakaba 1.1 sub charset ($) { $_[0]->{charset} }
619    
620 wakaba 1.3 sub close ($) { $_[0]->{filehandle}->close }
621 wakaba 1.1
622     sub getc ($) {
623 wakaba 1.9 my $c = '';
624     my $l = $_[0]->read ($c, 1);
625     if ($l) {
626     return $c;
627     } else {
628     return undef;
629     }
630     } # getc
631    
632     sub read ($$$;$) {
633 wakaba 1.1 my $self = $_[0];
634 wakaba 1.9 # $scalar = $_[1];
635     my $length = $_[2];
636     my $offset = $_[3] || 0;
637     my $count = 0;
638     my $eof;
639 wakaba 1.10 ## NOTE: It is incompatible with the standard Perl semantics
640     ## if $offset is greater than the length of $scalar.
641    
642 wakaba 1.9 A: {
643     return $count if $length < 1;
644    
645 wakaba 1.11 if (my $l = (length ${$self->{char_buffer}}) - $self->{char_buffer_pos}) {
646 wakaba 1.9 if ($l >= $length) {
647 wakaba 1.11 substr ($_[1], $offset)
648     = substr (${$self->{char_buffer}}, $self->{char_buffer_pos},
649     $length);
650 wakaba 1.9 $count += $length;
651 wakaba 1.11 $self->{char_buffer_pos} += $length;
652 wakaba 1.9 $length = 0;
653     return $count;
654     } else {
655 wakaba 1.11 substr ($_[1], $offset)
656     = substr (${$self->{char_buffer}}, $self->{char_buffer_pos});
657 wakaba 1.9 $count += $l;
658     $length -= $l;
659     ${$self->{char_buffer}} = '';
660 wakaba 1.11 $self->{char_buffer_pos} = 0;
661 wakaba 1.9 }
662     $offset = length $_[1];
663     }
664    
665     if ($eof) {
666     return $count;
667     }
668 wakaba 1.1
669     my $error;
670     if ($self->{continue}) {
671 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
672     length $self->{byte_buffer})) {
673 wakaba 1.1 #
674     } else {
675     $error = 1;
676     }
677     $self->{continue} = 0;
678     } elsif (512 > length $self->{byte_buffer}) {
679 wakaba 1.9 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
680     length $self->{byte_buffer})) {
681     #
682     } else {
683     $eof = 1;
684     }
685 wakaba 1.1 }
686    
687     unless ($error) {
688 wakaba 1.4 if (not $self->{bom_checked}) {
689     if (defined $self->{bom_pattern}) {
690     if ($self->{byte_buffer} =~ s/^$self->{bom_pattern}//) {
691     $self->{has_bom} = 1;
692     }
693     }
694     $self->{bom_checked} = 1;
695     }
696    
697 wakaba 1.1 my $string = Encode::decode ($self->{perl_encoding_name},
698     $self->{byte_buffer},
699     Encode::FB_QUIET ());
700     if (length $string) {
701 wakaba 1.9 $self->{char_buffer} = \$string;
702 wakaba 1.11 $self->{char_buffer_pos} = 0;
703 wakaba 1.1 if (length $self->{byte_buffer}) {
704     $self->{continue} = 1;
705     }
706     } else {
707     if (length $self->{byte_buffer}) {
708     $error = 1;
709     } else {
710 wakaba 1.9 ## NOTE: No further input
711     redo A;
712 wakaba 1.1 }
713     }
714     }
715    
716     if ($error) {
717 wakaba 1.9 my $r = substr $self->{byte_buffer}, 0, 1, '';
718 wakaba 1.6 my $fallback = $self->{fallback}->{$r};
719     if (defined $fallback) {
720 wakaba 1.7 ## NOTE: This is an HTML5 parse error.
721 wakaba 1.6 $self->{onerror}->($self, 'fallback-char-error', octets => \$r,
722     char => \$fallback,
723 wakaba 1.7 level => $self->{level}->{$self->{error_level}->{'fallback-char-error'}});
724 wakaba 1.9 ${$self->{char_buffer}} .= $fallback;
725 wakaba 1.7 } elsif (exists $self->{fallback}->{$r}) {
726     ## NOTE: This is an HTML5 parse error. In addition, the octet
727     ## is not assigned with a character.
728     $self->{onerror}->($self, 'fallback-unassigned-error', octets => \$r,
729     level => $self->{level}->{$self->{error_level}->{'fallback-unassigned-error'}});
730 wakaba 1.9 ${$self->{char_buffer}} .= $r;
731 wakaba 1.6 } else {
732 wakaba 1.7 $self->{onerror}->($self, 'illegal-octets-error', octets => \$r,
733     level => $self->{level}->{$self->{error_level}->{'illegal-octets-error'}});
734 wakaba 1.9 ${$self->{char_buffer}} .= $r;
735 wakaba 1.6 }
736 wakaba 1.1 }
737    
738 wakaba 1.9 redo A;
739     } # A
740 wakaba 1.11 } # read
741    
742 wakaba 1.12 sub manakai_read_until ($$$;$) {
743     #my ($self, $scalar, $pattern, $offset) = @_;
744     my $self = $_[0];
745 wakaba 1.11 my $s = '';
746     $self->read ($s, 255);
747 wakaba 1.12 if ($s =~ /^(?>$_[2])+/) {
748 wakaba 1.11 my $rem_length = (length $s) - $+[0];
749     if ($rem_length) {
750     if ($self->{char_buffer_pos} > $rem_length) {
751     $self->{char_buffer_pos} -= $rem_length;
752     } else {
753     substr (${$self->{char_buffer}}, 0, $self->{char_buffer_pos})
754     = substr ($s, $+[0]);
755     $self->{char_buffer_pos} = 0;
756     }
757     }
758 wakaba 1.13 substr ($_[1], $_[3]) = substr ($s, $-[0], $+[0] - $-[0]);
759 wakaba 1.12 return $+[0];
760 wakaba 1.11 } elsif (length $s) {
761     if ($self->{char_buffer_pos} > length $s) {
762     $self->{char_buffer_pos} -= length $s;
763     } else {
764     substr (${$self->{char_buffer}}, 0, $self->{char_buffer_pos}) = $s;
765     $self->{char_buffer_pos} = 0;
766     }
767     }
768 wakaba 1.12 return 0;
769     } # manakai_read_until
770 wakaba 1.1
771     sub has_bom ($) { $_[0]->{has_bom} }
772    
773 wakaba 1.2 sub input_encoding ($) {
774     my $v = $_[0]->{input_encoding};
775     return $v if defined $v;
776    
777     my $uri = $_[0]->{charset};
778     if (defined $uri) {
779     return Whatpm::Charset::DecodeHandle->uri_to_name (xml => $uri);
780     }
781    
782     return undef;
783     } # input_encoding
784 wakaba 1.1
785     sub onerror ($;$) {
786     if (@_ > 1) {
787     $_[0]->{onerror} = $_[1];
788     }
789    
790     return $_[0]->{onerror};
791     } # onerror
792    
793     sub ungetc ($$) {
794     unshift @{$_[0]->{character_queue}}, chr int ($_[1] or 0);
795     } # ungetc
796    
797     package Whatpm::Charset::DecodeHandle::EUCJP;
798     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
799    
800 wakaba 1.14 sub read ($$$;$) {
801 wakaba 1.1 my $self = $_[0];
802 wakaba 1.14 #my $scalar = $_[1];
803     my $length = $_[2];
804     my $offset = $_[3] || 0;
805     my $count = 0;
806     my $eof;
807     ## NOTE: It is incompatible with the standard Perl semantics
808     ## if $offset is greater than the length of $scalar.
809    
810     A: {
811     return $count if $length < 1;
812    
813     if (my $l = (length ${$self->{char_buffer}}) - $self->{char_buffer_pos}) {
814     if ($l >= $length) {
815     substr ($_[1], $offset)
816     = substr (${$self->{char_buffer}}, $self->{char_buffer_pos},
817     $length);
818     $count += $length;
819     $self->{char_buffer_pos} += $length;
820     $length = 0;
821     return $count;
822     } else {
823     substr ($_[1], $offset)
824     = substr (${$self->{char_buffer}}, $self->{char_buffer_pos});
825     $count += $l;
826     $length -= $l;
827     ${$self->{char_buffer}} = '';
828     $self->{char_buffer_pos} = 0;
829     }
830     $offset = length $_[1];
831     }
832 wakaba 1.1
833 wakaba 1.14 if ($eof) {
834     return $count;
835 wakaba 1.1 }
836 wakaba 1.14
837     my $error;
838     if ($self->{continue}) {
839     if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
840     length $self->{byte_buffer})) {
841     #
842     } else {
843     $error = 1;
844     }
845     $self->{continue} = 0;
846     } elsif (512 > length $self->{byte_buffer}) {
847     if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
848     length $self->{byte_buffer})) {
849     #
850     } else {
851     $eof = 1;
852 wakaba 1.1 }
853 wakaba 1.14 }
854    
855     unless ($error) {
856     my $string = Encode::decode ($self->{perl_encoding_name},
857     $self->{byte_buffer},
858     Encode::FB_QUIET ());
859     if (length $string) {
860     $self->{char_buffer} = \$string;
861     $self->{char_buffer_pos} = 0;
862     if (length $self->{byte_buffer}) {
863     $self->{continue} = 1;
864     }
865 wakaba 1.1 } else {
866 wakaba 1.14 if (length $self->{byte_buffer}) {
867     $error = 1;
868     } else {
869     ## NOTE: No further input.
870     redo A;
871     }
872 wakaba 1.1 }
873     }
874    
875 wakaba 1.14 if ($error) {
876     my $r = substr $self->{byte_buffer}, 0, 1, '';
877     my $etype = 'illegal-octets-error';
878     if ($r =~ /^[\xA1-\xFE]/) {
879     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
880     $r .= $1;
881     $etype = 'unassigned-code-point-error';
882     }
883     } elsif ($r eq "\x8F") {
884     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE][\xA1-\xFE]?)//) {
885     $r .= $1;
886     $etype = 'unassigned-code-point-error' if length $1 == 2;
887     }
888     } elsif ($r eq "\x8E") {
889     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
890     $r .= $1;
891     $etype = 'unassigned-code-point-error';
892     }
893     } elsif ($r eq "\xA0" or $r eq "\xFF") {
894 wakaba 1.1 $etype = 'unassigned-code-point-error';
895     }
896 wakaba 1.14 ## NOTE: Fixup line/column number by counting the number of
897     ## lines/columns in the string that is to be retuend by this
898     ## method call.
899     my $line_diff = 0;
900     my $col_diff = 0;
901     my $set_col;
902     for (my $i = 0; $i < $count; $i++) {
903     my $s = substr $_[1], $i - $count, 1;
904     if ($s eq "\x0D") {
905     $line_diff++;
906     $col_diff = 0;
907     $set_col = 1;
908     $i++ if substr ($_[1], $i - $count + 1, 1) eq "\x0A";
909     } elsif ($s eq "\x0A") {
910     $line_diff++;
911     $col_diff = 0;
912     $set_col = 1;
913     } else {
914     $col_diff++;
915     }
916     }
917     my $i = $self->{char_buffer_pos};
918     if ($count and substr (${$self->{char_buffer}}, -1, 1) eq "\x0D") {
919     if (substr (${$self->{char_buffer}}, $i, 1) eq "\x0A") {
920     $i++;
921     }
922 wakaba 1.1 }
923 wakaba 1.14 my $cb_length = length ${$self->{char_buffer}};
924     for (; $i < $cb_length; $i++) {
925     my $s = substr $_[1], $i, 1;
926     if ($s eq "\x0D") {
927     $line_diff++;
928     $col_diff = 0;
929     $set_col = 1;
930     $i++ if substr ($_[1], $i + 1, 1) eq "\x0A";
931     } elsif ($s eq "\x0A") {
932     $line_diff++;
933     $col_diff = 0;
934     $set_col = 1;
935     } else {
936     $col_diff++;
937     }
938 wakaba 1.1 }
939 wakaba 1.14 $self->{onerror}->($self, $etype, octets => \$r,
940     level => $self->{level}->{$self->{error_level}->{$etype}},
941     line_diff => $line_diff,
942     ($set_col ? (column => 1) : ()),
943     column_diff => $col_diff);
944     ## NOTE: Error handler may modify |octets| parameter, which
945     ## would be returned as part of the output. Note that what
946     ## is returned would affect what |manakai_read_until| returns.
947     ${$self->{char_buffer}} .= $r;
948 wakaba 1.1 }
949    
950 wakaba 1.14 redo A;
951     } # A
952 wakaba 1.9 } # read
953    
954 wakaba 1.1 package Whatpm::Charset::DecodeHandle::ISO2022JP;
955     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
956    
957     sub getc ($) {
958     my $self = $_[0];
959     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
960    
961     my $r;
962     A: {
963     my $error;
964     if ($self->{continue}) {
965 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
966     length $self->{byte_buffer})) {
967 wakaba 1.1 #
968     } else {
969     $error = 1;
970     }
971     $self->{continue} = 0;
972     } elsif (512 > length $self->{byte_buffer}) {
973 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
974     length $self->{byte_buffer});
975 wakaba 1.1 }
976    
977     unless ($error) {
978     if ($self->{byte_buffer} =~ s/^\x1B(\x24[\x40\x42]|\x28[\x42\x4A])//) {
979     $self->{state} = {
980     "\x24\x40" => 'state_2440',
981     "\x24\x42" => 'state_2442',
982     "\x28\x42" => 'state_2842',
983     "\x28\x4A" => 'state_284A',
984     }->{$1};
985     redo A;
986     } elsif ($self->{state} eq 'state_2842') { # IRV
987     if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
988     push @{$self->{character_queue}}, split //, $1;
989     $r = shift @{$self->{character_queue}};
990     } else {
991     if (length $self->{byte_buffer}) {
992     $error = 1;
993     } else {
994     $r = undef;
995     }
996     }
997     } elsif ($self->{state} eq 'state_284A') { # 0201
998     if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
999     my $v = $1;
1000     $v =~ tr/\x5C\x7E/\xA5\x{203E}/;
1001     push @{$self->{character_queue}}, split //, $v;
1002     $r = shift @{$self->{character_queue}};
1003     } else {
1004     if (length $self->{byte_buffer}) {
1005     $error = 1;
1006     } else {
1007     $r = undef;
1008     $self->{onerror}->($self, 'invalid-state-error',
1009 wakaba 1.7 state => $self->{state},
1010     level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}});
1011 wakaba 1.1 }
1012     }
1013     } elsif ($self->{state} eq 'state_2442') { # 1983
1014     my $v = Encode::decode ($self->{state_2442},
1015     $self->{byte_buffer},
1016     Encode::FB_QUIET ());
1017     if (length $v) {
1018     push @{$self->{character_queue}}, split //, $v;
1019     $r = shift @{$self->{character_queue}};
1020     } else {
1021     if (length $self->{byte_buffer}) {
1022     $error = 1;
1023     } else {
1024     $r = undef;
1025     $self->{onerror}->($self, 'invalid-state-error',
1026 wakaba 1.7 state => $self->{state},
1027     level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}});
1028 wakaba 1.1 }
1029     }
1030     } elsif ($self->{state} eq 'state_2440') { # 1978
1031     my $v = Encode::decode ($self->{state_2440},
1032     $self->{byte_buffer},
1033     Encode::FB_QUIET ());
1034     if (length $v) {
1035     push @{$self->{character_queue}}, split //, $v;
1036     $r = shift @{$self->{character_queue}};
1037     } else {
1038     if (length $self->{byte_buffer}) {
1039     $error = 1;
1040     } else {
1041     $r = undef;
1042     $self->{onerror}->($self, 'invalid-state-error',
1043 wakaba 1.7 state => $self->{state},
1044     level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}});
1045 wakaba 1.1 }
1046     }
1047     } else {
1048     $error = 1;
1049     }
1050     }
1051    
1052     if ($error) {
1053     $r = substr $self->{byte_buffer}, 0, 1, '';
1054     my $etype = 'illegal-octets-error';
1055     if (($self->{state} eq 'state_2442' or
1056     $self->{state} eq 'state_2440') and
1057     $r =~ /^[\x21-\x7E]/ and
1058     $self->{byte_buffer} =~ s/^([\x21-\x7E])//) {
1059     $r .= $1;
1060     $etype = 'unassigned-code-point-error';
1061     } elsif ($r eq "\x1B" and
1062     $self->{byte_buffer} =~ s/^\(H//) { # Old 0201
1063     $r .= "(H";
1064     $self->{state} = 'state_284A';
1065     }
1066 wakaba 1.7 $self->{onerror}->($self, $etype, octets => \$r,
1067     level => $self->{level}->{$self->{error_level}->{$etype}});
1068 wakaba 1.1 }
1069     } # A
1070    
1071     return $r;
1072     } # getc
1073    
1074 wakaba 1.9 ## TODO: This is not good for performance. Should be replaced
1075     ## by read-centric implementation.
1076     sub read ($$$;$) {
1077     #my ($self, $scalar, $length, $offset) = @_;
1078     my $length = $_[2];
1079     my $r = '';
1080     while ($length > 0) {
1081     my $c = $_[0]->getc;
1082     last unless defined $c;
1083     $r .= $c;
1084     $length--;
1085     }
1086     substr ($_[1], $_[3]) = $r;
1087     ## NOTE: This would do different thing from what Perl's |read| do
1088     ## if $offset points beyond the end of the $scalar.
1089     return length $r;
1090     } # read
1091    
1092 wakaba 1.12 sub manakai_read_until ($$$;$) {
1093     #my ($self, $scalar, $pattern, $offset) = @_;
1094     my $self = $_[0];
1095 wakaba 1.11 my $c = $self->getc;
1096 wakaba 1.12 if ($c =~ /^$_[2]/) {
1097     substr ($_[1], $_[3]) = $c;
1098     return 1;
1099 wakaba 1.11 } elsif (defined $c) {
1100     $self->ungetc (ord $c);
1101 wakaba 1.12 return 0;
1102 wakaba 1.11 } else {
1103 wakaba 1.12 return 0;
1104 wakaba 1.11 }
1105 wakaba 1.12 } # manakai_read_until
1106 wakaba 1.11
1107 wakaba 1.1 package Whatpm::Charset::DecodeHandle::ShiftJIS;
1108     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
1109    
1110     sub getc ($) {
1111     my $self = $_[0];
1112     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
1113    
1114     my $error;
1115     if ($self->{continue}) {
1116 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
1117     length $self->{byte_buffer})) {
1118 wakaba 1.1 #
1119     } else {
1120     $error = 1;
1121     }
1122     $self->{continue} = 0;
1123     } elsif (512 > length $self->{byte_buffer}) {
1124 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
1125     length $self->{byte_buffer});
1126 wakaba 1.1 }
1127    
1128     my $r;
1129     unless ($error) {
1130     my $string = Encode::decode ($self->{perl_encoding_name},
1131     $self->{byte_buffer},
1132     Encode::FB_QUIET ());
1133     if (length $string) {
1134     push @{$self->{character_queue}}, split //, $string;
1135     $r = shift @{$self->{character_queue}};
1136     if (length $self->{byte_buffer}) {
1137     $self->{continue} = 1;
1138     }
1139     } else {
1140     if (length $self->{byte_buffer}) {
1141     $error = 1;
1142     } else {
1143     $r = undef;
1144     }
1145     }
1146     }
1147    
1148     if ($error) {
1149     $r = substr $self->{byte_buffer}, 0, 1, '';
1150     my $etype = 'illegal-octets-error';
1151 wakaba 1.5 if ($r =~ /^[\x81-\x9F\xE0-\xFC]/) {
1152 wakaba 1.1 if ($self->{byte_buffer} =~ s/(.)//s) {
1153     $r .= $1; # not limited to \x40-\xFC - \x7F
1154     $etype = 'unassigned-code-point-error';
1155     }
1156 wakaba 1.5 ## NOTE: Range [\xF0-\xFC] is unassigned and may be used as a single-byte
1157     ## character or as the first-byte of a double-byte character according
1158     ## to JIS X 0208:1997 Appendix 1. However, the current practice is
1159     ## use the range as the first-byte of double-byte characters.
1160     } elsif ($r =~ /^[\x80\xA0\xFD-\xFF]/) {
1161 wakaba 1.1 $etype = 'unassigned-code-point-error';
1162     }
1163 wakaba 1.7 $self->{onerror}->($self, $etype, octets => \$r,
1164     level => $self->{level}->{$self->{error_level}->{$etype}});
1165 wakaba 1.1 }
1166    
1167     return $r;
1168     } # getc
1169    
1170 wakaba 1.9 ## TODO: This is not good for performance. Should be replaced
1171     ## by read-centric implementation.
1172     sub read ($$$;$) {
1173     #my ($self, $scalar, $length, $offset) = @_;
1174     my $length = $_[2];
1175     my $r = '';
1176     while ($length > 0) {
1177     my $c = $_[0]->getc;
1178     last unless defined $c;
1179     $r .= $c;
1180     $length--;
1181     }
1182     substr ($_[1], $_[3]) = $r;
1183     ## NOTE: This would do different thing from what Perl's |read| do
1184     ## if $offset points beyond the end of the $scalar.
1185 wakaba 1.11
1186 wakaba 1.9 return length $r;
1187     } # read
1188    
1189 wakaba 1.12 sub manakai_read_until ($$$;$) {
1190     #my ($self, $scalar, $pattern, $offset) = @_;
1191     my $self = $_[0];
1192 wakaba 1.11 my $c = $self->getc;
1193 wakaba 1.12 if ($c =~ /^$_[2]/) {
1194     substr ($_[1], $_[3]) = $c;
1195     return 1;
1196 wakaba 1.11 } elsif (defined $c) {
1197     $self->ungetc (ord $c);
1198 wakaba 1.12 return 0;
1199 wakaba 1.11 } else {
1200 wakaba 1.12 return 0;
1201 wakaba 1.11 }
1202 wakaba 1.12 } # manakai_read_until
1203 wakaba 1.11
1204 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us-ascii'} =
1205     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us'} =
1206     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso646-us'} =
1207     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:cp367'} =
1208     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ibm367'} =
1209     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1986'} =
1210     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1968'} =
1211     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-ir-6'} =
1212     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:csascii'} =
1213     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_646.irv:1991'} =
1214     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ascii'} = {ascii8 =>
1215     '1',
1216     is_block_safe =>
1217     '1',
1218     ietf_name =>
1219     ['ansi_x3.4-1968',
1220     'ansi_x3.4-1986',
1221     'ascii',
1222     'cp367',
1223     'csascii',
1224     'ibm367',
1225     'iso-ir-6',
1226     'iso646-us',
1227     'iso_646.irv:1991',
1228     'us',
1229     'us-ascii',
1230     'us-ascii'],
1231     mime_name =>
1232     'us-ascii',
1233     perl_name =>
1234     ['ascii',
1235     'iso-646-us',
1236     'us-ascii'],
1237     utf8_encoding_scheme =>
1238     '1',
1239     'uri',
1240     {'urn:x-suika-fam-cx:charset:ansi_x3.4-1968',
1241     '1',
1242     'urn:x-suika-fam-cx:charset:ansi_x3.4-1986',
1243     '1',
1244     'urn:x-suika-fam-cx:charset:ascii',
1245     '1',
1246     'urn:x-suika-fam-cx:charset:cp367',
1247     '1',
1248     'urn:x-suika-fam-cx:charset:csascii',
1249     '1',
1250     'urn:x-suika-fam-cx:charset:ibm367',
1251     '1',
1252     'urn:x-suika-fam-cx:charset:iso-ir-6',
1253     '1',
1254     'urn:x-suika-fam-cx:charset:iso646-us',
1255     '1',
1256     'urn:x-suika-fam-cx:charset:iso_646.irv:1991',
1257     '1',
1258     'urn:x-suika-fam-cx:charset:us',
1259     '1',
1260     'urn:x-suika-fam-cx:charset:us-ascii',
1261     '1'},
1262 wakaba 1.2 };
1263    
1264 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl'} = {perl_name =>
1265     ['ascii-ctrl'],
1266     'uri',
1267     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl',
1268     '1'}};
1269     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null'} = {perl_name =>
1270     ['null'],
1271     'uri',
1272     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null',
1273     '1'}};
1274     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8'} = {ascii8 =>
1275     '1',
1276     bom_allowed =>
1277     '1',
1278     no_bom_variant =>
1279     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1280     utf8_encoding_scheme =>
1281     '1',
1282     'uri',
1283     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8',
1284     '1'},
1285 wakaba 1.2 xml_name => 'UTF-8',
1286     };
1287    
1288 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279'} = {ascii8 =>
1289     '1',
1290     bom_allowed =>
1291     '1',
1292     no_bom_variant =>
1293     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1294     utf8_encoding_scheme =>
1295     '1',
1296     'uri',
1297     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279',
1298     '1'}};
1299     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8'} = {
1300     ascii8 => 1,
1301     is_block_safe =>
1302     '1',
1303     perl_name =>
1304     ['utf-8'],
1305     utf8_encoding_scheme =>
1306     '1',
1307     'uri',
1308     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
1309     '1'}};
1310     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-8'} = {
1311     ascii8 => 1,
1312     bom_allowed =>
1313     '1',
1314     no_bom_variant =>
1315     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
1316     ietf_name =>
1317     ['utf-8'],
1318     mime_name =>
1319     'utf-8',
1320     utf8_encoding_scheme =>
1321     '1',
1322     'uri',
1323     {'urn:x-suika-fam-cx:charset:utf-8',
1324     '1'},
1325 wakaba 1.2 };
1326    
1327 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8'} = {ascii8 =>
1328     '1',
1329     is_block_safe =>
1330     '1',
1331     perl_name =>
1332     ['utf8'],
1333     utf8_encoding_scheme =>
1334     '1',
1335     'uri',
1336     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1337     '1'}};
1338     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16'} = {
1339     ascii16 => 1,
1340     bom_allowed =>
1341     '1',
1342     bom_required =>
1343     '1',
1344     no_bom_variant =>
1345     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1346     no_bom_variant16be =>
1347     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1348     no_bom_variant16le =>
1349     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1350     perl_name =>
1351     ['utf-16'],
1352     'uri',
1353     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16',
1354     '1'},
1355 wakaba 1.2 xml_name => 'UTF-16',
1356     };
1357    
1358 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16'} = {
1359     ascii16 => 1,
1360     bom_allowed =>
1361     '1',
1362     no_bom_variant =>
1363     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1364     no_bom_variant16be =>
1365     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1366     no_bom_variant16le =>
1367     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1368     ietf_name =>
1369     ['utf-16'],
1370     mime_name =>
1371     'utf-16',
1372     'uri',
1373     {'urn:x-suika-fam-cx:charset:utf-16',
1374     '1'},
1375 wakaba 1.2 };
1376    
1377 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16be'} = {
1378     ascii16 => 1,
1379     ascii16be => 1,
1380     bom_allowed =>
1381     '1',
1382     no_bom_variant =>
1383     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1384     no_bom_variant16be =>
1385     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1386     ietf_name =>
1387     ['utf-16be'],
1388     mime_name =>
1389     'utf-16be',
1390     'uri',
1391     {'urn:x-suika-fam-cx:charset:utf-16be',
1392     '1'},
1393 wakaba 1.2 };
1394    
1395 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16le'} = {
1396     ascii16 => 1,
1397     ascii16le => 1,
1398     bom_allowed =>
1399     '1',
1400     no_bom_variant =>
1401     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1402     no_bom_variant16le =>
1403     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1404     ietf_name =>
1405     ['utf-16le'],
1406     mime_name =>
1407     'utf-16le',
1408     'uri',
1409     {'urn:x-suika-fam-cx:charset:utf-16le',
1410     '1'},
1411 wakaba 1.2 };
1412    
1413 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be'} = {
1414     ascii16 => 1,
1415     ascii16be => 1,
1416     is_block_safe =>
1417     '1',
1418     perl_name =>
1419     ['utf-16be'],
1420     'uri',
1421     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1422     '1'}};
1423     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le'} = {
1424     ascii16 => 1,
1425     ascii16le => 1,
1426     is_block_safe =>
1427     '1',
1428     perl_name =>
1429     ['utf-16le'],
1430     'uri',
1431     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1432     '1'}};
1433     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-2'} = {
1434     ascii16 => 1,
1435     bom_allowed =>
1436     '1',
1437     no_bom_variant =>
1438     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1439     no_bom_variant16be =>
1440     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1441     no_bom_variant16le =>
1442     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1443     ietf_name =>
1444     ['csunicode',
1445     'iso-10646-ucs-2'],
1446     mime_name =>
1447     'iso-10646-ucs-2',
1448     'uri',
1449     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2',
1450     '1',
1451     'urn:x-suika-fam-cx:charset:iso-10646-ucs-2',
1452     '1'},
1453 wakaba 1.2 xml_name => 'ISO-10646-UCS-2',
1454     };
1455    
1456 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be'} = {
1457     ascii16 => 1,
1458     ascii16be => 1,
1459     is_block_safe =>
1460     '1',
1461     perl_name =>
1462     ['ucs-2be'],
1463     'uri',
1464     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1465     '1'}};
1466     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le'} = {
1467     ascii16 => 1,
1468     ascii16le => 1,
1469     is_block_safe =>
1470     '1',
1471     perl_name =>
1472     ['ucs-2le'],
1473     'uri',
1474     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1475     '1'}};
1476     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-4'} = {
1477     ascii32 => 1,
1478     bom_allowed =>
1479     '1',
1480     no_bom_variant =>
1481     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1482     no_bom_variant32endian1234 =>
1483     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1484     no_bom_variant32endian4321 =>
1485     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1486     ietf_name =>
1487     ['csucs4',
1488     'iso-10646-ucs-4'],
1489     mime_name =>
1490     'iso-10646-ucs-4',
1491     'uri',
1492     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4',
1493     '1',
1494     'urn:x-suika-fam-cx:charset:iso-10646-ucs-4',
1495     '1'},
1496 wakaba 1.2 xml_name => 'ISO-10646-UCS-4',
1497     };
1498    
1499 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be'} = {
1500     ascii32 => 1,
1501     ascii32endian1234 => 1,
1502     is_block_safe =>
1503     '1',
1504     perl_name =>
1505     ['ucs-4be',
1506     'utf-32be'],
1507     'uri',
1508     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1509     '1'}};
1510     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le'} = {
1511     ascii32 => 1,
1512     ascii32endian4321 => 1,
1513     is_block_safe =>
1514     '1',
1515     perl_name =>
1516     ['ucs-4le',
1517     'utf-32le'],
1518     'uri',
1519     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1520     '1'}};
1521     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_8859-1:1987'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1'} = {ascii8 =>
1522     '1',
1523     is_block_safe =>
1524     '1',
1525     ietf_name =>
1526     ['cp819',
1527     'csisolatin1',
1528     'ibm819',
1529     'iso-8859-1',
1530     'iso-8859-1',
1531     'iso-ir-100',
1532     'iso_8859-1',
1533     'iso_8859-1:1987',
1534     'l1',
1535     'latin1'],
1536     mime_name =>
1537     'iso-8859-1',
1538     perl_name =>
1539     ['iso-8859-1',
1540     'latin1'],
1541     'uri',
1542     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1',
1543     '1',
1544     'urn:x-suika-fam-cx:charset:iso_8859-1:1987',
1545     '1'},
1546 wakaba 1.2 xml_name => 'ISO-8859-1',
1547     };
1548    
1549 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2'} = {ascii8 =>
1550     '1',
1551     is_block_safe =>
1552     '1',
1553     'uri',
1554     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2',
1555     '1'},
1556 wakaba 1.2 xml_name => 'ISO-8859-2',
1557     };
1558    
1559 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3'} = {ascii8 =>
1560     '1',
1561     is_block_safe =>
1562     '1',
1563     'uri',
1564     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3',
1565     '1'},
1566 wakaba 1.2 xml_name => 'ISO-8859-3',
1567     };
1568    
1569 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4'} = {ascii8 =>
1570     '1',
1571     is_block_safe =>
1572     '1',
1573     'uri',
1574     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4',
1575     '1'},
1576 wakaba 1.2 xml_name => 'ISO-8859-4',
1577     };
1578    
1579 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5'} = {ascii8 =>
1580     '1',
1581     is_block_safe =>
1582     '1',
1583     'uri',
1584     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5',
1585     '1'},
1586 wakaba 1.2 xml_name => 'ISO-8859-5',
1587     };
1588    
1589 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6'} = {ascii8 =>
1590     '1',
1591     is_block_safe =>
1592     '1',
1593     'uri',
1594     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6',
1595     '1'},
1596 wakaba 1.2 xml_name => 'ISO-8859-6',
1597     };
1598    
1599 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7'} = {ascii8 =>
1600     '1',
1601     is_block_safe =>
1602     '1',
1603     'uri',
1604     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7',
1605     '1'},
1606 wakaba 1.2 xml_name => 'ISO-8859-7',
1607     };
1608    
1609 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8'} = {ascii8 =>
1610     '1',
1611     is_block_safe =>
1612     '1',
1613     'uri',
1614     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8',
1615     '1'},
1616 wakaba 1.2 xml_name => 'ISO-8859-8',
1617     };
1618    
1619 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9'} = {ascii8 =>
1620     '1',
1621     is_block_safe =>
1622     '1',
1623     'uri',
1624     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9',
1625     '1'},
1626 wakaba 1.2 xml_name => 'ISO-8859-9',
1627     };
1628    
1629 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10'} = {ascii8 =>
1630     '1',
1631     is_block_safe =>
1632     '1',
1633     'uri',
1634     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10',
1635     '1'},
1636 wakaba 1.2 xml_name => 'ISO-8859-10',
1637     };
1638    
1639 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11'} = {ascii8 =>
1640     '1',
1641     is_block_safe =>
1642     '1',
1643     'uri',
1644     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11',
1645     '1'},
1646 wakaba 1.2 xml_name => 'ISO-8859-11',
1647     };
1648    
1649 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13'} = {ascii8 =>
1650     '1',
1651     is_block_safe =>
1652     '1',
1653     'uri',
1654     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13',
1655     '1'},
1656 wakaba 1.2 xml_name => 'ISO-8859-13',
1657     };
1658    
1659 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14'} = {ascii8 =>
1660     '1',
1661     is_block_safe =>
1662     '1',
1663     'uri',
1664     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14',
1665     '1'},
1666 wakaba 1.2 xml_name => 'ISO-8859-14',
1667     };
1668    
1669 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15'} = {ascii8 =>
1670     '1',
1671     is_block_safe =>
1672     '1',
1673     'uri',
1674     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15',
1675     '1'},
1676 wakaba 1.2 xml_name => 'ISO-8859-15',
1677     };
1678    
1679 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16'} = {ascii8 =>
1680     '1',
1681     is_block_safe =>
1682     '1',
1683     'uri',
1684     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16',
1685     '1'},
1686 wakaba 1.2 xml_name => 'ISO-8859-16',
1687     };
1688    
1689 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp'} = {ascii8 =>
1690     '1',
1691     'uri',
1692     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp',
1693     '1'},
1694 wakaba 1.2 xml_name => 'ISO-2022-JP',
1695     };
1696    
1697 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-2022-jp'} = {ascii8 =>
1698     '1',
1699     ietf_name =>
1700     ['csiso2022jp',
1701     'iso-2022-jp',
1702     'iso-2022-jp'],
1703     mime_name =>
1704     'iso-2022-jp',
1705     'uri',
1706     {'urn:x-suika-fam-cx:charset:iso-2022-jp',
1707     '1'},
1708 wakaba 1.2 };
1709    
1710 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp'} = {ascii8 =>
1711     '1',
1712     perl_name =>
1713     ['iso-2022-jp'],
1714     'uri',
1715     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp',
1716     '1'}};
1717     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:shift_jis'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis'} = {ascii8 =>
1718     '1',
1719     is_block_safe =>
1720     '1',
1721     ietf_name =>
1722     ['csshiftjis',
1723     'ms_kanji',
1724     'shift_jis',
1725     'shift_jis'],
1726     mime_name =>
1727     'shift_jis',
1728     perl_name =>
1729     ['shift-jis-1997'],
1730     'uri',
1731     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis',
1732     '1',
1733     'urn:x-suika-fam-cx:charset:shift_jis',
1734     '1'},
1735 wakaba 1.2 xml_name => 'Shift_JIS',
1736     };
1737    
1738 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis'} = {ascii8 =>
1739     '1',
1740     is_block_safe =>
1741     '1',
1742     perl_name =>
1743     ['shiftjis',
1744     'sjis'],
1745     'uri',
1746     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis',
1747     '1'}};
1748     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:euc-jp'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese'} = {ascii8 =>
1749     '1',
1750     is_block_safe =>
1751     '1',
1752     ietf_name =>
1753     ['cseucpkdfmtjapanese',
1754     'euc-jp',
1755     'euc-jp',
1756     'extended_unix_code_packed_format_for_japanese'],
1757     mime_name =>
1758     'euc-jp',
1759     perl_name =>
1760     ['euc-jp-1997'],
1761     'uri',
1762     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp',
1763     '1',
1764     'urn:x-suika-fam-cx:charset:euc-jp',
1765     '1',
1766     'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese',
1767     '1'},
1768 wakaba 1.2 xml_name => 'EUC-JP',
1769     };
1770    
1771 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp'} = {ascii8 =>
1772     '1',
1773     is_block_safe =>
1774     '1',
1775     perl_name =>
1776     ['euc-jp',
1777     'ujis'],
1778     'uri',
1779     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp',
1780     '1'}};
1781    
1782     1;
1783 wakaba 1.14 ## $Date: 2008/09/14 03:59:08 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24