/[suikacvs]/markup/html/whatpm/Whatpm/Charset/DecodeHandle.pm
Suika

Contents of /markup/html/whatpm/Whatpm/Charset/DecodeHandle.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.15 - (hide annotations) (download)
Sun Sep 14 06:58:28 2008 UTC (16 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.14: +102 -186 lines
++ whatpm/Whatpm/ChangeLog	14 Sep 2008 06:57:36 -0000
	* HTML.pm.src: It turns out that U+FFFD don't have to
	be added to the list of excluded characters.

2008-09-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/Charset/ChangeLog	14 Sep 2008 06:58:07 -0000
	* DecodeHandle.pm: Merge the EUCJP class into the Encode class.

2008-09-14  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::Charset::DecodeHandle;
2     use strict;
3    
4 wakaba 1.9 ## NOTE: |Message::Charset::Info| uses this module without calling
5     ## the constructor.
6 wakaba 1.15 use Message::Charset::Info;
7 wakaba 1.9
8 wakaba 1.1 my $XML_AUTO_CHARSET = q<http://suika.fam.cx/www/2006/03/xml-entity/>;
9     my $IANA_CHARSET = q<urn:x-suika-fam-cx:charset:>;
10     my $PERL_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.>;
11     my $XML_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.>;
12    
13     ## ->create_decode_handle ($charset_uri, $byte_stream, $onerror)
14     sub create_decode_handle ($$$;$) {
15     my $csdef = $Whatpm::Charset::CharsetDef->{$_[1]};
16     my $obj = {
17 wakaba 1.15 category => 0,
18 wakaba 1.9 char_buffer => \(my $s = ''),
19 wakaba 1.11 char_buffer_pos => 0,
20 wakaba 1.1 character_queue => [],
21     filehandle => $_[2],
22     charset => $_[1],
23     byte_buffer => '',
24     onerror => $_[3] || sub {},
25     };
26     if ($csdef->{uri}->{$XML_AUTO_CHARSET} or
27     $obj->{charset} eq $XML_AUTO_CHARSET) {
28     my $b = ''; # UTF-8 w/o BOM
29     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
30 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
31 wakaba 1.1 if (read $obj->{filehandle}, $b, 256) {
32     no warnings "substr";
33     no warnings "uninitialized";
34     if (substr ($b, 0, 1) eq "<") {
35     if (substr ($b, 1, 1) eq "?") { # ASCII8
36     if ($b =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
37     encoding\s*=\s*["']([^"']*)/x) {
38 wakaba 1.2 $obj->{input_encoding} = $1;
39 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
40     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
41     if (not $csdef->{ascii8} or $csdef->{bom_required}) {
42     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
43     charset_uri => $uri,
44     charset_name => $obj->{input_encoding});
45     }
46     } else {
47     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
48 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
49 wakaba 1.1 }
50     if (defined $csdef->{no_bom_variant}) {
51     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
52     }
53     } elsif (substr ($b, 1, 1) eq "\x00") {
54     if (substr ($b, 2, 2) eq "?\x00") { # ASCII16LE
55     my $c = $b; $c =~ tr/\x00//d;
56     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
57     encoding\s*=\s*["']([^"']*)/x) {
58 wakaba 1.2 $obj->{input_encoding} = $1;
59 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
60     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
61     if (not $csdef->{ascii16} or $csdef->{ascii16be} or
62     $csdef->{bom_required}) {
63     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
64     charset_uri => $uri,
65     charset_name => $obj->{input_encoding});
66     }
67     } else {
68     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
69 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
70 wakaba 1.1 }
71     if (defined $csdef->{no_bom_variant16le}) {
72     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
73     }
74     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
75     my $c = $b; $c =~ tr/\x00//d;
76     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
77     encoding\s*=\s*["']([^"']*)/x) {
78 wakaba 1.2 $obj->{input_encoding} = $1;
79 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
80     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
81     if (not $csdef->{ascii32} or
82     $csdef->{ascii32endian1234} or
83     $csdef->{ascii32endian2143} or
84     $csdef->{ascii32endian3412} or
85     $csdef->{bom_required}) {
86     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
87     charset_uri => $uri,
88     charset_name => $obj->{input_encoding});
89     }
90     } else {
91     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
92 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
93 wakaba 1.1 }
94     if (defined $csdef->{no_bom_variant32endian4321}) {
95     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
96     }
97     }
98     }
99     } elsif (substr ($b, 0, 3) eq "\xEF\xBB\xBF") { # UTF8
100     $obj->{has_bom} = 1;
101     substr ($b, 0, 3) = '';
102     my $c = $b;
103     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
104     encoding\s*=\s*["']([^"']*)/x) {
105 wakaba 1.2 $obj->{input_encoding} = $1;
106 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
107     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
108     if (not $csdef->{utf8_encoding_scheme} or
109     not $csdef->{bom_allowed}) {
110     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
111     charset_uri => $uri,
112     charset_name => $obj->{input_encoding});
113     }
114     } else {
115     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
116 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
117 wakaba 1.1 }
118     if (defined $csdef->{no_bom_variant}) {
119     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
120     }
121     } elsif (substr ($b, 0, 2) eq "\x00<") {
122     if (substr ($b, 2, 2) eq "\x00?") { # ASCII16BE
123     my $c = $b; $c =~ tr/\x00//d;
124     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
125     encoding\s*=\s*["']([^"']*)/x) {
126 wakaba 1.2 $obj->{input_encoding} = $1;
127 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
128     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
129     if (not $csdef->{ascii16} or $csdef->{ascii16le} or
130     $csdef->{bom_required}) {
131     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
132     charset_uri => $uri,
133     charset_name => $obj->{input_encoding});
134     }
135     } else {
136     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
137 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
138 wakaba 1.1 }
139     if (defined $csdef->{no_bom_variant16be}) {
140     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
141     }
142     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
143     my $c = $b; $c =~ tr/\x00//d;
144     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
145     encoding\s*=\s*["']([^"']*)/x) {
146 wakaba 1.2 $obj->{input_encoding} = $1;
147 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
148     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
149     if (not $csdef->{ascii32} or
150     $csdef->{ascii32endian1234} or
151     $csdef->{ascii32endian2143} or
152     $csdef->{ascii32endian4321} or
153     $csdef->{bom_required}) {
154     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
155     charset_uri => $uri,
156     charset_name => $obj->{input_encoding});
157     }
158     } else {
159     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
160 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
161 wakaba 1.1 }
162     if (defined $csdef->{no_bom_variant32endian3412}) {
163     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
164     }
165     }
166     } elsif (substr ($b, 0, 2) eq "\xFE\xFF") {
167     if (substr ($b, 2, 2) eq "\x00<") { # ASCII16BE
168     $obj->{has_bom} = 1;
169     substr ($b, 0, 2) = '';
170     my $c = $b; $c =~ tr/\x00//d;
171     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
172     encoding\s*=\s*["']([^"']*)/x) {
173 wakaba 1.2 $obj->{input_encoding} = $1;
174 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
175     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
176     if (not $csdef->{ascii16} or
177     $csdef->{ascii16le} or
178     not $csdef->{bom_allowed}) {
179     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
180     charset_uri => $uri,
181     charset_name => $obj->{input_encoding});
182     }
183     } else {
184     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
185 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
186 wakaba 1.1 }
187     if (defined $csdef->{no_bom_variant16be}) {
188     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
189     }
190     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
191     $obj->{has_bom} = 1;
192     substr ($b, 0, 4) = '';
193     my $c = $b; $c =~ tr/\x00//d;
194     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
195     encoding\s*=\s*["']([^"']*)/x) {
196 wakaba 1.2 $obj->{input_encoding} = $1;
197 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
198     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
199     if (not $csdef->{ascii32} or
200     $csdef->{ascii32endian1234} or
201     $csdef->{ascii32endian2143} or
202     $csdef->{ascii32endian4321} or
203     not $csdef->{bom_allowed}) {
204     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
205     charset_uri => $uri,
206     charset_name => $obj->{input_encoding});
207     }
208     } else {
209     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
210 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
211 wakaba 1.1 $obj->{byte_buffer} .= "\x00\x00";
212     }
213     if (defined $csdef->{no_bom_variant32endian3412}) {
214     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
215     }
216     } else {
217     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
218 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
219 wakaba 1.1 substr ($b, 0, 2) = '';
220     $obj->{has_bom} = 1;
221     }
222     } elsif (substr ($b, 0, 2) eq "\xFF\xFE") {
223     if (substr ($b, 2, 2) eq "<\x00") { # ASCII16LE
224     $obj->{has_bom} = 1;
225     substr ($b, 0, 2) = '';
226     my $c = $b; $c =~ tr/\x00//d;
227     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
228     encoding\s*=\s*["']([^"']*)/x) {
229 wakaba 1.2 $obj->{input_encoding} = $1;
230 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
231     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
232     if (not $csdef->{ascii16} or
233     $csdef->{ascii16be} or
234     not $csdef->{bom_allowed}) {
235     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
236     charset_uri => $uri,
237     charset_name => $obj->{input_encoding});
238     }
239     } else {
240     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
241 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
242 wakaba 1.1 }
243     if (defined $csdef->{no_bom_variant16le}) {
244     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
245     }
246     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
247     $obj->{has_bom} = 1;
248     substr ($b, 0, 4) = '';
249     my $c = $b; $c =~ tr/\x00//d;
250     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
251     encoding\s*=\s*["']([^"']*)/x) {
252 wakaba 1.2 $obj->{input_encoding} = $1;
253 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
254     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
255     if (not $csdef->{ascii32} or
256     $csdef->{ascii32endian1234} or
257     $csdef->{ascii32endian2143} or
258     $csdef->{ascii32endian3412} or
259     not $csdef->{bom_allowed}) {
260     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
261     charset_uri => $uri,
262     charset_name => $obj->{input_encoding});
263     }
264     } else {
265     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
266 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
267 wakaba 1.1 $obj->{byte_buffer} .= "\x00\x00";
268     }
269     if (defined $csdef->{no_bom_variant32endian4321}) {
270     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
271     }
272     } else {
273     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
274 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
275 wakaba 1.1 substr ($b, 0, 2) = '';
276     $obj->{has_bom} = 1;
277     }
278     } elsif (substr ($b, 0, 2) eq "\x00\x00") {
279     if (substr ($b, 2, 2) eq "\x00<") { # ASCII32Endian1234
280     my $c = $b; $c =~ tr/\x00//d;
281     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
282     encoding\s*=\s*["']([^"']*)/x) {
283 wakaba 1.2 $obj->{input_encoding} = $1;
284 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
285     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
286     if (not $csdef->{ascii32} or
287     $csdef->{ascii32endian2143} or
288     $csdef->{ascii32endian3412} or
289     $csdef->{ascii32endian4321} or
290     $csdef->{bom_required}) {
291     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
292     charset_uri => $uri,
293     charset_name => $obj->{input_encoding});
294     }
295     } else {
296     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
297 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
298 wakaba 1.1 }
299     if (defined $csdef->{no_bom_variant32endian1234}) {
300     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
301     }
302     } elsif (substr ($b, 2, 2) eq "<\x00") { # ASCII32Endian2143
303     my $c = $b; $c =~ tr/\x00//d;
304     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
305     encoding\s*=\s*["']([^"']*)/x) {
306 wakaba 1.2 $obj->{input_encoding} = $1;
307 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
308     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
309     if (not $csdef->{ascii32} or
310     $csdef->{ascii32endian1234} or
311     $csdef->{ascii32endian3412} or
312     $csdef->{ascii32endian4321} or
313     $csdef->{bom_required}) {
314     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
315     charset_uri => $uri,
316     charset_name => $obj->{input_encoding});
317     }
318     } else {
319     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
320 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
321 wakaba 1.1 }
322     if (defined $csdef->{no_bom_variant32endian2143}) {
323     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
324     }
325     } elsif (substr ($b, 2, 2) eq "\xFE\xFF") { # ASCII32Endian1234
326     $obj->{has_bom} = 1;
327     substr ($b, 0, 4) = '';
328     my $c = $b; $c =~ tr/\x00//d;
329     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
330     encoding\s*=\s*["']([^"']*)/x) {
331 wakaba 1.2 $obj->{input_encoding} = $1;
332 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
333     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
334     if (not $csdef->{ascii32} or
335     $csdef->{ascii32endian2143} or
336     $csdef->{ascii32endian3412} or
337     $csdef->{ascii32endian4321} or
338     $csdef->{bom_required}) {
339     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
340     charset_uri => $uri,
341     charset_name => $obj->{input_encoding});
342     }
343     } else {
344     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
345 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
346 wakaba 1.1 $obj->{has_bom} = 0;
347     $obj->{byte_buffer} .= "\x00\x00\xFE\xFF";
348     }
349     if (defined $csdef->{no_bom_variant32endian1234}) {
350     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
351     }
352     } elsif (substr ($b, 2, 2) eq "\xFF\xFE") { # ASCII32Endian2143
353     $obj->{has_bom} = 1;
354     substr ($b, 0, 4) = '';
355     my $c = $b; $c =~ tr/\x00//d;
356     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
357     encoding\s*=\s*["']([^"']*)/x) {
358 wakaba 1.2 $obj->{input_encoding} = $1;
359 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
360     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
361     if (not $csdef->{ascii32} or
362     $csdef->{ascii32endian1234} or
363     $csdef->{ascii32endian3412} or
364     $csdef->{ascii32endian4321} or
365     $csdef->{bom_required}) {
366     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
367     charset_uri => $uri,
368     charset_name => $obj->{input_encoding});
369     }
370     } else {
371     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
372 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
373 wakaba 1.1 $obj->{has_bom} = 0;
374     $obj->{byte_buffer} .= "\x00\x00\xFF\xFE";
375     }
376     if (defined $csdef->{no_bom_variant32endian2143}) {
377     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
378     }
379     }
380     # \x4C\x6F\xA7\x94 EBCDIC
381     } # buffer
382     $obj->{byte_buffer} .= $b;
383     } # read
384     } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-8'}) {
385     ## BOM is optional.
386     my $b = '';
387     if (read $obj->{filehandle}, $b, 3) {
388     if ($b eq "\xEF\xBB\xBF") {
389     $obj->{has_bom} = 1;
390     } else {
391     $obj->{byte_buffer} .= $b;
392     }
393     }
394     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; # UTF-8 w/o BOM
395     } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-16'}) {
396     ## BOM is mandated.
397     my $b = '';
398     if (read $obj->{filehandle}, $b, 2) {
399     if ($b eq "\xFE\xFF") {
400     $obj->{has_bom} = 1; # UTF-16BE w/o BOM
401     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
402     } elsif ($b eq "\xFF\xFE") {
403     $obj->{has_bom} = 1; # UTF-16LE w/o BOM
404     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
405     } else {
406     $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
407     $obj->{has_bom} = 0;
408     $obj->{byte_buffer} .= $b; # UTF-16BE w/o BOM
409     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
410     }
411     } else {
412     $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
413     $obj->{has_bom} = 0; # UTF-16BE w/o BOM
414     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
415     }
416     }
417    
418     if ($csdef->{uri}->{$XML_CHARSET.'iso-2022-jp'}) {
419     $obj->{state_2440} = 'gl-jis-1997-swapped';
420     $obj->{state_2442} = 'gl-jis-1997';
421     $obj->{state} = 'state_2842';
422     require Encode::GLJIS1997Swapped;
423     require Encode::GLJIS1997;
424     if (Encode::find_encoding ($obj->{state_2440}) and
425     Encode::find_encoding ($obj->{state_2442})) {
426     return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
427     }
428     } elsif ($csdef->{uri}->{$IANA_CHARSET.'iso-2022-jp'}) {
429     $obj->{state_2440} = 'gl-jis-1978';
430     $obj->{state_2442} = 'gl-jis-1983';
431     $obj->{state} = 'state_2842';
432     require Encode::GLJIS1978;
433     require Encode::GLJIS1983;
434     if (Encode::find_encoding ($obj->{state_2440}) and
435     Encode::find_encoding ($obj->{state_2442})) {
436     return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
437     }
438     } elsif (defined $csdef->{perl_name}->[0]) {
439     if ($csdef->{uri}->{$XML_CHARSET.'euc-jp'} or
440     $csdef->{uri}->{$IANA_CHARSET.'euc-jp'}) {
441     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
442     require Encode::EUCJP1997;
443     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
444 wakaba 1.15 $obj->{category} |= Message::Charset::Info::CHARSET_CATEGORY_EUCJP;
445     return bless $obj, 'Whatpm::Charset::DecodeHandle::Encode';
446 wakaba 1.1 }
447     } elsif ($csdef->{uri}->{$XML_CHARSET.'shift_jis'} or
448     $csdef->{uri}->{$IANA_CHARSET.'shift_jis'}) {
449     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
450     require Encode::ShiftJIS1997;
451     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
452     return bless $obj, 'Whatpm::Charset::DecodeHandle::ShiftJIS';
453     }
454     } elsif ($csdef->{is_block_safe}) {
455     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
456     require Encode;
457     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
458     return bless $obj, 'Whatpm::Charset::DecodeHandle::Encode';
459     }
460     }
461     }
462    
463     $obj->{onerror}->(undef, 'charset-not-supported-error',
464     charset_uri => $obj->{charset});
465     return undef;
466     } # create_decode_handle
467    
468     sub name_to_uri ($$$) {
469     my $domain = $_[1];
470     my $name = lc $_[2];
471    
472     if ($domain eq 'ietf') {
473     return $IANA_CHARSET . $name;
474     } elsif ($domain eq 'xml') {
475     if ({
476     'utf-8' => 1,
477     'utf-16' => 1,
478     'iso-10646-ucs-2' => 1,
479     'iso-10646-ucs-4' => 1,
480     'iso-8859-1' => 1,
481     'iso-8859-2' => 1,
482     'iso-8859-3' => 1,
483     'iso-8859-4' => 1,
484     'iso-8859-5' => 1,
485     'iso-8859-6' => 1,
486     'iso-8859-7' => 1,
487     'iso-8859-8' => 1,
488     'iso-8859-9' => 1,
489     'iso-8859-10' => 1,
490     'iso-8859-11' => 1,
491     'iso-8859-13' => 1,
492     'iso-8859-14' => 1,
493     'iso-8859-15' => 1,
494     'iso-8859-16' => 1,
495     'iso-2022-jp' => 1,
496     'shift_jis' => 1,
497     'euc-jp' => 1,
498     }->{$name}) {
499     return $XML_CHARSET . $name;
500     }
501    
502     my $uri = $IANA_CHARSET . $name;
503     return $uri if $Whatpm::Charset::CharsetDef->{$uri};
504    
505     return $XML_CHARSET . $name;
506     } else {
507     return undef;
508     }
509     } # name_to_uri
510    
511     sub uri_to_name ($$$) {
512     my (undef, $domain, $uri) = @_;
513 wakaba 1.2
514     if ($domain eq 'xml') {
515     my $v = $Whatpm::Charset::CharsetDef->{$uri}->{xml_name};
516     return $v if defined $v;
517    
518     if (substr ($uri, 0, length $XML_CHARSET) eq $XML_CHARSET) {
519     return substr ($uri, length $XML_CHARSET);
520     }
521    
522     $domain = 'ietf'; ## TODO: XML encoding name has smaller range
523     }
524    
525     if ($domain eq 'ietf') {
526     my $v = $Whatpm::Charset::CharsetDef->{$uri}->{iana_name};
527     return $v->[0] if defined $v;
528    
529     if (substr ($uri, 0, length $IANA_CHARSET) eq $IANA_CHARSET) {
530 wakaba 1.1 return substr ($uri, length $IANA_CHARSET);
531     }
532     }
533    
534     return undef;
535     } # uri_to_name
536    
537 wakaba 1.3 require IO::Handle;
538    
539     package Whatpm::Charset::DecodeHandle::ByteBuffer;
540    
541 wakaba 1.7 ## NOTE: Provides a byte buffer wrapper object.
542    
543 wakaba 1.3 sub new ($$) {
544     my $self = bless {
545     buffer => '',
546     }, shift;
547     $self->{filehandle} = shift;
548     return $self;
549     } # new
550    
551     sub read {
552     my $self = shift;
553     my $pos = length $self->{buffer};
554     my $r = $self->{filehandle}->read ($self->{buffer}, $_[1], $pos);
555     substr ($_[0], $_[2]) = substr ($self->{buffer}, $pos);
556 wakaba 1.8 ## NOTE: This would do different behavior from Perl's standard
557     ## |read| when $pos points beyond the end of the string.
558 wakaba 1.3 return $r;
559     } # read
560    
561     sub close { $_[0]->{filehandle}->close }
562    
563 wakaba 1.11 package Whatpm::Charset::DecodeHandle::CharString;
564    
565     ## NOTE: Same as Perl's standard |open $handle, '<', \$char_string|,
566     ## but supports |ungetc| and other extensions.
567    
568     sub new ($$) {
569     my $self = bless {pos => 0}, shift;
570     $self->{string} = shift; # must be a scalar ref
571     return $self;
572     } # new
573    
574     sub getc ($) {
575     my $self = shift;
576     if ($self->{pos} < length ${$self->{string}}) {
577     return substr ${$self->{string}}, $self->{pos}++, 1;
578     } else {
579     return undef;
580     }
581     } # getc
582    
583     sub read ($$$$) {
584     #my ($self, $scalar, $length, $offset) = @_;
585     my $self = $_[0];
586     my $length = $_[2] || 0;
587     my $offset = $_[3];
588     ## NOTE: We don't support standard Perl semantics if $offset is
589     ## greater than the length of $scalar.
590     substr ($_[1], $offset) = substr (${$self->{string}}, $self->{pos}, $length);
591     my $count = (length $_[1]) - $offset;
592     $self->{pos} += $count;
593     return $count;
594     } # read
595    
596 wakaba 1.12 sub manakai_read_until ($$$;$) {
597     #my ($self, $scalar, $pattern, $offset) = @_;
598     my $self = $_[0];
599 wakaba 1.11 pos (${$self->{string}}) = $self->{pos};
600 wakaba 1.12 if (${$self->{string}} =~ /\G(?>$_[2])+/) {
601     substr ($_[1], $_[3]) = substr (${$self->{string}}, $-[0], $+[0] - $-[0]);
602 wakaba 1.11 $self->{pos} += $+[0] - $-[0];
603 wakaba 1.12 return $+[0] - $-[0];
604 wakaba 1.11 } else {
605 wakaba 1.12 return 0;
606 wakaba 1.11 }
607 wakaba 1.12 } # manakai_read_until
608 wakaba 1.11
609     sub ungetc ($$) {
610     my $self = shift;
611     ## Ignore second parameter.
612     $self->{pos}-- if $self->{pos} > 0;
613     } # ungetc
614    
615     sub close ($) { }
616    
617 wakaba 1.1 package Whatpm::Charset::DecodeHandle::Encode;
618    
619 wakaba 1.7 ## NOTE: Provides a Perl |Encode| module wrapper object.
620    
621 wakaba 1.1 sub charset ($) { $_[0]->{charset} }
622    
623 wakaba 1.3 sub close ($) { $_[0]->{filehandle}->close }
624 wakaba 1.1
625     sub getc ($) {
626 wakaba 1.9 my $c = '';
627     my $l = $_[0]->read ($c, 1);
628     if ($l) {
629     return $c;
630     } else {
631     return undef;
632     }
633     } # getc
634    
635     sub read ($$$;$) {
636 wakaba 1.1 my $self = $_[0];
637 wakaba 1.14 #my $scalar = $_[1];
638     my $length = $_[2];
639     my $offset = $_[3] || 0;
640     my $count = 0;
641     my $eof;
642     ## NOTE: It is incompatible with the standard Perl semantics
643     ## if $offset is greater than the length of $scalar.
644    
645     A: {
646     return $count if $length < 1;
647    
648     if (my $l = (length ${$self->{char_buffer}}) - $self->{char_buffer_pos}) {
649     if ($l >= $length) {
650     substr ($_[1], $offset)
651     = substr (${$self->{char_buffer}}, $self->{char_buffer_pos},
652     $length);
653     $count += $length;
654     $self->{char_buffer_pos} += $length;
655     $length = 0;
656     return $count;
657     } else {
658     substr ($_[1], $offset)
659     = substr (${$self->{char_buffer}}, $self->{char_buffer_pos});
660     $count += $l;
661     $length -= $l;
662     ${$self->{char_buffer}} = '';
663     $self->{char_buffer_pos} = 0;
664     }
665     $offset = length $_[1];
666     }
667 wakaba 1.1
668 wakaba 1.14 if ($eof) {
669     return $count;
670 wakaba 1.1 }
671 wakaba 1.14
672     my $error;
673     if ($self->{continue}) {
674     if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
675     length $self->{byte_buffer})) {
676     #
677     } else {
678     $error = 1;
679     }
680     $self->{continue} = 0;
681     } elsif (512 > length $self->{byte_buffer}) {
682     if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
683     length $self->{byte_buffer})) {
684     #
685     } else {
686     $eof = 1;
687 wakaba 1.1 }
688 wakaba 1.14 }
689    
690     unless ($error) {
691 wakaba 1.15 if (not $self->{bom_checked}) {
692     if (defined $self->{bom_pattern}) {
693     if ($self->{byte_buffer} =~ s/^$self->{bom_pattern}//) {
694     $self->{has_bom} = 1;
695     }
696     }
697     $self->{bom_checked} = 1;
698     }
699    
700 wakaba 1.14 my $string = Encode::decode ($self->{perl_encoding_name},
701     $self->{byte_buffer},
702     Encode::FB_QUIET ());
703     if (length $string) {
704     $self->{char_buffer} = \$string;
705     $self->{char_buffer_pos} = 0;
706     if (length $self->{byte_buffer}) {
707     $self->{continue} = 1;
708     }
709 wakaba 1.1 } else {
710 wakaba 1.14 if (length $self->{byte_buffer}) {
711     $error = 1;
712     } else {
713     ## NOTE: No further input.
714     redo A;
715     }
716 wakaba 1.1 }
717     }
718    
719 wakaba 1.14 if ($error) {
720     my $r = substr $self->{byte_buffer}, 0, 1, '';
721 wakaba 1.15 my $fallback;
722 wakaba 1.14 my $etype = 'illegal-octets-error';
723 wakaba 1.15 my %earg;
724     if ($self->{category}
725     & Message::Charset::Info::CHARSET_CATEGORY_EUCJP) {
726     if ($r =~ /^[\xA1-\xFE]/) {
727     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
728     $r .= $1;
729     $etype = 'unassigned-code-point-error';
730     }
731     } elsif ($r eq "\x8F") {
732     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE][\xA1-\xFE]?)//) {
733     $r .= $1;
734     $etype = 'unassigned-code-point-error' if length $1 == 2;
735     }
736     } elsif ($r eq "\x8E") {
737     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
738     $r .= $1;
739     $etype = 'unassigned-code-point-error';
740     }
741     } elsif ($r eq "\xA0" or $r eq "\xFF") {
742 wakaba 1.14 $etype = 'unassigned-code-point-error';
743     }
744 wakaba 1.15 } else {
745     $fallback = $self->{fallback}->{$r};
746     if (defined $fallback) {
747     ## NOTE: This is an HTML5 parse error.
748     $etype = 'fallback-char-error';
749     $earg{char} = \$fallback;
750     } elsif (exists $self->{fallback}->{$r}) {
751     ## NOTE: This is an HTML5 parse error. In addition, the octet
752     ## is not assigned with a character.
753     $etype = 'fallback-unassigned-error';
754 wakaba 1.14 }
755 wakaba 1.1 }
756 wakaba 1.15
757 wakaba 1.14 ## NOTE: Fixup line/column number by counting the number of
758     ## lines/columns in the string that is to be retuend by this
759     ## method call.
760     my $line_diff = 0;
761     my $col_diff = 0;
762     my $set_col;
763     for (my $i = 0; $i < $count; $i++) {
764     my $s = substr $_[1], $i - $count, 1;
765     if ($s eq "\x0D") {
766     $line_diff++;
767     $col_diff = 0;
768     $set_col = 1;
769     $i++ if substr ($_[1], $i - $count + 1, 1) eq "\x0A";
770     } elsif ($s eq "\x0A") {
771     $line_diff++;
772     $col_diff = 0;
773     $set_col = 1;
774     } else {
775     $col_diff++;
776     }
777     }
778     my $i = $self->{char_buffer_pos};
779     if ($count and substr (${$self->{char_buffer}}, -1, 1) eq "\x0D") {
780     if (substr (${$self->{char_buffer}}, $i, 1) eq "\x0A") {
781     $i++;
782     }
783 wakaba 1.1 }
784 wakaba 1.14 my $cb_length = length ${$self->{char_buffer}};
785     for (; $i < $cb_length; $i++) {
786     my $s = substr $_[1], $i, 1;
787     if ($s eq "\x0D") {
788     $line_diff++;
789     $col_diff = 0;
790     $set_col = 1;
791     $i++ if substr ($_[1], $i + 1, 1) eq "\x0A";
792     } elsif ($s eq "\x0A") {
793     $line_diff++;
794     $col_diff = 0;
795     $set_col = 1;
796     } else {
797     $col_diff++;
798     }
799 wakaba 1.1 }
800 wakaba 1.15 $self->{onerror}->($self, $etype, octets => \$r, %earg,
801 wakaba 1.14 level => $self->{level}->{$self->{error_level}->{$etype}},
802     line_diff => $line_diff,
803     ($set_col ? (column => 1) : ()),
804     column_diff => $col_diff);
805     ## NOTE: Error handler may modify |octets| parameter, which
806     ## would be returned as part of the output. Note that what
807     ## is returned would affect what |manakai_read_until| returns.
808 wakaba 1.15 ${$self->{char_buffer}} .= defined $fallback ? $fallback : $r;
809 wakaba 1.1 }
810    
811 wakaba 1.14 redo A;
812     } # A
813 wakaba 1.9 } # read
814    
815 wakaba 1.15 sub manakai_read_until ($$$;$) {
816     #my ($self, $scalar, $pattern, $offset) = @_;
817     my $self = $_[0];
818     my $s = '';
819     $self->read ($s, 255);
820     if ($s =~ /^(?>$_[2])+/) {
821     my $rem_length = (length $s) - $+[0];
822     if ($rem_length) {
823     if ($self->{char_buffer_pos} > $rem_length) {
824     $self->{char_buffer_pos} -= $rem_length;
825     } else {
826     substr (${$self->{char_buffer}}, 0, $self->{char_buffer_pos})
827     = substr ($s, $+[0]);
828     $self->{char_buffer_pos} = 0;
829     }
830     }
831     substr ($_[1], $_[3]) = substr ($s, $-[0], $+[0] - $-[0]);
832     return $+[0];
833     } elsif (length $s) {
834     if ($self->{char_buffer_pos} > length $s) {
835     $self->{char_buffer_pos} -= length $s;
836     } else {
837     substr (${$self->{char_buffer}}, 0, $self->{char_buffer_pos}) = $s;
838     $self->{char_buffer_pos} = 0;
839     }
840     }
841     return 0;
842     } # manakai_read_until
843    
844     sub has_bom ($) { $_[0]->{has_bom} }
845    
846     sub input_encoding ($) {
847     my $v = $_[0]->{input_encoding};
848     return $v if defined $v;
849    
850     my $uri = $_[0]->{charset};
851     if (defined $uri) {
852     return Whatpm::Charset::DecodeHandle->uri_to_name (xml => $uri);
853     }
854    
855     return undef;
856     } # input_encoding
857    
858     sub onerror ($;$) {
859     if (@_ > 1) {
860     $_[0]->{onerror} = $_[1];
861     }
862    
863     return $_[0]->{onerror};
864     } # onerror
865    
866     sub ungetc ($$) {
867     unshift @{$_[0]->{character_queue}}, chr int ($_[1] or 0);
868     } # ungetc
869    
870 wakaba 1.1 package Whatpm::Charset::DecodeHandle::ISO2022JP;
871     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
872    
873     sub getc ($) {
874     my $self = $_[0];
875     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
876    
877     my $r;
878     A: {
879     my $error;
880     if ($self->{continue}) {
881 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
882     length $self->{byte_buffer})) {
883 wakaba 1.1 #
884     } else {
885     $error = 1;
886     }
887     $self->{continue} = 0;
888     } elsif (512 > length $self->{byte_buffer}) {
889 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
890     length $self->{byte_buffer});
891 wakaba 1.1 }
892    
893     unless ($error) {
894     if ($self->{byte_buffer} =~ s/^\x1B(\x24[\x40\x42]|\x28[\x42\x4A])//) {
895     $self->{state} = {
896     "\x24\x40" => 'state_2440',
897     "\x24\x42" => 'state_2442',
898     "\x28\x42" => 'state_2842',
899     "\x28\x4A" => 'state_284A',
900     }->{$1};
901     redo A;
902     } elsif ($self->{state} eq 'state_2842') { # IRV
903     if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
904     push @{$self->{character_queue}}, split //, $1;
905     $r = shift @{$self->{character_queue}};
906     } else {
907     if (length $self->{byte_buffer}) {
908     $error = 1;
909     } else {
910     $r = undef;
911     }
912     }
913     } elsif ($self->{state} eq 'state_284A') { # 0201
914     if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
915     my $v = $1;
916     $v =~ tr/\x5C\x7E/\xA5\x{203E}/;
917     push @{$self->{character_queue}}, split //, $v;
918     $r = shift @{$self->{character_queue}};
919     } else {
920     if (length $self->{byte_buffer}) {
921     $error = 1;
922     } else {
923     $r = undef;
924     $self->{onerror}->($self, 'invalid-state-error',
925 wakaba 1.7 state => $self->{state},
926     level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}});
927 wakaba 1.1 }
928     }
929     } elsif ($self->{state} eq 'state_2442') { # 1983
930     my $v = Encode::decode ($self->{state_2442},
931     $self->{byte_buffer},
932     Encode::FB_QUIET ());
933     if (length $v) {
934     push @{$self->{character_queue}}, split //, $v;
935     $r = shift @{$self->{character_queue}};
936     } else {
937     if (length $self->{byte_buffer}) {
938     $error = 1;
939     } else {
940     $r = undef;
941     $self->{onerror}->($self, 'invalid-state-error',
942 wakaba 1.7 state => $self->{state},
943     level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}});
944 wakaba 1.1 }
945     }
946     } elsif ($self->{state} eq 'state_2440') { # 1978
947     my $v = Encode::decode ($self->{state_2440},
948     $self->{byte_buffer},
949     Encode::FB_QUIET ());
950     if (length $v) {
951     push @{$self->{character_queue}}, split //, $v;
952     $r = shift @{$self->{character_queue}};
953     } else {
954     if (length $self->{byte_buffer}) {
955     $error = 1;
956     } else {
957     $r = undef;
958     $self->{onerror}->($self, 'invalid-state-error',
959 wakaba 1.7 state => $self->{state},
960     level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}});
961 wakaba 1.1 }
962     }
963     } else {
964     $error = 1;
965     }
966     }
967    
968     if ($error) {
969     $r = substr $self->{byte_buffer}, 0, 1, '';
970     my $etype = 'illegal-octets-error';
971     if (($self->{state} eq 'state_2442' or
972     $self->{state} eq 'state_2440') and
973     $r =~ /^[\x21-\x7E]/ and
974     $self->{byte_buffer} =~ s/^([\x21-\x7E])//) {
975     $r .= $1;
976     $etype = 'unassigned-code-point-error';
977     } elsif ($r eq "\x1B" and
978     $self->{byte_buffer} =~ s/^\(H//) { # Old 0201
979     $r .= "(H";
980     $self->{state} = 'state_284A';
981     }
982 wakaba 1.7 $self->{onerror}->($self, $etype, octets => \$r,
983     level => $self->{level}->{$self->{error_level}->{$etype}});
984 wakaba 1.1 }
985     } # A
986    
987     return $r;
988     } # getc
989    
990 wakaba 1.9 ## TODO: This is not good for performance. Should be replaced
991     ## by read-centric implementation.
992     sub read ($$$;$) {
993     #my ($self, $scalar, $length, $offset) = @_;
994     my $length = $_[2];
995     my $r = '';
996     while ($length > 0) {
997     my $c = $_[0]->getc;
998     last unless defined $c;
999     $r .= $c;
1000     $length--;
1001     }
1002     substr ($_[1], $_[3]) = $r;
1003     ## NOTE: This would do different thing from what Perl's |read| do
1004     ## if $offset points beyond the end of the $scalar.
1005     return length $r;
1006     } # read
1007    
1008 wakaba 1.12 sub manakai_read_until ($$$;$) {
1009     #my ($self, $scalar, $pattern, $offset) = @_;
1010     my $self = $_[0];
1011 wakaba 1.11 my $c = $self->getc;
1012 wakaba 1.12 if ($c =~ /^$_[2]/) {
1013     substr ($_[1], $_[3]) = $c;
1014     return 1;
1015 wakaba 1.11 } elsif (defined $c) {
1016     $self->ungetc (ord $c);
1017 wakaba 1.12 return 0;
1018 wakaba 1.11 } else {
1019 wakaba 1.12 return 0;
1020 wakaba 1.11 }
1021 wakaba 1.12 } # manakai_read_until
1022 wakaba 1.11
1023 wakaba 1.1 package Whatpm::Charset::DecodeHandle::ShiftJIS;
1024     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
1025    
1026     sub getc ($) {
1027     my $self = $_[0];
1028     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
1029    
1030     my $error;
1031     if ($self->{continue}) {
1032 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
1033     length $self->{byte_buffer})) {
1034 wakaba 1.1 #
1035     } else {
1036     $error = 1;
1037     }
1038     $self->{continue} = 0;
1039     } elsif (512 > length $self->{byte_buffer}) {
1040 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
1041     length $self->{byte_buffer});
1042 wakaba 1.1 }
1043    
1044     my $r;
1045     unless ($error) {
1046     my $string = Encode::decode ($self->{perl_encoding_name},
1047     $self->{byte_buffer},
1048     Encode::FB_QUIET ());
1049     if (length $string) {
1050     push @{$self->{character_queue}}, split //, $string;
1051     $r = shift @{$self->{character_queue}};
1052     if (length $self->{byte_buffer}) {
1053     $self->{continue} = 1;
1054     }
1055     } else {
1056     if (length $self->{byte_buffer}) {
1057     $error = 1;
1058     } else {
1059     $r = undef;
1060     }
1061     }
1062     }
1063    
1064     if ($error) {
1065     $r = substr $self->{byte_buffer}, 0, 1, '';
1066     my $etype = 'illegal-octets-error';
1067 wakaba 1.5 if ($r =~ /^[\x81-\x9F\xE0-\xFC]/) {
1068 wakaba 1.1 if ($self->{byte_buffer} =~ s/(.)//s) {
1069     $r .= $1; # not limited to \x40-\xFC - \x7F
1070     $etype = 'unassigned-code-point-error';
1071     }
1072 wakaba 1.5 ## NOTE: Range [\xF0-\xFC] is unassigned and may be used as a single-byte
1073     ## character or as the first-byte of a double-byte character according
1074     ## to JIS X 0208:1997 Appendix 1. However, the current practice is
1075     ## use the range as the first-byte of double-byte characters.
1076     } elsif ($r =~ /^[\x80\xA0\xFD-\xFF]/) {
1077 wakaba 1.1 $etype = 'unassigned-code-point-error';
1078     }
1079 wakaba 1.7 $self->{onerror}->($self, $etype, octets => \$r,
1080     level => $self->{level}->{$self->{error_level}->{$etype}});
1081 wakaba 1.1 }
1082    
1083     return $r;
1084     } # getc
1085    
1086 wakaba 1.9 ## TODO: This is not good for performance. Should be replaced
1087     ## by read-centric implementation.
1088     sub read ($$$;$) {
1089     #my ($self, $scalar, $length, $offset) = @_;
1090     my $length = $_[2];
1091     my $r = '';
1092     while ($length > 0) {
1093     my $c = $_[0]->getc;
1094     last unless defined $c;
1095     $r .= $c;
1096     $length--;
1097     }
1098     substr ($_[1], $_[3]) = $r;
1099     ## NOTE: This would do different thing from what Perl's |read| do
1100     ## if $offset points beyond the end of the $scalar.
1101 wakaba 1.11
1102 wakaba 1.9 return length $r;
1103     } # read
1104    
1105 wakaba 1.12 sub manakai_read_until ($$$;$) {
1106     #my ($self, $scalar, $pattern, $offset) = @_;
1107     my $self = $_[0];
1108 wakaba 1.11 my $c = $self->getc;
1109 wakaba 1.12 if ($c =~ /^$_[2]/) {
1110     substr ($_[1], $_[3]) = $c;
1111     return 1;
1112 wakaba 1.11 } elsif (defined $c) {
1113     $self->ungetc (ord $c);
1114 wakaba 1.12 return 0;
1115 wakaba 1.11 } else {
1116 wakaba 1.12 return 0;
1117 wakaba 1.11 }
1118 wakaba 1.12 } # manakai_read_until
1119 wakaba 1.11
1120 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us-ascii'} =
1121     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us'} =
1122     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso646-us'} =
1123     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:cp367'} =
1124     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ibm367'} =
1125     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1986'} =
1126     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1968'} =
1127     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-ir-6'} =
1128     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:csascii'} =
1129     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_646.irv:1991'} =
1130     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ascii'} = {ascii8 =>
1131     '1',
1132     is_block_safe =>
1133     '1',
1134     ietf_name =>
1135     ['ansi_x3.4-1968',
1136     'ansi_x3.4-1986',
1137     'ascii',
1138     'cp367',
1139     'csascii',
1140     'ibm367',
1141     'iso-ir-6',
1142     'iso646-us',
1143     'iso_646.irv:1991',
1144     'us',
1145     'us-ascii',
1146     'us-ascii'],
1147     mime_name =>
1148     'us-ascii',
1149     perl_name =>
1150     ['ascii',
1151     'iso-646-us',
1152     'us-ascii'],
1153     utf8_encoding_scheme =>
1154     '1',
1155     'uri',
1156     {'urn:x-suika-fam-cx:charset:ansi_x3.4-1968',
1157     '1',
1158     'urn:x-suika-fam-cx:charset:ansi_x3.4-1986',
1159     '1',
1160     'urn:x-suika-fam-cx:charset:ascii',
1161     '1',
1162     'urn:x-suika-fam-cx:charset:cp367',
1163     '1',
1164     'urn:x-suika-fam-cx:charset:csascii',
1165     '1',
1166     'urn:x-suika-fam-cx:charset:ibm367',
1167     '1',
1168     'urn:x-suika-fam-cx:charset:iso-ir-6',
1169     '1',
1170     'urn:x-suika-fam-cx:charset:iso646-us',
1171     '1',
1172     'urn:x-suika-fam-cx:charset:iso_646.irv:1991',
1173     '1',
1174     'urn:x-suika-fam-cx:charset:us',
1175     '1',
1176     'urn:x-suika-fam-cx:charset:us-ascii',
1177     '1'},
1178 wakaba 1.2 };
1179    
1180 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl'} = {perl_name =>
1181     ['ascii-ctrl'],
1182     'uri',
1183     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl',
1184     '1'}};
1185     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null'} = {perl_name =>
1186     ['null'],
1187     'uri',
1188     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null',
1189     '1'}};
1190     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8'} = {ascii8 =>
1191     '1',
1192     bom_allowed =>
1193     '1',
1194     no_bom_variant =>
1195     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1196     utf8_encoding_scheme =>
1197     '1',
1198     'uri',
1199     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8',
1200     '1'},
1201 wakaba 1.2 xml_name => 'UTF-8',
1202     };
1203    
1204 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279'} = {ascii8 =>
1205     '1',
1206     bom_allowed =>
1207     '1',
1208     no_bom_variant =>
1209     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1210     utf8_encoding_scheme =>
1211     '1',
1212     'uri',
1213     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279',
1214     '1'}};
1215     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8'} = {
1216     ascii8 => 1,
1217     is_block_safe =>
1218     '1',
1219     perl_name =>
1220     ['utf-8'],
1221     utf8_encoding_scheme =>
1222     '1',
1223     'uri',
1224     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
1225     '1'}};
1226     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-8'} = {
1227     ascii8 => 1,
1228     bom_allowed =>
1229     '1',
1230     no_bom_variant =>
1231     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
1232     ietf_name =>
1233     ['utf-8'],
1234     mime_name =>
1235     'utf-8',
1236     utf8_encoding_scheme =>
1237     '1',
1238     'uri',
1239     {'urn:x-suika-fam-cx:charset:utf-8',
1240     '1'},
1241 wakaba 1.2 };
1242    
1243 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8'} = {ascii8 =>
1244     '1',
1245     is_block_safe =>
1246     '1',
1247     perl_name =>
1248     ['utf8'],
1249     utf8_encoding_scheme =>
1250     '1',
1251     'uri',
1252     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1253     '1'}};
1254     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16'} = {
1255     ascii16 => 1,
1256     bom_allowed =>
1257     '1',
1258     bom_required =>
1259     '1',
1260     no_bom_variant =>
1261     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1262     no_bom_variant16be =>
1263     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1264     no_bom_variant16le =>
1265     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1266     perl_name =>
1267     ['utf-16'],
1268     'uri',
1269     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16',
1270     '1'},
1271 wakaba 1.2 xml_name => 'UTF-16',
1272     };
1273    
1274 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16'} = {
1275     ascii16 => 1,
1276     bom_allowed =>
1277     '1',
1278     no_bom_variant =>
1279     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1280     no_bom_variant16be =>
1281     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1282     no_bom_variant16le =>
1283     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1284     ietf_name =>
1285     ['utf-16'],
1286     mime_name =>
1287     'utf-16',
1288     'uri',
1289     {'urn:x-suika-fam-cx:charset:utf-16',
1290     '1'},
1291 wakaba 1.2 };
1292    
1293 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16be'} = {
1294     ascii16 => 1,
1295     ascii16be => 1,
1296     bom_allowed =>
1297     '1',
1298     no_bom_variant =>
1299     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1300     no_bom_variant16be =>
1301     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1302     ietf_name =>
1303     ['utf-16be'],
1304     mime_name =>
1305     'utf-16be',
1306     'uri',
1307     {'urn:x-suika-fam-cx:charset:utf-16be',
1308     '1'},
1309 wakaba 1.2 };
1310    
1311 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16le'} = {
1312     ascii16 => 1,
1313     ascii16le => 1,
1314     bom_allowed =>
1315     '1',
1316     no_bom_variant =>
1317     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1318     no_bom_variant16le =>
1319     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1320     ietf_name =>
1321     ['utf-16le'],
1322     mime_name =>
1323     'utf-16le',
1324     'uri',
1325     {'urn:x-suika-fam-cx:charset:utf-16le',
1326     '1'},
1327 wakaba 1.2 };
1328    
1329 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be'} = {
1330     ascii16 => 1,
1331     ascii16be => 1,
1332     is_block_safe =>
1333     '1',
1334     perl_name =>
1335     ['utf-16be'],
1336     'uri',
1337     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1338     '1'}};
1339     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le'} = {
1340     ascii16 => 1,
1341     ascii16le => 1,
1342     is_block_safe =>
1343     '1',
1344     perl_name =>
1345     ['utf-16le'],
1346     'uri',
1347     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1348     '1'}};
1349     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-2'} = {
1350     ascii16 => 1,
1351     bom_allowed =>
1352     '1',
1353     no_bom_variant =>
1354     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1355     no_bom_variant16be =>
1356     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1357     no_bom_variant16le =>
1358     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1359     ietf_name =>
1360     ['csunicode',
1361     'iso-10646-ucs-2'],
1362     mime_name =>
1363     'iso-10646-ucs-2',
1364     'uri',
1365     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2',
1366     '1',
1367     'urn:x-suika-fam-cx:charset:iso-10646-ucs-2',
1368     '1'},
1369 wakaba 1.2 xml_name => 'ISO-10646-UCS-2',
1370     };
1371    
1372 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be'} = {
1373     ascii16 => 1,
1374     ascii16be => 1,
1375     is_block_safe =>
1376     '1',
1377     perl_name =>
1378     ['ucs-2be'],
1379     'uri',
1380     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1381     '1'}};
1382     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le'} = {
1383     ascii16 => 1,
1384     ascii16le => 1,
1385     is_block_safe =>
1386     '1',
1387     perl_name =>
1388     ['ucs-2le'],
1389     'uri',
1390     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1391     '1'}};
1392     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-4'} = {
1393     ascii32 => 1,
1394     bom_allowed =>
1395     '1',
1396     no_bom_variant =>
1397     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1398     no_bom_variant32endian1234 =>
1399     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1400     no_bom_variant32endian4321 =>
1401     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1402     ietf_name =>
1403     ['csucs4',
1404     'iso-10646-ucs-4'],
1405     mime_name =>
1406     'iso-10646-ucs-4',
1407     'uri',
1408     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4',
1409     '1',
1410     'urn:x-suika-fam-cx:charset:iso-10646-ucs-4',
1411     '1'},
1412 wakaba 1.2 xml_name => 'ISO-10646-UCS-4',
1413     };
1414    
1415 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be'} = {
1416     ascii32 => 1,
1417     ascii32endian1234 => 1,
1418     is_block_safe =>
1419     '1',
1420     perl_name =>
1421     ['ucs-4be',
1422     'utf-32be'],
1423     'uri',
1424     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1425     '1'}};
1426     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le'} = {
1427     ascii32 => 1,
1428     ascii32endian4321 => 1,
1429     is_block_safe =>
1430     '1',
1431     perl_name =>
1432     ['ucs-4le',
1433     'utf-32le'],
1434     'uri',
1435     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1436     '1'}};
1437     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_8859-1:1987'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1'} = {ascii8 =>
1438     '1',
1439     is_block_safe =>
1440     '1',
1441     ietf_name =>
1442     ['cp819',
1443     'csisolatin1',
1444     'ibm819',
1445     'iso-8859-1',
1446     'iso-8859-1',
1447     'iso-ir-100',
1448     'iso_8859-1',
1449     'iso_8859-1:1987',
1450     'l1',
1451     'latin1'],
1452     mime_name =>
1453     'iso-8859-1',
1454     perl_name =>
1455     ['iso-8859-1',
1456     'latin1'],
1457     'uri',
1458     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1',
1459     '1',
1460     'urn:x-suika-fam-cx:charset:iso_8859-1:1987',
1461     '1'},
1462 wakaba 1.2 xml_name => 'ISO-8859-1',
1463     };
1464    
1465 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2'} = {ascii8 =>
1466     '1',
1467     is_block_safe =>
1468     '1',
1469     'uri',
1470     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2',
1471     '1'},
1472 wakaba 1.2 xml_name => 'ISO-8859-2',
1473     };
1474    
1475 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3'} = {ascii8 =>
1476     '1',
1477     is_block_safe =>
1478     '1',
1479     'uri',
1480     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3',
1481     '1'},
1482 wakaba 1.2 xml_name => 'ISO-8859-3',
1483     };
1484    
1485 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4'} = {ascii8 =>
1486     '1',
1487     is_block_safe =>
1488     '1',
1489     'uri',
1490     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4',
1491     '1'},
1492 wakaba 1.2 xml_name => 'ISO-8859-4',
1493     };
1494    
1495 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5'} = {ascii8 =>
1496     '1',
1497     is_block_safe =>
1498     '1',
1499     'uri',
1500     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5',
1501     '1'},
1502 wakaba 1.2 xml_name => 'ISO-8859-5',
1503     };
1504    
1505 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6'} = {ascii8 =>
1506     '1',
1507     is_block_safe =>
1508     '1',
1509     'uri',
1510     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6',
1511     '1'},
1512 wakaba 1.2 xml_name => 'ISO-8859-6',
1513     };
1514    
1515 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7'} = {ascii8 =>
1516     '1',
1517     is_block_safe =>
1518     '1',
1519     'uri',
1520     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7',
1521     '1'},
1522 wakaba 1.2 xml_name => 'ISO-8859-7',
1523     };
1524    
1525 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8'} = {ascii8 =>
1526     '1',
1527     is_block_safe =>
1528     '1',
1529     'uri',
1530     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8',
1531     '1'},
1532 wakaba 1.2 xml_name => 'ISO-8859-8',
1533     };
1534    
1535 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9'} = {ascii8 =>
1536     '1',
1537     is_block_safe =>
1538     '1',
1539     'uri',
1540     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9',
1541     '1'},
1542 wakaba 1.2 xml_name => 'ISO-8859-9',
1543     };
1544    
1545 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10'} = {ascii8 =>
1546     '1',
1547     is_block_safe =>
1548     '1',
1549     'uri',
1550     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10',
1551     '1'},
1552 wakaba 1.2 xml_name => 'ISO-8859-10',
1553     };
1554    
1555 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11'} = {ascii8 =>
1556     '1',
1557     is_block_safe =>
1558     '1',
1559     'uri',
1560     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11',
1561     '1'},
1562 wakaba 1.2 xml_name => 'ISO-8859-11',
1563     };
1564    
1565 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13'} = {ascii8 =>
1566     '1',
1567     is_block_safe =>
1568     '1',
1569     'uri',
1570     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13',
1571     '1'},
1572 wakaba 1.2 xml_name => 'ISO-8859-13',
1573     };
1574    
1575 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14'} = {ascii8 =>
1576     '1',
1577     is_block_safe =>
1578     '1',
1579     'uri',
1580     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14',
1581     '1'},
1582 wakaba 1.2 xml_name => 'ISO-8859-14',
1583     };
1584    
1585 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15'} = {ascii8 =>
1586     '1',
1587     is_block_safe =>
1588     '1',
1589     'uri',
1590     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15',
1591     '1'},
1592 wakaba 1.2 xml_name => 'ISO-8859-15',
1593     };
1594    
1595 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16'} = {ascii8 =>
1596     '1',
1597     is_block_safe =>
1598     '1',
1599     'uri',
1600     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16',
1601     '1'},
1602 wakaba 1.2 xml_name => 'ISO-8859-16',
1603     };
1604    
1605 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp'} = {ascii8 =>
1606     '1',
1607     'uri',
1608     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp',
1609     '1'},
1610 wakaba 1.2 xml_name => 'ISO-2022-JP',
1611     };
1612    
1613 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-2022-jp'} = {ascii8 =>
1614     '1',
1615     ietf_name =>
1616     ['csiso2022jp',
1617     'iso-2022-jp',
1618     'iso-2022-jp'],
1619     mime_name =>
1620     'iso-2022-jp',
1621     'uri',
1622     {'urn:x-suika-fam-cx:charset:iso-2022-jp',
1623     '1'},
1624 wakaba 1.2 };
1625    
1626 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp'} = {ascii8 =>
1627     '1',
1628     perl_name =>
1629     ['iso-2022-jp'],
1630     'uri',
1631     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp',
1632     '1'}};
1633     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:shift_jis'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis'} = {ascii8 =>
1634     '1',
1635     is_block_safe =>
1636     '1',
1637     ietf_name =>
1638     ['csshiftjis',
1639     'ms_kanji',
1640     'shift_jis',
1641     'shift_jis'],
1642     mime_name =>
1643     'shift_jis',
1644     perl_name =>
1645     ['shift-jis-1997'],
1646     'uri',
1647     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis',
1648     '1',
1649     'urn:x-suika-fam-cx:charset:shift_jis',
1650     '1'},
1651 wakaba 1.2 xml_name => 'Shift_JIS',
1652     };
1653    
1654 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis'} = {ascii8 =>
1655     '1',
1656     is_block_safe =>
1657     '1',
1658     perl_name =>
1659     ['shiftjis',
1660     'sjis'],
1661     'uri',
1662     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis',
1663     '1'}};
1664     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:euc-jp'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese'} = {ascii8 =>
1665     '1',
1666     is_block_safe =>
1667     '1',
1668     ietf_name =>
1669     ['cseucpkdfmtjapanese',
1670     'euc-jp',
1671     'euc-jp',
1672     'extended_unix_code_packed_format_for_japanese'],
1673     mime_name =>
1674     'euc-jp',
1675     perl_name =>
1676     ['euc-jp-1997'],
1677     'uri',
1678     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp',
1679     '1',
1680     'urn:x-suika-fam-cx:charset:euc-jp',
1681     '1',
1682     'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese',
1683     '1'},
1684 wakaba 1.2 xml_name => 'EUC-JP',
1685     };
1686    
1687 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp'} = {ascii8 =>
1688     '1',
1689     is_block_safe =>
1690     '1',
1691     perl_name =>
1692     ['euc-jp',
1693     'ujis'],
1694     'uri',
1695     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp',
1696     '1'}};
1697    
1698     1;
1699 wakaba 1.15 ## $Date: 2008/09/14 06:32:49 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24