/[suikacvs]/markup/html/whatpm/Whatpm/Charset/DecodeHandle.pm
Suika

Contents of /markup/html/whatpm/Whatpm/Charset/DecodeHandle.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (hide annotations) (download)
Sun Sep 14 01:51:08 2008 UTC (16 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.10: +133 -7 lines
++ whatpm/Whatpm/ChangeLog	14 Sep 2008 01:47:27 -0000
2008-09-14  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm.src (parse_char_string): Use newly created
	|Whatpm::Charset::DecodeHandle::CharString| instead of Perl's
	standard feature to |open| a string as a filehandle,
	since Perl's string filehandle seems not supporting |ungetc|
	method correctly.
	(parse_char_stream): Define |{getc_until}| method.
	(DATA_STATE): Experimental support for |getc_until| feature.

++ whatpm/Whatpm/Charset/ChangeLog	14 Sep 2008 01:50:52 -0000
2008-09-14  Wakaba  <wakaba@suika.fam.cx>

	* DecodeHandle.pm (CharString): New class.
	(Encode read): Don't remove read string from |{char_buffer}|,
	to decease the number of string operations and to enable
	|manakai_getc_until| ungetc'ing without any string operation.
	(manakai_getc_until): New method.

	* UnicodeChecker.pm (getc): Don't |read| more than one
	character, to prevent characters being bufferred
	such that mixture of |getc| and |manakai_getc_until|
	calls does not make the result broken.

1 wakaba 1.1 package Whatpm::Charset::DecodeHandle;
2     use strict;
3    
4 wakaba 1.9 ## NOTE: |Message::Charset::Info| uses this module without calling
5     ## the constructor.
6    
7 wakaba 1.1 my $XML_AUTO_CHARSET = q<http://suika.fam.cx/www/2006/03/xml-entity/>;
8     my $IANA_CHARSET = q<urn:x-suika-fam-cx:charset:>;
9     my $PERL_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.>;
10     my $XML_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.>;
11    
12     ## ->create_decode_handle ($charset_uri, $byte_stream, $onerror)
13     sub create_decode_handle ($$$;$) {
14     my $csdef = $Whatpm::Charset::CharsetDef->{$_[1]};
15     my $obj = {
16 wakaba 1.9 char_buffer => \(my $s = ''),
17 wakaba 1.11 char_buffer_pos => 0,
18 wakaba 1.1 character_queue => [],
19     filehandle => $_[2],
20     charset => $_[1],
21     byte_buffer => '',
22     onerror => $_[3] || sub {},
23     };
24     if ($csdef->{uri}->{$XML_AUTO_CHARSET} or
25     $obj->{charset} eq $XML_AUTO_CHARSET) {
26     my $b = ''; # UTF-8 w/o BOM
27     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
28 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
29 wakaba 1.1 if (read $obj->{filehandle}, $b, 256) {
30     no warnings "substr";
31     no warnings "uninitialized";
32     if (substr ($b, 0, 1) eq "<") {
33     if (substr ($b, 1, 1) eq "?") { # ASCII8
34     if ($b =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
35     encoding\s*=\s*["']([^"']*)/x) {
36 wakaba 1.2 $obj->{input_encoding} = $1;
37 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
38     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
39     if (not $csdef->{ascii8} or $csdef->{bom_required}) {
40     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
41     charset_uri => $uri,
42     charset_name => $obj->{input_encoding});
43     }
44     } else {
45     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
46 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
47 wakaba 1.1 }
48     if (defined $csdef->{no_bom_variant}) {
49     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
50     }
51     } elsif (substr ($b, 1, 1) eq "\x00") {
52     if (substr ($b, 2, 2) eq "?\x00") { # ASCII16LE
53     my $c = $b; $c =~ tr/\x00//d;
54     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
55     encoding\s*=\s*["']([^"']*)/x) {
56 wakaba 1.2 $obj->{input_encoding} = $1;
57 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
58     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
59     if (not $csdef->{ascii16} or $csdef->{ascii16be} or
60     $csdef->{bom_required}) {
61     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
62     charset_uri => $uri,
63     charset_name => $obj->{input_encoding});
64     }
65     } else {
66     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
67 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
68 wakaba 1.1 }
69     if (defined $csdef->{no_bom_variant16le}) {
70     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
71     }
72     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
73     my $c = $b; $c =~ tr/\x00//d;
74     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
75     encoding\s*=\s*["']([^"']*)/x) {
76 wakaba 1.2 $obj->{input_encoding} = $1;
77 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
78     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
79     if (not $csdef->{ascii32} or
80     $csdef->{ascii32endian1234} or
81     $csdef->{ascii32endian2143} or
82     $csdef->{ascii32endian3412} or
83     $csdef->{bom_required}) {
84     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
85     charset_uri => $uri,
86     charset_name => $obj->{input_encoding});
87     }
88     } else {
89     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
90 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
91 wakaba 1.1 }
92     if (defined $csdef->{no_bom_variant32endian4321}) {
93     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
94     }
95     }
96     }
97     } elsif (substr ($b, 0, 3) eq "\xEF\xBB\xBF") { # UTF8
98     $obj->{has_bom} = 1;
99     substr ($b, 0, 3) = '';
100     my $c = $b;
101     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
102     encoding\s*=\s*["']([^"']*)/x) {
103 wakaba 1.2 $obj->{input_encoding} = $1;
104 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
105     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
106     if (not $csdef->{utf8_encoding_scheme} or
107     not $csdef->{bom_allowed}) {
108     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
109     charset_uri => $uri,
110     charset_name => $obj->{input_encoding});
111     }
112     } else {
113     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
114 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
115 wakaba 1.1 }
116     if (defined $csdef->{no_bom_variant}) {
117     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
118     }
119     } elsif (substr ($b, 0, 2) eq "\x00<") {
120     if (substr ($b, 2, 2) eq "\x00?") { # ASCII16BE
121     my $c = $b; $c =~ tr/\x00//d;
122     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
123     encoding\s*=\s*["']([^"']*)/x) {
124 wakaba 1.2 $obj->{input_encoding} = $1;
125 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
126     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
127     if (not $csdef->{ascii16} or $csdef->{ascii16le} or
128     $csdef->{bom_required}) {
129     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
130     charset_uri => $uri,
131     charset_name => $obj->{input_encoding});
132     }
133     } else {
134     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
135 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
136 wakaba 1.1 }
137     if (defined $csdef->{no_bom_variant16be}) {
138     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
139     }
140     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
141     my $c = $b; $c =~ tr/\x00//d;
142     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
143     encoding\s*=\s*["']([^"']*)/x) {
144 wakaba 1.2 $obj->{input_encoding} = $1;
145 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
146     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
147     if (not $csdef->{ascii32} or
148     $csdef->{ascii32endian1234} or
149     $csdef->{ascii32endian2143} or
150     $csdef->{ascii32endian4321} or
151     $csdef->{bom_required}) {
152     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
153     charset_uri => $uri,
154     charset_name => $obj->{input_encoding});
155     }
156     } else {
157     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
158 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
159 wakaba 1.1 }
160     if (defined $csdef->{no_bom_variant32endian3412}) {
161     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
162     }
163     }
164     } elsif (substr ($b, 0, 2) eq "\xFE\xFF") {
165     if (substr ($b, 2, 2) eq "\x00<") { # ASCII16BE
166     $obj->{has_bom} = 1;
167     substr ($b, 0, 2) = '';
168     my $c = $b; $c =~ tr/\x00//d;
169     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
170     encoding\s*=\s*["']([^"']*)/x) {
171 wakaba 1.2 $obj->{input_encoding} = $1;
172 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
173     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
174     if (not $csdef->{ascii16} or
175     $csdef->{ascii16le} or
176     not $csdef->{bom_allowed}) {
177     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
178     charset_uri => $uri,
179     charset_name => $obj->{input_encoding});
180     }
181     } else {
182     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
183 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
184 wakaba 1.1 }
185     if (defined $csdef->{no_bom_variant16be}) {
186     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
187     }
188     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
189     $obj->{has_bom} = 1;
190     substr ($b, 0, 4) = '';
191     my $c = $b; $c =~ tr/\x00//d;
192     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
193     encoding\s*=\s*["']([^"']*)/x) {
194 wakaba 1.2 $obj->{input_encoding} = $1;
195 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
196     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
197     if (not $csdef->{ascii32} or
198     $csdef->{ascii32endian1234} or
199     $csdef->{ascii32endian2143} or
200     $csdef->{ascii32endian4321} or
201     not $csdef->{bom_allowed}) {
202     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
203     charset_uri => $uri,
204     charset_name => $obj->{input_encoding});
205     }
206     } else {
207     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
208 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
209 wakaba 1.1 $obj->{byte_buffer} .= "\x00\x00";
210     }
211     if (defined $csdef->{no_bom_variant32endian3412}) {
212     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
213     }
214     } else {
215     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
216 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
217 wakaba 1.1 substr ($b, 0, 2) = '';
218     $obj->{has_bom} = 1;
219     }
220     } elsif (substr ($b, 0, 2) eq "\xFF\xFE") {
221     if (substr ($b, 2, 2) eq "<\x00") { # ASCII16LE
222     $obj->{has_bom} = 1;
223     substr ($b, 0, 2) = '';
224     my $c = $b; $c =~ tr/\x00//d;
225     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
226     encoding\s*=\s*["']([^"']*)/x) {
227 wakaba 1.2 $obj->{input_encoding} = $1;
228 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
229     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
230     if (not $csdef->{ascii16} or
231     $csdef->{ascii16be} or
232     not $csdef->{bom_allowed}) {
233     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
234     charset_uri => $uri,
235     charset_name => $obj->{input_encoding});
236     }
237     } else {
238     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
239 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
240 wakaba 1.1 }
241     if (defined $csdef->{no_bom_variant16le}) {
242     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
243     }
244     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
245     $obj->{has_bom} = 1;
246     substr ($b, 0, 4) = '';
247     my $c = $b; $c =~ tr/\x00//d;
248     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
249     encoding\s*=\s*["']([^"']*)/x) {
250 wakaba 1.2 $obj->{input_encoding} = $1;
251 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
252     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
253     if (not $csdef->{ascii32} or
254     $csdef->{ascii32endian1234} or
255     $csdef->{ascii32endian2143} or
256     $csdef->{ascii32endian3412} or
257     not $csdef->{bom_allowed}) {
258     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
259     charset_uri => $uri,
260     charset_name => $obj->{input_encoding});
261     }
262     } else {
263     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
264 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
265 wakaba 1.1 $obj->{byte_buffer} .= "\x00\x00";
266     }
267     if (defined $csdef->{no_bom_variant32endian4321}) {
268     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
269     }
270     } else {
271     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
272 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
273 wakaba 1.1 substr ($b, 0, 2) = '';
274     $obj->{has_bom} = 1;
275     }
276     } elsif (substr ($b, 0, 2) eq "\x00\x00") {
277     if (substr ($b, 2, 2) eq "\x00<") { # ASCII32Endian1234
278     my $c = $b; $c =~ tr/\x00//d;
279     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
280     encoding\s*=\s*["']([^"']*)/x) {
281 wakaba 1.2 $obj->{input_encoding} = $1;
282 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
283     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
284     if (not $csdef->{ascii32} or
285     $csdef->{ascii32endian2143} or
286     $csdef->{ascii32endian3412} or
287     $csdef->{ascii32endian4321} or
288     $csdef->{bom_required}) {
289     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
290     charset_uri => $uri,
291     charset_name => $obj->{input_encoding});
292     }
293     } else {
294     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
295 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
296 wakaba 1.1 }
297     if (defined $csdef->{no_bom_variant32endian1234}) {
298     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
299     }
300     } elsif (substr ($b, 2, 2) eq "<\x00") { # ASCII32Endian2143
301     my $c = $b; $c =~ tr/\x00//d;
302     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
303     encoding\s*=\s*["']([^"']*)/x) {
304 wakaba 1.2 $obj->{input_encoding} = $1;
305 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
306     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
307     if (not $csdef->{ascii32} or
308     $csdef->{ascii32endian1234} or
309     $csdef->{ascii32endian3412} or
310     $csdef->{ascii32endian4321} or
311     $csdef->{bom_required}) {
312     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
313     charset_uri => $uri,
314     charset_name => $obj->{input_encoding});
315     }
316     } else {
317     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
318 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
319 wakaba 1.1 }
320     if (defined $csdef->{no_bom_variant32endian2143}) {
321     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
322     }
323     } elsif (substr ($b, 2, 2) eq "\xFE\xFF") { # ASCII32Endian1234
324     $obj->{has_bom} = 1;
325     substr ($b, 0, 4) = '';
326     my $c = $b; $c =~ tr/\x00//d;
327     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
328     encoding\s*=\s*["']([^"']*)/x) {
329 wakaba 1.2 $obj->{input_encoding} = $1;
330 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
331     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
332     if (not $csdef->{ascii32} or
333     $csdef->{ascii32endian2143} or
334     $csdef->{ascii32endian3412} or
335     $csdef->{ascii32endian4321} or
336     $csdef->{bom_required}) {
337     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
338     charset_uri => $uri,
339     charset_name => $obj->{input_encoding});
340     }
341     } else {
342     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
343 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
344 wakaba 1.1 $obj->{has_bom} = 0;
345     $obj->{byte_buffer} .= "\x00\x00\xFE\xFF";
346     }
347     if (defined $csdef->{no_bom_variant32endian1234}) {
348     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
349     }
350     } elsif (substr ($b, 2, 2) eq "\xFF\xFE") { # ASCII32Endian2143
351     $obj->{has_bom} = 1;
352     substr ($b, 0, 4) = '';
353     my $c = $b; $c =~ tr/\x00//d;
354     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
355     encoding\s*=\s*["']([^"']*)/x) {
356 wakaba 1.2 $obj->{input_encoding} = $1;
357 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
358     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
359     if (not $csdef->{ascii32} or
360     $csdef->{ascii32endian1234} or
361     $csdef->{ascii32endian3412} or
362     $csdef->{ascii32endian4321} or
363     $csdef->{bom_required}) {
364     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
365     charset_uri => $uri,
366     charset_name => $obj->{input_encoding});
367     }
368     } else {
369     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
370 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
371 wakaba 1.1 $obj->{has_bom} = 0;
372     $obj->{byte_buffer} .= "\x00\x00\xFF\xFE";
373     }
374     if (defined $csdef->{no_bom_variant32endian2143}) {
375     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
376     }
377     }
378     # \x4C\x6F\xA7\x94 EBCDIC
379     } # buffer
380     $obj->{byte_buffer} .= $b;
381     } # read
382     } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-8'}) {
383     ## BOM is optional.
384     my $b = '';
385     if (read $obj->{filehandle}, $b, 3) {
386     if ($b eq "\xEF\xBB\xBF") {
387     $obj->{has_bom} = 1;
388     } else {
389     $obj->{byte_buffer} .= $b;
390     }
391     }
392     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; # UTF-8 w/o BOM
393     } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-16'}) {
394     ## BOM is mandated.
395     my $b = '';
396     if (read $obj->{filehandle}, $b, 2) {
397     if ($b eq "\xFE\xFF") {
398     $obj->{has_bom} = 1; # UTF-16BE w/o BOM
399     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
400     } elsif ($b eq "\xFF\xFE") {
401     $obj->{has_bom} = 1; # UTF-16LE w/o BOM
402     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
403     } else {
404     $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
405     $obj->{has_bom} = 0;
406     $obj->{byte_buffer} .= $b; # UTF-16BE w/o BOM
407     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
408     }
409     } else {
410     $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
411     $obj->{has_bom} = 0; # UTF-16BE w/o BOM
412     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
413     }
414     }
415    
416     if ($csdef->{uri}->{$XML_CHARSET.'iso-2022-jp'}) {
417     $obj->{state_2440} = 'gl-jis-1997-swapped';
418     $obj->{state_2442} = 'gl-jis-1997';
419     $obj->{state} = 'state_2842';
420     require Encode::GLJIS1997Swapped;
421     require Encode::GLJIS1997;
422     if (Encode::find_encoding ($obj->{state_2440}) and
423     Encode::find_encoding ($obj->{state_2442})) {
424     return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
425     }
426     } elsif ($csdef->{uri}->{$IANA_CHARSET.'iso-2022-jp'}) {
427     $obj->{state_2440} = 'gl-jis-1978';
428     $obj->{state_2442} = 'gl-jis-1983';
429     $obj->{state} = 'state_2842';
430     require Encode::GLJIS1978;
431     require Encode::GLJIS1983;
432     if (Encode::find_encoding ($obj->{state_2440}) and
433     Encode::find_encoding ($obj->{state_2442})) {
434     return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
435     }
436     } elsif (defined $csdef->{perl_name}->[0]) {
437     if ($csdef->{uri}->{$XML_CHARSET.'euc-jp'} or
438     $csdef->{uri}->{$IANA_CHARSET.'euc-jp'}) {
439     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
440     require Encode::EUCJP1997;
441     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
442     return bless $obj, 'Whatpm::Charset::DecodeHandle::EUCJP';
443     }
444     } elsif ($csdef->{uri}->{$XML_CHARSET.'shift_jis'} or
445     $csdef->{uri}->{$IANA_CHARSET.'shift_jis'}) {
446     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
447     require Encode::ShiftJIS1997;
448     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
449     return bless $obj, 'Whatpm::Charset::DecodeHandle::ShiftJIS';
450     }
451     } elsif ($csdef->{is_block_safe}) {
452     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
453     require Encode;
454     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
455     return bless $obj, 'Whatpm::Charset::DecodeHandle::Encode';
456     }
457     }
458     }
459    
460     $obj->{onerror}->(undef, 'charset-not-supported-error',
461     charset_uri => $obj->{charset});
462     return undef;
463     } # create_decode_handle
464    
465     sub name_to_uri ($$$) {
466     my $domain = $_[1];
467     my $name = lc $_[2];
468    
469     if ($domain eq 'ietf') {
470     return $IANA_CHARSET . $name;
471     } elsif ($domain eq 'xml') {
472     if ({
473     'utf-8' => 1,
474     'utf-16' => 1,
475     'iso-10646-ucs-2' => 1,
476     'iso-10646-ucs-4' => 1,
477     'iso-8859-1' => 1,
478     'iso-8859-2' => 1,
479     'iso-8859-3' => 1,
480     'iso-8859-4' => 1,
481     'iso-8859-5' => 1,
482     'iso-8859-6' => 1,
483     'iso-8859-7' => 1,
484     'iso-8859-8' => 1,
485     'iso-8859-9' => 1,
486     'iso-8859-10' => 1,
487     'iso-8859-11' => 1,
488     'iso-8859-13' => 1,
489     'iso-8859-14' => 1,
490     'iso-8859-15' => 1,
491     'iso-8859-16' => 1,
492     'iso-2022-jp' => 1,
493     'shift_jis' => 1,
494     'euc-jp' => 1,
495     }->{$name}) {
496     return $XML_CHARSET . $name;
497     }
498    
499     my $uri = $IANA_CHARSET . $name;
500     return $uri if $Whatpm::Charset::CharsetDef->{$uri};
501    
502     return $XML_CHARSET . $name;
503     } else {
504     return undef;
505     }
506     } # name_to_uri
507    
508     sub uri_to_name ($$$) {
509     my (undef, $domain, $uri) = @_;
510 wakaba 1.2
511     if ($domain eq 'xml') {
512     my $v = $Whatpm::Charset::CharsetDef->{$uri}->{xml_name};
513     return $v if defined $v;
514    
515     if (substr ($uri, 0, length $XML_CHARSET) eq $XML_CHARSET) {
516     return substr ($uri, length $XML_CHARSET);
517     }
518    
519     $domain = 'ietf'; ## TODO: XML encoding name has smaller range
520     }
521    
522     if ($domain eq 'ietf') {
523     my $v = $Whatpm::Charset::CharsetDef->{$uri}->{iana_name};
524     return $v->[0] if defined $v;
525    
526     if (substr ($uri, 0, length $IANA_CHARSET) eq $IANA_CHARSET) {
527 wakaba 1.1 return substr ($uri, length $IANA_CHARSET);
528     }
529     }
530    
531     return undef;
532     } # uri_to_name
533    
534 wakaba 1.3 require IO::Handle;
535    
536     package Whatpm::Charset::DecodeHandle::ByteBuffer;
537    
538 wakaba 1.7 ## NOTE: Provides a byte buffer wrapper object.
539    
540 wakaba 1.3 sub new ($$) {
541     my $self = bless {
542     buffer => '',
543     }, shift;
544     $self->{filehandle} = shift;
545     return $self;
546     } # new
547    
548     sub read {
549     my $self = shift;
550     my $pos = length $self->{buffer};
551     my $r = $self->{filehandle}->read ($self->{buffer}, $_[1], $pos);
552     substr ($_[0], $_[2]) = substr ($self->{buffer}, $pos);
553 wakaba 1.8 ## NOTE: This would do different behavior from Perl's standard
554     ## |read| when $pos points beyond the end of the string.
555 wakaba 1.3 return $r;
556     } # read
557    
558     sub close { $_[0]->{filehandle}->close }
559    
560 wakaba 1.11 package Whatpm::Charset::DecodeHandle::CharString;
561    
562     ## NOTE: Same as Perl's standard |open $handle, '<', \$char_string|,
563     ## but supports |ungetc| and other extensions.
564    
565     sub new ($$) {
566     my $self = bless {pos => 0}, shift;
567     $self->{string} = shift; # must be a scalar ref
568     return $self;
569     } # new
570    
571     sub getc ($) {
572     my $self = shift;
573     if ($self->{pos} < length ${$self->{string}}) {
574     return substr ${$self->{string}}, $self->{pos}++, 1;
575     } else {
576     return undef;
577     }
578     } # getc
579    
580     sub read ($$$$) {
581     #my ($self, $scalar, $length, $offset) = @_;
582     my $self = $_[0];
583     my $length = $_[2] || 0;
584     my $offset = $_[3];
585     ## NOTE: We don't support standard Perl semantics if $offset is
586     ## greater than the length of $scalar.
587     substr ($_[1], $offset) = substr (${$self->{string}}, $self->{pos}, $length);
588     my $count = (length $_[1]) - $offset;
589     $self->{pos} += $count;
590     return $count;
591     } # read
592    
593     sub manakai_getc_until ($$) {
594     my ($self, $pattern) = @_;
595     pos (${$self->{string}}) = $self->{pos};
596     if (${$self->{string}} =~ /\G(?>$pattern)+/) {
597     my $s = substr (${$self->{string}}, $-[0], $+[0] - $-[0]);
598     $self->{pos} += $+[0] - $-[0];
599     return \$s;
600     } else {
601     return undef;
602     }
603     } # manakai_getc_until
604    
605     sub ungetc ($$) {
606     my $self = shift;
607     ## Ignore second parameter.
608     $self->{pos}-- if $self->{pos} > 0;
609     } # ungetc
610    
611     sub close ($) { }
612    
613 wakaba 1.1 package Whatpm::Charset::DecodeHandle::Encode;
614    
615 wakaba 1.7 ## NOTE: Provides a Perl |Encode| module wrapper object.
616    
617 wakaba 1.1 sub charset ($) { $_[0]->{charset} }
618    
619 wakaba 1.3 sub close ($) { $_[0]->{filehandle}->close }
620 wakaba 1.1
621     sub getc ($) {
622 wakaba 1.9 my $c = '';
623     my $l = $_[0]->read ($c, 1);
624     if ($l) {
625     return $c;
626     } else {
627     return undef;
628     }
629     } # getc
630    
631     sub read ($$$;$) {
632 wakaba 1.1 my $self = $_[0];
633 wakaba 1.9 # $scalar = $_[1];
634     my $length = $_[2];
635     my $offset = $_[3] || 0;
636     my $count = 0;
637     my $eof;
638 wakaba 1.10 ## NOTE: It is incompatible with the standard Perl semantics
639     ## if $offset is greater than the length of $scalar.
640    
641 wakaba 1.9 A: {
642     return $count if $length < 1;
643    
644 wakaba 1.11 if (my $l = (length ${$self->{char_buffer}}) - $self->{char_buffer_pos}) {
645 wakaba 1.9 if ($l >= $length) {
646 wakaba 1.11 substr ($_[1], $offset)
647     = substr (${$self->{char_buffer}}, $self->{char_buffer_pos},
648     $length);
649 wakaba 1.9 $count += $length;
650 wakaba 1.11 $self->{char_buffer_pos} += $length;
651 wakaba 1.9 $length = 0;
652     return $count;
653     } else {
654 wakaba 1.11 substr ($_[1], $offset)
655     = substr (${$self->{char_buffer}}, $self->{char_buffer_pos});
656 wakaba 1.9 $count += $l;
657     $length -= $l;
658     ${$self->{char_buffer}} = '';
659 wakaba 1.11 $self->{char_buffer_pos} = 0;
660 wakaba 1.9 }
661     $offset = length $_[1];
662     }
663    
664     if ($eof) {
665     return $count;
666     }
667 wakaba 1.1
668     my $error;
669     if ($self->{continue}) {
670 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
671     length $self->{byte_buffer})) {
672 wakaba 1.1 #
673     } else {
674     $error = 1;
675     }
676     $self->{continue} = 0;
677     } elsif (512 > length $self->{byte_buffer}) {
678 wakaba 1.9 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
679     length $self->{byte_buffer})) {
680     #
681     } else {
682     $eof = 1;
683     }
684 wakaba 1.1 }
685    
686     unless ($error) {
687 wakaba 1.4 if (not $self->{bom_checked}) {
688     if (defined $self->{bom_pattern}) {
689     if ($self->{byte_buffer} =~ s/^$self->{bom_pattern}//) {
690     $self->{has_bom} = 1;
691     }
692     }
693     $self->{bom_checked} = 1;
694     }
695    
696 wakaba 1.1 my $string = Encode::decode ($self->{perl_encoding_name},
697     $self->{byte_buffer},
698     Encode::FB_QUIET ());
699     if (length $string) {
700 wakaba 1.9 $self->{char_buffer} = \$string;
701 wakaba 1.11 $self->{char_buffer_pos} = 0;
702 wakaba 1.1 if (length $self->{byte_buffer}) {
703     $self->{continue} = 1;
704     }
705     } else {
706     if (length $self->{byte_buffer}) {
707     $error = 1;
708     } else {
709 wakaba 1.9 ## NOTE: No further input
710     redo A;
711 wakaba 1.1 }
712     }
713     }
714    
715     if ($error) {
716 wakaba 1.9 my $r = substr $self->{byte_buffer}, 0, 1, '';
717 wakaba 1.6 my $fallback = $self->{fallback}->{$r};
718     if (defined $fallback) {
719 wakaba 1.7 ## NOTE: This is an HTML5 parse error.
720 wakaba 1.6 $self->{onerror}->($self, 'fallback-char-error', octets => \$r,
721     char => \$fallback,
722 wakaba 1.7 level => $self->{level}->{$self->{error_level}->{'fallback-char-error'}});
723 wakaba 1.9 ${$self->{char_buffer}} .= $fallback;
724 wakaba 1.7 } elsif (exists $self->{fallback}->{$r}) {
725     ## NOTE: This is an HTML5 parse error. In addition, the octet
726     ## is not assigned with a character.
727     $self->{onerror}->($self, 'fallback-unassigned-error', octets => \$r,
728     level => $self->{level}->{$self->{error_level}->{'fallback-unassigned-error'}});
729 wakaba 1.9 ${$self->{char_buffer}} .= $r;
730 wakaba 1.6 } else {
731 wakaba 1.7 $self->{onerror}->($self, 'illegal-octets-error', octets => \$r,
732     level => $self->{level}->{$self->{error_level}->{'illegal-octets-error'}});
733 wakaba 1.9 ${$self->{char_buffer}} .= $r;
734 wakaba 1.6 }
735 wakaba 1.1 }
736    
737 wakaba 1.9 redo A;
738     } # A
739 wakaba 1.11 } # read
740    
741     sub manakai_getc_until ($$) {
742     my ($self, $pattern) = @_;
743     my $s = '';
744     $self->read ($s, 255);
745     if ($s =~ /^(?>$pattern)+/) {
746     my $rem_length = (length $s) - $+[0];
747     if ($rem_length) {
748     if ($self->{char_buffer_pos} > $rem_length) {
749     $self->{char_buffer_pos} -= $rem_length;
750     } else {
751     substr (${$self->{char_buffer}}, 0, $self->{char_buffer_pos})
752     = substr ($s, $+[0]);
753     $self->{char_buffer_pos} = 0;
754     }
755     substr ($s, $+[0]) = '';
756     }
757     return \$s;
758     } elsif (length $s) {
759     if ($self->{char_buffer_pos} > length $s) {
760     $self->{char_buffer_pos} -= length $s;
761     } else {
762     substr (${$self->{char_buffer}}, 0, $self->{char_buffer_pos}) = $s;
763     $self->{char_buffer_pos} = 0;
764     }
765     }
766     return undef;
767     } # manakai_getc_until
768 wakaba 1.1
769     sub has_bom ($) { $_[0]->{has_bom} }
770    
771 wakaba 1.2 sub input_encoding ($) {
772     my $v = $_[0]->{input_encoding};
773     return $v if defined $v;
774    
775     my $uri = $_[0]->{charset};
776     if (defined $uri) {
777     return Whatpm::Charset::DecodeHandle->uri_to_name (xml => $uri);
778     }
779    
780     return undef;
781     } # input_encoding
782 wakaba 1.1
783     sub onerror ($;$) {
784     if (@_ > 1) {
785     $_[0]->{onerror} = $_[1];
786     }
787    
788     return $_[0]->{onerror};
789     } # onerror
790    
791     sub ungetc ($$) {
792     unshift @{$_[0]->{character_queue}}, chr int ($_[1] or 0);
793     } # ungetc
794    
795     package Whatpm::Charset::DecodeHandle::EUCJP;
796     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
797    
798     sub getc ($) {
799     my $self = $_[0];
800     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
801    
802     my $error;
803     if ($self->{continue}) {
804 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
805     length $self->{byte_buffer})) {
806 wakaba 1.1 #
807     } else {
808     $error = 1;
809     }
810     $self->{continue} = 0;
811     } elsif (512 > length $self->{byte_buffer}) {
812 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
813     length $self->{byte_buffer});
814 wakaba 1.1 }
815    
816     my $r;
817     unless ($error) {
818     my $string = Encode::decode ($self->{perl_encoding_name},
819     $self->{byte_buffer},
820     Encode::FB_QUIET ());
821     if (length $string) {
822     push @{$self->{character_queue}}, split //, $string;
823     $r = shift @{$self->{character_queue}};
824     if (length $self->{byte_buffer}) {
825     $self->{continue} = 1;
826     }
827     } else {
828     if (length $self->{byte_buffer}) {
829     $error = 1;
830     } else {
831     $r = undef;
832     }
833     }
834     }
835    
836     if ($error) {
837     $r = substr $self->{byte_buffer}, 0, 1, '';
838     my $etype = 'illegal-octets-error';
839     if ($r =~ /^[\xA1-\xFE]/) {
840     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
841     $r .= $1;
842     $etype = 'unassigned-code-point-error';
843     }
844     } elsif ($r eq "\x8F") {
845     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE][\xA1-\xFE]?)//) {
846     $r .= $1;
847     $etype = 'unassigned-code-point-error' if length $1 == 2;
848     }
849     } elsif ($r eq "\x8E") {
850     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
851     $r .= $1;
852     $etype = 'unassigned-code-point-error';
853     }
854     } elsif ($r eq "\xA0" or $r eq "\xFF") {
855     $etype = 'unassigned-code-point-error';
856     }
857 wakaba 1.7 $self->{onerror}->($self, $etype, octets => \$r,
858     level => $self->{level}->{$self->{error_level}->{$etype}});
859 wakaba 1.1 }
860    
861     return $r;
862     } # getc
863    
864 wakaba 1.9 ## TODO: This is not good for performance. Should be replaced
865     ## by read-centric implementation.
866     sub read ($$$;$) {
867     #my ($self, $scalar, $length, $offset) = @_;
868     my $length = $_[2];
869     my $r = '';
870     while ($length > 0) {
871     my $c = $_[0]->getc;
872     last unless defined $c;
873     $r .= $c;
874     $length--;
875     }
876     substr ($_[1], $_[3]) = $r;
877     ## NOTE: This would do different thing from what Perl's |read| do
878     ## if $offset points beyond the end of the $scalar.
879     return length $r;
880     } # read
881    
882 wakaba 1.11 sub manakai_getc_until ($$) {
883     my ($self, $pattern) = @_;
884     my $c = $self->getc;
885     if ($c =~ /^$pattern/) {
886     return \$c;
887     } elsif (defined $c) {
888     $self->ungetc (ord $c);
889     return undef;
890     } else {
891     return undef;
892     }
893     } # manakai_getc_until
894    
895 wakaba 1.1 package Whatpm::Charset::DecodeHandle::ISO2022JP;
896     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
897    
898     sub getc ($) {
899     my $self = $_[0];
900     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
901    
902     my $r;
903     A: {
904     my $error;
905     if ($self->{continue}) {
906 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
907     length $self->{byte_buffer})) {
908 wakaba 1.1 #
909     } else {
910     $error = 1;
911     }
912     $self->{continue} = 0;
913     } elsif (512 > length $self->{byte_buffer}) {
914 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
915     length $self->{byte_buffer});
916 wakaba 1.1 }
917    
918     unless ($error) {
919     if ($self->{byte_buffer} =~ s/^\x1B(\x24[\x40\x42]|\x28[\x42\x4A])//) {
920     $self->{state} = {
921     "\x24\x40" => 'state_2440',
922     "\x24\x42" => 'state_2442',
923     "\x28\x42" => 'state_2842',
924     "\x28\x4A" => 'state_284A',
925     }->{$1};
926     redo A;
927     } elsif ($self->{state} eq 'state_2842') { # IRV
928     if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
929     push @{$self->{character_queue}}, split //, $1;
930     $r = shift @{$self->{character_queue}};
931     } else {
932     if (length $self->{byte_buffer}) {
933     $error = 1;
934     } else {
935     $r = undef;
936     }
937     }
938     } elsif ($self->{state} eq 'state_284A') { # 0201
939     if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
940     my $v = $1;
941     $v =~ tr/\x5C\x7E/\xA5\x{203E}/;
942     push @{$self->{character_queue}}, split //, $v;
943     $r = shift @{$self->{character_queue}};
944     } else {
945     if (length $self->{byte_buffer}) {
946     $error = 1;
947     } else {
948     $r = undef;
949     $self->{onerror}->($self, 'invalid-state-error',
950 wakaba 1.7 state => $self->{state},
951     level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}});
952 wakaba 1.1 }
953     }
954     } elsif ($self->{state} eq 'state_2442') { # 1983
955     my $v = Encode::decode ($self->{state_2442},
956     $self->{byte_buffer},
957     Encode::FB_QUIET ());
958     if (length $v) {
959     push @{$self->{character_queue}}, split //, $v;
960     $r = shift @{$self->{character_queue}};
961     } else {
962     if (length $self->{byte_buffer}) {
963     $error = 1;
964     } else {
965     $r = undef;
966     $self->{onerror}->($self, 'invalid-state-error',
967 wakaba 1.7 state => $self->{state},
968     level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}});
969 wakaba 1.1 }
970     }
971     } elsif ($self->{state} eq 'state_2440') { # 1978
972     my $v = Encode::decode ($self->{state_2440},
973     $self->{byte_buffer},
974     Encode::FB_QUIET ());
975     if (length $v) {
976     push @{$self->{character_queue}}, split //, $v;
977     $r = shift @{$self->{character_queue}};
978     } else {
979     if (length $self->{byte_buffer}) {
980     $error = 1;
981     } else {
982     $r = undef;
983     $self->{onerror}->($self, 'invalid-state-error',
984 wakaba 1.7 state => $self->{state},
985     level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}});
986 wakaba 1.1 }
987     }
988     } else {
989     $error = 1;
990     }
991     }
992    
993     if ($error) {
994     $r = substr $self->{byte_buffer}, 0, 1, '';
995     my $etype = 'illegal-octets-error';
996     if (($self->{state} eq 'state_2442' or
997     $self->{state} eq 'state_2440') and
998     $r =~ /^[\x21-\x7E]/ and
999     $self->{byte_buffer} =~ s/^([\x21-\x7E])//) {
1000     $r .= $1;
1001     $etype = 'unassigned-code-point-error';
1002     } elsif ($r eq "\x1B" and
1003     $self->{byte_buffer} =~ s/^\(H//) { # Old 0201
1004     $r .= "(H";
1005     $self->{state} = 'state_284A';
1006     }
1007 wakaba 1.7 $self->{onerror}->($self, $etype, octets => \$r,
1008     level => $self->{level}->{$self->{error_level}->{$etype}});
1009 wakaba 1.1 }
1010     } # A
1011    
1012     return $r;
1013     } # getc
1014    
1015 wakaba 1.9 ## TODO: This is not good for performance. Should be replaced
1016     ## by read-centric implementation.
1017     sub read ($$$;$) {
1018     #my ($self, $scalar, $length, $offset) = @_;
1019     my $length = $_[2];
1020     my $r = '';
1021     while ($length > 0) {
1022     my $c = $_[0]->getc;
1023     last unless defined $c;
1024     $r .= $c;
1025     $length--;
1026     }
1027     substr ($_[1], $_[3]) = $r;
1028     ## NOTE: This would do different thing from what Perl's |read| do
1029     ## if $offset points beyond the end of the $scalar.
1030     return length $r;
1031     } # read
1032    
1033 wakaba 1.11 sub manakai_getc_until ($$) {
1034     my ($self, $pattern) = @_;
1035     my $c = $self->getc;
1036     if ($c =~ /^$pattern/) {
1037     return \$c;
1038     } elsif (defined $c) {
1039     $self->ungetc (ord $c);
1040     return undef;
1041     } else {
1042     return undef;
1043     }
1044     } # manakai_getc_until
1045    
1046 wakaba 1.1 package Whatpm::Charset::DecodeHandle::ShiftJIS;
1047     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
1048    
1049     sub getc ($) {
1050     my $self = $_[0];
1051     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
1052    
1053     my $error;
1054     if ($self->{continue}) {
1055 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
1056     length $self->{byte_buffer})) {
1057 wakaba 1.1 #
1058     } else {
1059     $error = 1;
1060     }
1061     $self->{continue} = 0;
1062     } elsif (512 > length $self->{byte_buffer}) {
1063 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
1064     length $self->{byte_buffer});
1065 wakaba 1.1 }
1066    
1067     my $r;
1068     unless ($error) {
1069     my $string = Encode::decode ($self->{perl_encoding_name},
1070     $self->{byte_buffer},
1071     Encode::FB_QUIET ());
1072     if (length $string) {
1073     push @{$self->{character_queue}}, split //, $string;
1074     $r = shift @{$self->{character_queue}};
1075     if (length $self->{byte_buffer}) {
1076     $self->{continue} = 1;
1077     }
1078     } else {
1079     if (length $self->{byte_buffer}) {
1080     $error = 1;
1081     } else {
1082     $r = undef;
1083     }
1084     }
1085     }
1086    
1087     if ($error) {
1088     $r = substr $self->{byte_buffer}, 0, 1, '';
1089     my $etype = 'illegal-octets-error';
1090 wakaba 1.5 if ($r =~ /^[\x81-\x9F\xE0-\xFC]/) {
1091 wakaba 1.1 if ($self->{byte_buffer} =~ s/(.)//s) {
1092     $r .= $1; # not limited to \x40-\xFC - \x7F
1093     $etype = 'unassigned-code-point-error';
1094     }
1095 wakaba 1.5 ## NOTE: Range [\xF0-\xFC] is unassigned and may be used as a single-byte
1096     ## character or as the first-byte of a double-byte character according
1097     ## to JIS X 0208:1997 Appendix 1. However, the current practice is
1098     ## use the range as the first-byte of double-byte characters.
1099     } elsif ($r =~ /^[\x80\xA0\xFD-\xFF]/) {
1100 wakaba 1.1 $etype = 'unassigned-code-point-error';
1101     }
1102 wakaba 1.7 $self->{onerror}->($self, $etype, octets => \$r,
1103     level => $self->{level}->{$self->{error_level}->{$etype}});
1104 wakaba 1.1 }
1105    
1106     return $r;
1107     } # getc
1108    
1109 wakaba 1.9 ## TODO: This is not good for performance. Should be replaced
1110     ## by read-centric implementation.
1111     sub read ($$$;$) {
1112     #my ($self, $scalar, $length, $offset) = @_;
1113     my $length = $_[2];
1114     my $r = '';
1115     while ($length > 0) {
1116     my $c = $_[0]->getc;
1117     last unless defined $c;
1118     $r .= $c;
1119     $length--;
1120     }
1121     substr ($_[1], $_[3]) = $r;
1122     ## NOTE: This would do different thing from what Perl's |read| do
1123     ## if $offset points beyond the end of the $scalar.
1124 wakaba 1.11
1125 wakaba 1.9 return length $r;
1126     } # read
1127    
1128 wakaba 1.11 sub manakai_getc_until ($$) {
1129     my ($self, $pattern) = @_;
1130     my $c = $self->getc;
1131     if ($c =~ /^$pattern/) {
1132     return \$c;
1133     } elsif (defined $c) {
1134     $self->ungetc (ord $c);
1135     return undef;
1136     } else {
1137     return undef;
1138     }
1139     } # manakai_getc_until
1140    
1141 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us-ascii'} =
1142     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us'} =
1143     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso646-us'} =
1144     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:cp367'} =
1145     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ibm367'} =
1146     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1986'} =
1147     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1968'} =
1148     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-ir-6'} =
1149     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:csascii'} =
1150     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_646.irv:1991'} =
1151     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ascii'} = {ascii8 =>
1152     '1',
1153     is_block_safe =>
1154     '1',
1155     ietf_name =>
1156     ['ansi_x3.4-1968',
1157     'ansi_x3.4-1986',
1158     'ascii',
1159     'cp367',
1160     'csascii',
1161     'ibm367',
1162     'iso-ir-6',
1163     'iso646-us',
1164     'iso_646.irv:1991',
1165     'us',
1166     'us-ascii',
1167     'us-ascii'],
1168     mime_name =>
1169     'us-ascii',
1170     perl_name =>
1171     ['ascii',
1172     'iso-646-us',
1173     'us-ascii'],
1174     utf8_encoding_scheme =>
1175     '1',
1176     'uri',
1177     {'urn:x-suika-fam-cx:charset:ansi_x3.4-1968',
1178     '1',
1179     'urn:x-suika-fam-cx:charset:ansi_x3.4-1986',
1180     '1',
1181     'urn:x-suika-fam-cx:charset:ascii',
1182     '1',
1183     'urn:x-suika-fam-cx:charset:cp367',
1184     '1',
1185     'urn:x-suika-fam-cx:charset:csascii',
1186     '1',
1187     'urn:x-suika-fam-cx:charset:ibm367',
1188     '1',
1189     'urn:x-suika-fam-cx:charset:iso-ir-6',
1190     '1',
1191     'urn:x-suika-fam-cx:charset:iso646-us',
1192     '1',
1193     'urn:x-suika-fam-cx:charset:iso_646.irv:1991',
1194     '1',
1195     'urn:x-suika-fam-cx:charset:us',
1196     '1',
1197     'urn:x-suika-fam-cx:charset:us-ascii',
1198     '1'},
1199 wakaba 1.2 };
1200    
1201 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl'} = {perl_name =>
1202     ['ascii-ctrl'],
1203     'uri',
1204     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl',
1205     '1'}};
1206     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null'} = {perl_name =>
1207     ['null'],
1208     'uri',
1209     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null',
1210     '1'}};
1211     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8'} = {ascii8 =>
1212     '1',
1213     bom_allowed =>
1214     '1',
1215     no_bom_variant =>
1216     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1217     utf8_encoding_scheme =>
1218     '1',
1219     'uri',
1220     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8',
1221     '1'},
1222 wakaba 1.2 xml_name => 'UTF-8',
1223     };
1224    
1225 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279'} = {ascii8 =>
1226     '1',
1227     bom_allowed =>
1228     '1',
1229     no_bom_variant =>
1230     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1231     utf8_encoding_scheme =>
1232     '1',
1233     'uri',
1234     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279',
1235     '1'}};
1236     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8'} = {
1237     ascii8 => 1,
1238     is_block_safe =>
1239     '1',
1240     perl_name =>
1241     ['utf-8'],
1242     utf8_encoding_scheme =>
1243     '1',
1244     'uri',
1245     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
1246     '1'}};
1247     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-8'} = {
1248     ascii8 => 1,
1249     bom_allowed =>
1250     '1',
1251     no_bom_variant =>
1252     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
1253     ietf_name =>
1254     ['utf-8'],
1255     mime_name =>
1256     'utf-8',
1257     utf8_encoding_scheme =>
1258     '1',
1259     'uri',
1260     {'urn:x-suika-fam-cx:charset:utf-8',
1261     '1'},
1262 wakaba 1.2 };
1263    
1264 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8'} = {ascii8 =>
1265     '1',
1266     is_block_safe =>
1267     '1',
1268     perl_name =>
1269     ['utf8'],
1270     utf8_encoding_scheme =>
1271     '1',
1272     'uri',
1273     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1274     '1'}};
1275     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16'} = {
1276     ascii16 => 1,
1277     bom_allowed =>
1278     '1',
1279     bom_required =>
1280     '1',
1281     no_bom_variant =>
1282     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1283     no_bom_variant16be =>
1284     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1285     no_bom_variant16le =>
1286     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1287     perl_name =>
1288     ['utf-16'],
1289     'uri',
1290     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16',
1291     '1'},
1292 wakaba 1.2 xml_name => 'UTF-16',
1293     };
1294    
1295 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16'} = {
1296     ascii16 => 1,
1297     bom_allowed =>
1298     '1',
1299     no_bom_variant =>
1300     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1301     no_bom_variant16be =>
1302     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1303     no_bom_variant16le =>
1304     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1305     ietf_name =>
1306     ['utf-16'],
1307     mime_name =>
1308     'utf-16',
1309     'uri',
1310     {'urn:x-suika-fam-cx:charset:utf-16',
1311     '1'},
1312 wakaba 1.2 };
1313    
1314 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16be'} = {
1315     ascii16 => 1,
1316     ascii16be => 1,
1317     bom_allowed =>
1318     '1',
1319     no_bom_variant =>
1320     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1321     no_bom_variant16be =>
1322     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1323     ietf_name =>
1324     ['utf-16be'],
1325     mime_name =>
1326     'utf-16be',
1327     'uri',
1328     {'urn:x-suika-fam-cx:charset:utf-16be',
1329     '1'},
1330 wakaba 1.2 };
1331    
1332 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16le'} = {
1333     ascii16 => 1,
1334     ascii16le => 1,
1335     bom_allowed =>
1336     '1',
1337     no_bom_variant =>
1338     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1339     no_bom_variant16le =>
1340     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1341     ietf_name =>
1342     ['utf-16le'],
1343     mime_name =>
1344     'utf-16le',
1345     'uri',
1346     {'urn:x-suika-fam-cx:charset:utf-16le',
1347     '1'},
1348 wakaba 1.2 };
1349    
1350 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be'} = {
1351     ascii16 => 1,
1352     ascii16be => 1,
1353     is_block_safe =>
1354     '1',
1355     perl_name =>
1356     ['utf-16be'],
1357     'uri',
1358     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1359     '1'}};
1360     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le'} = {
1361     ascii16 => 1,
1362     ascii16le => 1,
1363     is_block_safe =>
1364     '1',
1365     perl_name =>
1366     ['utf-16le'],
1367     'uri',
1368     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1369     '1'}};
1370     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-2'} = {
1371     ascii16 => 1,
1372     bom_allowed =>
1373     '1',
1374     no_bom_variant =>
1375     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1376     no_bom_variant16be =>
1377     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1378     no_bom_variant16le =>
1379     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1380     ietf_name =>
1381     ['csunicode',
1382     'iso-10646-ucs-2'],
1383     mime_name =>
1384     'iso-10646-ucs-2',
1385     'uri',
1386     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2',
1387     '1',
1388     'urn:x-suika-fam-cx:charset:iso-10646-ucs-2',
1389     '1'},
1390 wakaba 1.2 xml_name => 'ISO-10646-UCS-2',
1391     };
1392    
1393 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be'} = {
1394     ascii16 => 1,
1395     ascii16be => 1,
1396     is_block_safe =>
1397     '1',
1398     perl_name =>
1399     ['ucs-2be'],
1400     'uri',
1401     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1402     '1'}};
1403     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le'} = {
1404     ascii16 => 1,
1405     ascii16le => 1,
1406     is_block_safe =>
1407     '1',
1408     perl_name =>
1409     ['ucs-2le'],
1410     'uri',
1411     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1412     '1'}};
1413     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-4'} = {
1414     ascii32 => 1,
1415     bom_allowed =>
1416     '1',
1417     no_bom_variant =>
1418     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1419     no_bom_variant32endian1234 =>
1420     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1421     no_bom_variant32endian4321 =>
1422     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1423     ietf_name =>
1424     ['csucs4',
1425     'iso-10646-ucs-4'],
1426     mime_name =>
1427     'iso-10646-ucs-4',
1428     'uri',
1429     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4',
1430     '1',
1431     'urn:x-suika-fam-cx:charset:iso-10646-ucs-4',
1432     '1'},
1433 wakaba 1.2 xml_name => 'ISO-10646-UCS-4',
1434     };
1435    
1436 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be'} = {
1437     ascii32 => 1,
1438     ascii32endian1234 => 1,
1439     is_block_safe =>
1440     '1',
1441     perl_name =>
1442     ['ucs-4be',
1443     'utf-32be'],
1444     'uri',
1445     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1446     '1'}};
1447     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le'} = {
1448     ascii32 => 1,
1449     ascii32endian4321 => 1,
1450     is_block_safe =>
1451     '1',
1452     perl_name =>
1453     ['ucs-4le',
1454     'utf-32le'],
1455     'uri',
1456     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1457     '1'}};
1458     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_8859-1:1987'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1'} = {ascii8 =>
1459     '1',
1460     is_block_safe =>
1461     '1',
1462     ietf_name =>
1463     ['cp819',
1464     'csisolatin1',
1465     'ibm819',
1466     'iso-8859-1',
1467     'iso-8859-1',
1468     'iso-ir-100',
1469     'iso_8859-1',
1470     'iso_8859-1:1987',
1471     'l1',
1472     'latin1'],
1473     mime_name =>
1474     'iso-8859-1',
1475     perl_name =>
1476     ['iso-8859-1',
1477     'latin1'],
1478     'uri',
1479     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1',
1480     '1',
1481     'urn:x-suika-fam-cx:charset:iso_8859-1:1987',
1482     '1'},
1483 wakaba 1.2 xml_name => 'ISO-8859-1',
1484     };
1485    
1486 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2'} = {ascii8 =>
1487     '1',
1488     is_block_safe =>
1489     '1',
1490     'uri',
1491     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2',
1492     '1'},
1493 wakaba 1.2 xml_name => 'ISO-8859-2',
1494     };
1495    
1496 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3'} = {ascii8 =>
1497     '1',
1498     is_block_safe =>
1499     '1',
1500     'uri',
1501     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3',
1502     '1'},
1503 wakaba 1.2 xml_name => 'ISO-8859-3',
1504     };
1505    
1506 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4'} = {ascii8 =>
1507     '1',
1508     is_block_safe =>
1509     '1',
1510     'uri',
1511     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4',
1512     '1'},
1513 wakaba 1.2 xml_name => 'ISO-8859-4',
1514     };
1515    
1516 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5'} = {ascii8 =>
1517     '1',
1518     is_block_safe =>
1519     '1',
1520     'uri',
1521     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5',
1522     '1'},
1523 wakaba 1.2 xml_name => 'ISO-8859-5',
1524     };
1525    
1526 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6'} = {ascii8 =>
1527     '1',
1528     is_block_safe =>
1529     '1',
1530     'uri',
1531     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6',
1532     '1'},
1533 wakaba 1.2 xml_name => 'ISO-8859-6',
1534     };
1535    
1536 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7'} = {ascii8 =>
1537     '1',
1538     is_block_safe =>
1539     '1',
1540     'uri',
1541     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7',
1542     '1'},
1543 wakaba 1.2 xml_name => 'ISO-8859-7',
1544     };
1545    
1546 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8'} = {ascii8 =>
1547     '1',
1548     is_block_safe =>
1549     '1',
1550     'uri',
1551     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8',
1552     '1'},
1553 wakaba 1.2 xml_name => 'ISO-8859-8',
1554     };
1555    
1556 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9'} = {ascii8 =>
1557     '1',
1558     is_block_safe =>
1559     '1',
1560     'uri',
1561     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9',
1562     '1'},
1563 wakaba 1.2 xml_name => 'ISO-8859-9',
1564     };
1565    
1566 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10'} = {ascii8 =>
1567     '1',
1568     is_block_safe =>
1569     '1',
1570     'uri',
1571     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10',
1572     '1'},
1573 wakaba 1.2 xml_name => 'ISO-8859-10',
1574     };
1575    
1576 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11'} = {ascii8 =>
1577     '1',
1578     is_block_safe =>
1579     '1',
1580     'uri',
1581     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11',
1582     '1'},
1583 wakaba 1.2 xml_name => 'ISO-8859-11',
1584     };
1585    
1586 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13'} = {ascii8 =>
1587     '1',
1588     is_block_safe =>
1589     '1',
1590     'uri',
1591     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13',
1592     '1'},
1593 wakaba 1.2 xml_name => 'ISO-8859-13',
1594     };
1595    
1596 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14'} = {ascii8 =>
1597     '1',
1598     is_block_safe =>
1599     '1',
1600     'uri',
1601     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14',
1602     '1'},
1603 wakaba 1.2 xml_name => 'ISO-8859-14',
1604     };
1605    
1606 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15'} = {ascii8 =>
1607     '1',
1608     is_block_safe =>
1609     '1',
1610     'uri',
1611     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15',
1612     '1'},
1613 wakaba 1.2 xml_name => 'ISO-8859-15',
1614     };
1615    
1616 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16'} = {ascii8 =>
1617     '1',
1618     is_block_safe =>
1619     '1',
1620     'uri',
1621     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16',
1622     '1'},
1623 wakaba 1.2 xml_name => 'ISO-8859-16',
1624     };
1625    
1626 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp'} = {ascii8 =>
1627     '1',
1628     'uri',
1629     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp',
1630     '1'},
1631 wakaba 1.2 xml_name => 'ISO-2022-JP',
1632     };
1633    
1634 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-2022-jp'} = {ascii8 =>
1635     '1',
1636     ietf_name =>
1637     ['csiso2022jp',
1638     'iso-2022-jp',
1639     'iso-2022-jp'],
1640     mime_name =>
1641     'iso-2022-jp',
1642     'uri',
1643     {'urn:x-suika-fam-cx:charset:iso-2022-jp',
1644     '1'},
1645 wakaba 1.2 };
1646    
1647 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp'} = {ascii8 =>
1648     '1',
1649     perl_name =>
1650     ['iso-2022-jp'],
1651     'uri',
1652     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp',
1653     '1'}};
1654     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:shift_jis'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis'} = {ascii8 =>
1655     '1',
1656     is_block_safe =>
1657     '1',
1658     ietf_name =>
1659     ['csshiftjis',
1660     'ms_kanji',
1661     'shift_jis',
1662     'shift_jis'],
1663     mime_name =>
1664     'shift_jis',
1665     perl_name =>
1666     ['shift-jis-1997'],
1667     'uri',
1668     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis',
1669     '1',
1670     'urn:x-suika-fam-cx:charset:shift_jis',
1671     '1'},
1672 wakaba 1.2 xml_name => 'Shift_JIS',
1673     };
1674    
1675 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis'} = {ascii8 =>
1676     '1',
1677     is_block_safe =>
1678     '1',
1679     perl_name =>
1680     ['shiftjis',
1681     'sjis'],
1682     'uri',
1683     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis',
1684     '1'}};
1685     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:euc-jp'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese'} = {ascii8 =>
1686     '1',
1687     is_block_safe =>
1688     '1',
1689     ietf_name =>
1690     ['cseucpkdfmtjapanese',
1691     'euc-jp',
1692     'euc-jp',
1693     'extended_unix_code_packed_format_for_japanese'],
1694     mime_name =>
1695     'euc-jp',
1696     perl_name =>
1697     ['euc-jp-1997'],
1698     'uri',
1699     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp',
1700     '1',
1701     'urn:x-suika-fam-cx:charset:euc-jp',
1702     '1',
1703     'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese',
1704     '1'},
1705 wakaba 1.2 xml_name => 'EUC-JP',
1706     };
1707    
1708 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp'} = {ascii8 =>
1709     '1',
1710     is_block_safe =>
1711     '1',
1712     perl_name =>
1713     ['euc-jp',
1714     'ujis'],
1715     'uri',
1716     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp',
1717     '1'}};
1718    
1719     1;
1720 wakaba 1.11 ## $Date: 2008/09/12 03:31:40 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24