/[suikacvs]/markup/html/whatpm/Whatpm/Charset/DecodeHandle.pm
Suika

Contents of /markup/html/whatpm/Whatpm/Charset/DecodeHandle.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.18 - (hide annotations) (download)
Mon Sep 15 07:19:03 2008 UTC (16 years, 10 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.17: +10 -3 lines
++ whatpm/Whatpm/ChangeLog	15 Sep 2008 07:17:34 -0000
	* HTML.pm.src: Remove checking for control character, surrogate
	pair, or noncharacter code points and non-Unicode code
	points (they should be handled by Whatpm::Charset::UnicodeChecker).
	(parse_char_stream): Support for the |$get_wrapper| argument and
	character stream error handlers.

2008-09-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/Charset/ChangeLog	15 Sep 2008 07:18:45 -0000
	* DecodeHandle.pm (onerror): Return |undef| if no explicit value
	is set.

	* UnicodeChecker.pm: Support for HTML5 parse errors.
	(onerror): Return |undef| if no explicit value is set.

2008-09-15  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::Charset::DecodeHandle;
2     use strict;
3    
4 wakaba 1.9 ## NOTE: |Message::Charset::Info| uses this module without calling
5     ## the constructor.
6 wakaba 1.15 use Message::Charset::Info;
7 wakaba 1.9
8 wakaba 1.1 my $XML_AUTO_CHARSET = q<http://suika.fam.cx/www/2006/03/xml-entity/>;
9     my $IANA_CHARSET = q<urn:x-suika-fam-cx:charset:>;
10     my $PERL_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.>;
11     my $XML_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.>;
12    
13     ## ->create_decode_handle ($charset_uri, $byte_stream, $onerror)
14     sub create_decode_handle ($$$;$) {
15     my $csdef = $Whatpm::Charset::CharsetDef->{$_[1]};
16     my $obj = {
17 wakaba 1.15 category => 0,
18 wakaba 1.9 char_buffer => \(my $s = ''),
19 wakaba 1.11 char_buffer_pos => 0,
20 wakaba 1.1 character_queue => [],
21     filehandle => $_[2],
22     charset => $_[1],
23     byte_buffer => '',
24     onerror => $_[3] || sub {},
25 wakaba 1.18 #onerror_set
26 wakaba 1.1 };
27     if ($csdef->{uri}->{$XML_AUTO_CHARSET} or
28     $obj->{charset} eq $XML_AUTO_CHARSET) {
29     my $b = ''; # UTF-8 w/o BOM
30     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
31 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
32 wakaba 1.1 if (read $obj->{filehandle}, $b, 256) {
33     no warnings "substr";
34     no warnings "uninitialized";
35     if (substr ($b, 0, 1) eq "<") {
36     if (substr ($b, 1, 1) eq "?") { # ASCII8
37     if ($b =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
38     encoding\s*=\s*["']([^"']*)/x) {
39 wakaba 1.2 $obj->{input_encoding} = $1;
40 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
41     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
42     if (not $csdef->{ascii8} or $csdef->{bom_required}) {
43     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
44     charset_uri => $uri,
45     charset_name => $obj->{input_encoding});
46     }
47     } else {
48     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
49 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
50 wakaba 1.1 }
51     if (defined $csdef->{no_bom_variant}) {
52     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
53     }
54     } elsif (substr ($b, 1, 1) eq "\x00") {
55     if (substr ($b, 2, 2) eq "?\x00") { # ASCII16LE
56     my $c = $b; $c =~ tr/\x00//d;
57     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
58     encoding\s*=\s*["']([^"']*)/x) {
59 wakaba 1.2 $obj->{input_encoding} = $1;
60 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
61     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
62     if (not $csdef->{ascii16} or $csdef->{ascii16be} or
63     $csdef->{bom_required}) {
64     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
65     charset_uri => $uri,
66     charset_name => $obj->{input_encoding});
67     }
68     } else {
69     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
70 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
71 wakaba 1.1 }
72     if (defined $csdef->{no_bom_variant16le}) {
73     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
74     }
75     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
76     my $c = $b; $c =~ tr/\x00//d;
77     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
78     encoding\s*=\s*["']([^"']*)/x) {
79 wakaba 1.2 $obj->{input_encoding} = $1;
80 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
81     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
82     if (not $csdef->{ascii32} or
83     $csdef->{ascii32endian1234} or
84     $csdef->{ascii32endian2143} or
85     $csdef->{ascii32endian3412} or
86     $csdef->{bom_required}) {
87     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
88     charset_uri => $uri,
89     charset_name => $obj->{input_encoding});
90     }
91     } else {
92     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
93 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
94 wakaba 1.1 }
95     if (defined $csdef->{no_bom_variant32endian4321}) {
96     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
97     }
98     }
99     }
100     } elsif (substr ($b, 0, 3) eq "\xEF\xBB\xBF") { # UTF8
101     $obj->{has_bom} = 1;
102     substr ($b, 0, 3) = '';
103     my $c = $b;
104     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
105     encoding\s*=\s*["']([^"']*)/x) {
106 wakaba 1.2 $obj->{input_encoding} = $1;
107 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
108     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
109     if (not $csdef->{utf8_encoding_scheme} or
110     not $csdef->{bom_allowed}) {
111     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
112     charset_uri => $uri,
113     charset_name => $obj->{input_encoding});
114     }
115     } else {
116     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
117 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
118 wakaba 1.1 }
119     if (defined $csdef->{no_bom_variant}) {
120     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}};
121     }
122     } elsif (substr ($b, 0, 2) eq "\x00<") {
123     if (substr ($b, 2, 2) eq "\x00?") { # ASCII16BE
124     my $c = $b; $c =~ tr/\x00//d;
125     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
126     encoding\s*=\s*["']([^"']*)/x) {
127 wakaba 1.2 $obj->{input_encoding} = $1;
128 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
129     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
130     if (not $csdef->{ascii16} or $csdef->{ascii16le} or
131     $csdef->{bom_required}) {
132     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
133     charset_uri => $uri,
134     charset_name => $obj->{input_encoding});
135     }
136     } else {
137     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
138 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
139 wakaba 1.1 }
140     if (defined $csdef->{no_bom_variant16be}) {
141     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
142     }
143     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
144     my $c = $b; $c =~ tr/\x00//d;
145     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
146     encoding\s*=\s*["']([^"']*)/x) {
147 wakaba 1.2 $obj->{input_encoding} = $1;
148 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
149     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
150     if (not $csdef->{ascii32} or
151     $csdef->{ascii32endian1234} or
152     $csdef->{ascii32endian2143} or
153     $csdef->{ascii32endian4321} or
154     $csdef->{bom_required}) {
155     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
156     charset_uri => $uri,
157     charset_name => $obj->{input_encoding});
158     }
159     } else {
160     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
161 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
162 wakaba 1.1 }
163     if (defined $csdef->{no_bom_variant32endian3412}) {
164     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
165     }
166     }
167     } elsif (substr ($b, 0, 2) eq "\xFE\xFF") {
168     if (substr ($b, 2, 2) eq "\x00<") { # ASCII16BE
169     $obj->{has_bom} = 1;
170     substr ($b, 0, 2) = '';
171     my $c = $b; $c =~ tr/\x00//d;
172     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
173     encoding\s*=\s*["']([^"']*)/x) {
174 wakaba 1.2 $obj->{input_encoding} = $1;
175 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
176     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
177     if (not $csdef->{ascii16} or
178     $csdef->{ascii16le} or
179     not $csdef->{bom_allowed}) {
180     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
181     charset_uri => $uri,
182     charset_name => $obj->{input_encoding});
183     }
184     } else {
185     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
186 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
187 wakaba 1.1 }
188     if (defined $csdef->{no_bom_variant16be}) {
189     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}};
190     }
191     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412
192     $obj->{has_bom} = 1;
193     substr ($b, 0, 4) = '';
194     my $c = $b; $c =~ tr/\x00//d;
195     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
196     encoding\s*=\s*["']([^"']*)/x) {
197 wakaba 1.2 $obj->{input_encoding} = $1;
198 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
199     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
200     if (not $csdef->{ascii32} or
201     $csdef->{ascii32endian1234} or
202     $csdef->{ascii32endian2143} or
203     $csdef->{ascii32endian4321} or
204     not $csdef->{bom_allowed}) {
205     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
206     charset_uri => $uri,
207     charset_name => $obj->{input_encoding});
208     }
209     } else {
210     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
211 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
212 wakaba 1.1 $obj->{byte_buffer} .= "\x00\x00";
213     }
214     if (defined $csdef->{no_bom_variant32endian3412}) {
215     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}};
216     }
217     } else {
218     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
219 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
220 wakaba 1.1 substr ($b, 0, 2) = '';
221     $obj->{has_bom} = 1;
222     }
223     } elsif (substr ($b, 0, 2) eq "\xFF\xFE") {
224     if (substr ($b, 2, 2) eq "<\x00") { # ASCII16LE
225     $obj->{has_bom} = 1;
226     substr ($b, 0, 2) = '';
227     my $c = $b; $c =~ tr/\x00//d;
228     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
229     encoding\s*=\s*["']([^"']*)/x) {
230 wakaba 1.2 $obj->{input_encoding} = $1;
231 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
232     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
233     if (not $csdef->{ascii16} or
234     $csdef->{ascii16be} or
235     not $csdef->{bom_allowed}) {
236     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
237     charset_uri => $uri,
238     charset_name => $obj->{input_encoding});
239     }
240     } else {
241     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
242 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
243 wakaba 1.1 }
244     if (defined $csdef->{no_bom_variant16le}) {
245     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}};
246     }
247     } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321
248     $obj->{has_bom} = 1;
249     substr ($b, 0, 4) = '';
250     my $c = $b; $c =~ tr/\x00//d;
251     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
252     encoding\s*=\s*["']([^"']*)/x) {
253 wakaba 1.2 $obj->{input_encoding} = $1;
254 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
255     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
256     if (not $csdef->{ascii32} or
257     $csdef->{ascii32endian1234} or
258     $csdef->{ascii32endian2143} or
259     $csdef->{ascii32endian3412} or
260     not $csdef->{bom_allowed}) {
261     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
262     charset_uri => $uri,
263     charset_name => $obj->{input_encoding});
264     }
265     } else {
266     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
267 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
268 wakaba 1.1 $obj->{byte_buffer} .= "\x00\x00";
269     }
270     if (defined $csdef->{no_bom_variant32endian4321}) {
271     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}};
272     }
273     } else {
274     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
275 wakaba 1.2 $obj->{input_encoding} = 'UTF-16';
276 wakaba 1.1 substr ($b, 0, 2) = '';
277     $obj->{has_bom} = 1;
278     }
279     } elsif (substr ($b, 0, 2) eq "\x00\x00") {
280     if (substr ($b, 2, 2) eq "\x00<") { # ASCII32Endian1234
281     my $c = $b; $c =~ tr/\x00//d;
282     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
283     encoding\s*=\s*["']([^"']*)/x) {
284 wakaba 1.2 $obj->{input_encoding} = $1;
285 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
286     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
287     if (not $csdef->{ascii32} or
288     $csdef->{ascii32endian2143} or
289     $csdef->{ascii32endian3412} or
290     $csdef->{ascii32endian4321} or
291     $csdef->{bom_required}) {
292     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
293     charset_uri => $uri,
294     charset_name => $obj->{input_encoding});
295     }
296     } else {
297     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
298 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
299 wakaba 1.1 }
300     if (defined $csdef->{no_bom_variant32endian1234}) {
301     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
302     }
303     } elsif (substr ($b, 2, 2) eq "<\x00") { # ASCII32Endian2143
304     my $c = $b; $c =~ tr/\x00//d;
305     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
306     encoding\s*=\s*["']([^"']*)/x) {
307 wakaba 1.2 $obj->{input_encoding} = $1;
308 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
309     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
310     if (not $csdef->{ascii32} or
311     $csdef->{ascii32endian1234} or
312     $csdef->{ascii32endian3412} or
313     $csdef->{ascii32endian4321} or
314     $csdef->{bom_required}) {
315     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
316     charset_uri => $uri,
317     charset_name => $obj->{input_encoding});
318     }
319     } else {
320     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
321 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
322 wakaba 1.1 }
323     if (defined $csdef->{no_bom_variant32endian2143}) {
324     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
325     }
326     } elsif (substr ($b, 2, 2) eq "\xFE\xFF") { # ASCII32Endian1234
327     $obj->{has_bom} = 1;
328     substr ($b, 0, 4) = '';
329     my $c = $b; $c =~ tr/\x00//d;
330     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
331     encoding\s*=\s*["']([^"']*)/x) {
332 wakaba 1.2 $obj->{input_encoding} = $1;
333 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
334     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
335     if (not $csdef->{ascii32} or
336     $csdef->{ascii32endian2143} or
337     $csdef->{ascii32endian3412} or
338     $csdef->{ascii32endian4321} or
339     $csdef->{bom_required}) {
340     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
341     charset_uri => $uri,
342     charset_name => $obj->{input_encoding});
343     }
344     } else {
345     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
346 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
347 wakaba 1.1 $obj->{has_bom} = 0;
348     $obj->{byte_buffer} .= "\x00\x00\xFE\xFF";
349     }
350     if (defined $csdef->{no_bom_variant32endian1234}) {
351     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}};
352     }
353     } elsif (substr ($b, 2, 2) eq "\xFF\xFE") { # ASCII32Endian2143
354     $obj->{has_bom} = 1;
355     substr ($b, 0, 4) = '';
356     my $c = $b; $c =~ tr/\x00//d;
357     if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)?
358     encoding\s*=\s*["']([^"']*)/x) {
359 wakaba 1.2 $obj->{input_encoding} = $1;
360 wakaba 1.1 my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding});
361     $csdef = $Whatpm::Charset::CharsetDef->{$uri};
362     if (not $csdef->{ascii32} or
363     $csdef->{ascii32endian1234} or
364     $csdef->{ascii32endian3412} or
365     $csdef->{ascii32endian4321} or
366     $csdef->{bom_required}) {
367     $obj->{onerror}->(undef, 'charset-name-mismatch-error',
368     charset_uri => $uri,
369     charset_name => $obj->{input_encoding});
370     }
371     } else {
372     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'};
373 wakaba 1.2 $obj->{input_encoding} = 'UTF-8';
374 wakaba 1.1 $obj->{has_bom} = 0;
375     $obj->{byte_buffer} .= "\x00\x00\xFF\xFE";
376     }
377     if (defined $csdef->{no_bom_variant32endian2143}) {
378     $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}};
379     }
380     }
381     # \x4C\x6F\xA7\x94 EBCDIC
382     } # buffer
383     $obj->{byte_buffer} .= $b;
384     } # read
385     } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-8'}) {
386     ## BOM is optional.
387     my $b = '';
388     if (read $obj->{filehandle}, $b, 3) {
389     if ($b eq "\xEF\xBB\xBF") {
390     $obj->{has_bom} = 1;
391     } else {
392     $obj->{byte_buffer} .= $b;
393     }
394     }
395     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; # UTF-8 w/o BOM
396     } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-16'}) {
397     ## BOM is mandated.
398     my $b = '';
399     if (read $obj->{filehandle}, $b, 2) {
400     if ($b eq "\xFE\xFF") {
401     $obj->{has_bom} = 1; # UTF-16BE w/o BOM
402     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
403     } elsif ($b eq "\xFF\xFE") {
404     $obj->{has_bom} = 1; # UTF-16LE w/o BOM
405     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'};
406     } else {
407     $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
408     $obj->{has_bom} = 0;
409     $obj->{byte_buffer} .= $b; # UTF-16BE w/o BOM
410     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
411     }
412     } else {
413     $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset});
414     $obj->{has_bom} = 0; # UTF-16BE w/o BOM
415     $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'};
416     }
417     }
418    
419     if ($csdef->{uri}->{$XML_CHARSET.'iso-2022-jp'}) {
420     $obj->{state_2440} = 'gl-jis-1997-swapped';
421     $obj->{state_2442} = 'gl-jis-1997';
422     $obj->{state} = 'state_2842';
423     require Encode::GLJIS1997Swapped;
424     require Encode::GLJIS1997;
425     if (Encode::find_encoding ($obj->{state_2440}) and
426     Encode::find_encoding ($obj->{state_2442})) {
427     return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
428     }
429     } elsif ($csdef->{uri}->{$IANA_CHARSET.'iso-2022-jp'}) {
430     $obj->{state_2440} = 'gl-jis-1978';
431     $obj->{state_2442} = 'gl-jis-1983';
432     $obj->{state} = 'state_2842';
433     require Encode::GLJIS1978;
434     require Encode::GLJIS1983;
435     if (Encode::find_encoding ($obj->{state_2440}) and
436     Encode::find_encoding ($obj->{state_2442})) {
437     return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP';
438     }
439     } elsif (defined $csdef->{perl_name}->[0]) {
440     if ($csdef->{uri}->{$XML_CHARSET.'euc-jp'} or
441     $csdef->{uri}->{$IANA_CHARSET.'euc-jp'}) {
442     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
443     require Encode::EUCJP1997;
444     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
445 wakaba 1.15 $obj->{category} |= Message::Charset::Info::CHARSET_CATEGORY_EUCJP;
446     return bless $obj, 'Whatpm::Charset::DecodeHandle::Encode';
447 wakaba 1.1 }
448     } elsif ($csdef->{uri}->{$XML_CHARSET.'shift_jis'} or
449     $csdef->{uri}->{$IANA_CHARSET.'shift_jis'}) {
450     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
451     require Encode::ShiftJIS1997;
452     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
453 wakaba 1.16 return bless $obj, 'Whatpm::Charset::DecodeHandle::Encode';
454 wakaba 1.1 }
455     } elsif ($csdef->{is_block_safe}) {
456     $obj->{perl_encoding_name} = $csdef->{perl_name}->[0];
457     require Encode;
458     if (Encode::find_encoding ($obj->{perl_encoding_name})) {
459     return bless $obj, 'Whatpm::Charset::DecodeHandle::Encode';
460     }
461     }
462     }
463    
464     $obj->{onerror}->(undef, 'charset-not-supported-error',
465     charset_uri => $obj->{charset});
466     return undef;
467     } # create_decode_handle
468    
469     sub name_to_uri ($$$) {
470     my $domain = $_[1];
471     my $name = lc $_[2];
472    
473     if ($domain eq 'ietf') {
474     return $IANA_CHARSET . $name;
475     } elsif ($domain eq 'xml') {
476     if ({
477     'utf-8' => 1,
478     'utf-16' => 1,
479     'iso-10646-ucs-2' => 1,
480     'iso-10646-ucs-4' => 1,
481     'iso-8859-1' => 1,
482     'iso-8859-2' => 1,
483     'iso-8859-3' => 1,
484     'iso-8859-4' => 1,
485     'iso-8859-5' => 1,
486     'iso-8859-6' => 1,
487     'iso-8859-7' => 1,
488     'iso-8859-8' => 1,
489     'iso-8859-9' => 1,
490     'iso-8859-10' => 1,
491     'iso-8859-11' => 1,
492     'iso-8859-13' => 1,
493     'iso-8859-14' => 1,
494     'iso-8859-15' => 1,
495     'iso-8859-16' => 1,
496     'iso-2022-jp' => 1,
497     'shift_jis' => 1,
498     'euc-jp' => 1,
499     }->{$name}) {
500     return $XML_CHARSET . $name;
501     }
502    
503     my $uri = $IANA_CHARSET . $name;
504     return $uri if $Whatpm::Charset::CharsetDef->{$uri};
505    
506     return $XML_CHARSET . $name;
507     } else {
508     return undef;
509     }
510     } # name_to_uri
511    
512     sub uri_to_name ($$$) {
513     my (undef, $domain, $uri) = @_;
514 wakaba 1.2
515     if ($domain eq 'xml') {
516     my $v = $Whatpm::Charset::CharsetDef->{$uri}->{xml_name};
517     return $v if defined $v;
518    
519     if (substr ($uri, 0, length $XML_CHARSET) eq $XML_CHARSET) {
520     return substr ($uri, length $XML_CHARSET);
521     }
522    
523     $domain = 'ietf'; ## TODO: XML encoding name has smaller range
524     }
525    
526     if ($domain eq 'ietf') {
527     my $v = $Whatpm::Charset::CharsetDef->{$uri}->{iana_name};
528     return $v->[0] if defined $v;
529    
530     if (substr ($uri, 0, length $IANA_CHARSET) eq $IANA_CHARSET) {
531 wakaba 1.1 return substr ($uri, length $IANA_CHARSET);
532     }
533     }
534    
535     return undef;
536     } # uri_to_name
537    
538 wakaba 1.3 require IO::Handle;
539    
540     package Whatpm::Charset::DecodeHandle::ByteBuffer;
541    
542 wakaba 1.7 ## NOTE: Provides a byte buffer wrapper object.
543    
544 wakaba 1.3 sub new ($$) {
545     my $self = bless {
546     buffer => '',
547     }, shift;
548     $self->{filehandle} = shift;
549     return $self;
550     } # new
551    
552     sub read {
553     my $self = shift;
554     my $pos = length $self->{buffer};
555     my $r = $self->{filehandle}->read ($self->{buffer}, $_[1], $pos);
556     substr ($_[0], $_[2]) = substr ($self->{buffer}, $pos);
557 wakaba 1.8 ## NOTE: This would do different behavior from Perl's standard
558     ## |read| when $pos points beyond the end of the string.
559 wakaba 1.3 return $r;
560     } # read
561    
562     sub close { $_[0]->{filehandle}->close }
563    
564 wakaba 1.11 package Whatpm::Charset::DecodeHandle::CharString;
565    
566     ## NOTE: Same as Perl's standard |open $handle, '<', \$char_string|,
567     ## but supports |ungetc| and other extensions.
568    
569     sub new ($$) {
570     my $self = bless {pos => 0}, shift;
571     $self->{string} = shift; # must be a scalar ref
572     return $self;
573     } # new
574    
575     sub getc ($) {
576     my $self = shift;
577     if ($self->{pos} < length ${$self->{string}}) {
578     return substr ${$self->{string}}, $self->{pos}++, 1;
579     } else {
580     return undef;
581     }
582     } # getc
583    
584     sub read ($$$$) {
585     #my ($self, $scalar, $length, $offset) = @_;
586     my $self = $_[0];
587     my $length = $_[2] || 0;
588     my $offset = $_[3];
589     ## NOTE: We don't support standard Perl semantics if $offset is
590     ## greater than the length of $scalar.
591     substr ($_[1], $offset) = substr (${$self->{string}}, $self->{pos}, $length);
592     my $count = (length $_[1]) - $offset;
593     $self->{pos} += $count;
594     return $count;
595     } # read
596    
597 wakaba 1.12 sub manakai_read_until ($$$;$) {
598     #my ($self, $scalar, $pattern, $offset) = @_;
599     my $self = $_[0];
600 wakaba 1.11 pos (${$self->{string}}) = $self->{pos};
601 wakaba 1.12 if (${$self->{string}} =~ /\G(?>$_[2])+/) {
602     substr ($_[1], $_[3]) = substr (${$self->{string}}, $-[0], $+[0] - $-[0]);
603 wakaba 1.11 $self->{pos} += $+[0] - $-[0];
604 wakaba 1.12 return $+[0] - $-[0];
605 wakaba 1.11 } else {
606 wakaba 1.12 return 0;
607 wakaba 1.11 }
608 wakaba 1.12 } # manakai_read_until
609 wakaba 1.11
610     sub ungetc ($$) {
611     my $self = shift;
612     ## Ignore second parameter.
613     $self->{pos}-- if $self->{pos} > 0;
614     } # ungetc
615    
616     sub close ($) { }
617    
618 wakaba 1.17 sub onerror ($;$) { }
619    
620 wakaba 1.1 package Whatpm::Charset::DecodeHandle::Encode;
621    
622 wakaba 1.7 ## NOTE: Provides a Perl |Encode| module wrapper object.
623    
624 wakaba 1.1 sub charset ($) { $_[0]->{charset} }
625    
626 wakaba 1.3 sub close ($) { $_[0]->{filehandle}->close }
627 wakaba 1.1
628     sub getc ($) {
629 wakaba 1.9 my $c = '';
630     my $l = $_[0]->read ($c, 1);
631     if ($l) {
632     return $c;
633     } else {
634     return undef;
635     }
636     } # getc
637    
638     sub read ($$$;$) {
639 wakaba 1.1 my $self = $_[0];
640 wakaba 1.14 #my $scalar = $_[1];
641     my $length = $_[2];
642     my $offset = $_[3] || 0;
643     my $count = 0;
644     my $eof;
645     ## NOTE: It is incompatible with the standard Perl semantics
646     ## if $offset is greater than the length of $scalar.
647    
648     A: {
649     return $count if $length < 1;
650    
651     if (my $l = (length ${$self->{char_buffer}}) - $self->{char_buffer_pos}) {
652     if ($l >= $length) {
653     substr ($_[1], $offset)
654     = substr (${$self->{char_buffer}}, $self->{char_buffer_pos},
655     $length);
656     $count += $length;
657     $self->{char_buffer_pos} += $length;
658     $length = 0;
659     return $count;
660     } else {
661     substr ($_[1], $offset)
662     = substr (${$self->{char_buffer}}, $self->{char_buffer_pos});
663     $count += $l;
664     $length -= $l;
665     ${$self->{char_buffer}} = '';
666     $self->{char_buffer_pos} = 0;
667     }
668     $offset = length $_[1];
669     }
670 wakaba 1.1
671 wakaba 1.14 if ($eof) {
672     return $count;
673 wakaba 1.1 }
674 wakaba 1.14
675     my $error;
676     if ($self->{continue}) {
677     if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
678     length $self->{byte_buffer})) {
679     #
680     } else {
681     $error = 1;
682     }
683     $self->{continue} = 0;
684     } elsif (512 > length $self->{byte_buffer}) {
685     if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
686     length $self->{byte_buffer})) {
687     #
688     } else {
689     $eof = 1;
690 wakaba 1.1 }
691 wakaba 1.14 }
692    
693     unless ($error) {
694 wakaba 1.15 if (not $self->{bom_checked}) {
695     if (defined $self->{bom_pattern}) {
696     if ($self->{byte_buffer} =~ s/^$self->{bom_pattern}//) {
697     $self->{has_bom} = 1;
698     }
699     }
700     $self->{bom_checked} = 1;
701     }
702    
703 wakaba 1.14 my $string = Encode::decode ($self->{perl_encoding_name},
704     $self->{byte_buffer},
705     Encode::FB_QUIET ());
706     if (length $string) {
707     $self->{char_buffer} = \$string;
708     $self->{char_buffer_pos} = 0;
709     if (length $self->{byte_buffer}) {
710     $self->{continue} = 1;
711     }
712 wakaba 1.1 } else {
713 wakaba 1.14 if (length $self->{byte_buffer}) {
714     $error = 1;
715     } else {
716     ## NOTE: No further input.
717     redo A;
718     }
719 wakaba 1.1 }
720     }
721    
722 wakaba 1.14 if ($error) {
723     my $r = substr $self->{byte_buffer}, 0, 1, '';
724 wakaba 1.15 my $fallback;
725 wakaba 1.14 my $etype = 'illegal-octets-error';
726 wakaba 1.15 my %earg;
727     if ($self->{category}
728 wakaba 1.16 & Message::Charset::Info::CHARSET_CATEGORY_SJIS) {
729     if ($r =~ /^[\x81-\x9F\xE0-\xFC]/) {
730     if ($self->{byte_buffer} =~ s/(.)//s) {
731     $r .= $1; # not limited to \x40-\xFC - \x7F
732     $etype = 'unassigned-code-point-error';
733     }
734     ## NOTE: Range [\xF0-\xFC] is unassigned and may be used as a
735     ## single-byte character or as the first-byte of a double-byte
736     ## character, according to JIS X 0208:1997 Appendix 1. However, the
737     ## current practice is using the range as first-bytes of double-byte
738     ## characters.
739     } elsif ($r =~ /^[\x80\xA0\xFD-\xFF]/) {
740     $etype = 'unassigned-code-point-error';
741     }
742     } elsif ($self->{category}
743     & Message::Charset::Info::CHARSET_CATEGORY_EUCJP) {
744 wakaba 1.15 if ($r =~ /^[\xA1-\xFE]/) {
745     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
746     $r .= $1;
747     $etype = 'unassigned-code-point-error';
748     }
749     } elsif ($r eq "\x8F") {
750     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE][\xA1-\xFE]?)//) {
751     $r .= $1;
752     $etype = 'unassigned-code-point-error' if length $1 == 2;
753     }
754     } elsif ($r eq "\x8E") {
755     if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
756     $r .= $1;
757     $etype = 'unassigned-code-point-error';
758     }
759     } elsif ($r eq "\xA0" or $r eq "\xFF") {
760 wakaba 1.14 $etype = 'unassigned-code-point-error';
761     }
762 wakaba 1.15 } else {
763     $fallback = $self->{fallback}->{$r};
764     if (defined $fallback) {
765     ## NOTE: This is an HTML5 parse error.
766     $etype = 'fallback-char-error';
767     $earg{char} = \$fallback;
768     } elsif (exists $self->{fallback}->{$r}) {
769     ## NOTE: This is an HTML5 parse error. In addition, the octet
770     ## is not assigned with a character.
771     $etype = 'fallback-unassigned-error';
772 wakaba 1.14 }
773 wakaba 1.1 }
774 wakaba 1.15
775 wakaba 1.14 ## NOTE: Fixup line/column number by counting the number of
776     ## lines/columns in the string that is to be retuend by this
777     ## method call.
778     my $line_diff = 0;
779     my $col_diff = 0;
780     my $set_col;
781     for (my $i = 0; $i < $count; $i++) {
782     my $s = substr $_[1], $i - $count, 1;
783     if ($s eq "\x0D") {
784     $line_diff++;
785     $col_diff = 0;
786     $set_col = 1;
787     $i++ if substr ($_[1], $i - $count + 1, 1) eq "\x0A";
788     } elsif ($s eq "\x0A") {
789     $line_diff++;
790     $col_diff = 0;
791     $set_col = 1;
792     } else {
793     $col_diff++;
794     }
795     }
796     my $i = $self->{char_buffer_pos};
797     if ($count and substr (${$self->{char_buffer}}, -1, 1) eq "\x0D") {
798     if (substr (${$self->{char_buffer}}, $i, 1) eq "\x0A") {
799     $i++;
800     }
801 wakaba 1.1 }
802 wakaba 1.14 my $cb_length = length ${$self->{char_buffer}};
803     for (; $i < $cb_length; $i++) {
804     my $s = substr $_[1], $i, 1;
805     if ($s eq "\x0D") {
806     $line_diff++;
807     $col_diff = 0;
808     $set_col = 1;
809     $i++ if substr ($_[1], $i + 1, 1) eq "\x0A";
810     } elsif ($s eq "\x0A") {
811     $line_diff++;
812     $col_diff = 0;
813     $set_col = 1;
814     } else {
815     $col_diff++;
816     }
817 wakaba 1.1 }
818 wakaba 1.15 $self->{onerror}->($self, $etype, octets => \$r, %earg,
819 wakaba 1.14 level => $self->{level}->{$self->{error_level}->{$etype}},
820     line_diff => $line_diff,
821     ($set_col ? (column => 1) : ()),
822     column_diff => $col_diff);
823     ## NOTE: Error handler may modify |octets| parameter, which
824     ## would be returned as part of the output. Note that what
825     ## is returned would affect what |manakai_read_until| returns.
826 wakaba 1.15 ${$self->{char_buffer}} .= defined $fallback ? $fallback : $r;
827 wakaba 1.1 }
828    
829 wakaba 1.14 redo A;
830     } # A
831 wakaba 1.9 } # read
832    
833 wakaba 1.15 sub manakai_read_until ($$$;$) {
834     #my ($self, $scalar, $pattern, $offset) = @_;
835     my $self = $_[0];
836     my $s = '';
837     $self->read ($s, 255);
838     if ($s =~ /^(?>$_[2])+/) {
839     my $rem_length = (length $s) - $+[0];
840     if ($rem_length) {
841     if ($self->{char_buffer_pos} > $rem_length) {
842     $self->{char_buffer_pos} -= $rem_length;
843     } else {
844     substr (${$self->{char_buffer}}, 0, $self->{char_buffer_pos})
845     = substr ($s, $+[0]);
846     $self->{char_buffer_pos} = 0;
847     }
848     }
849     substr ($_[1], $_[3]) = substr ($s, $-[0], $+[0] - $-[0]);
850     return $+[0];
851     } elsif (length $s) {
852     if ($self->{char_buffer_pos} > length $s) {
853     $self->{char_buffer_pos} -= length $s;
854     } else {
855     substr (${$self->{char_buffer}}, 0, $self->{char_buffer_pos}) = $s;
856     $self->{char_buffer_pos} = 0;
857     }
858     }
859     return 0;
860     } # manakai_read_until
861    
862     sub has_bom ($) { $_[0]->{has_bom} }
863    
864     sub input_encoding ($) {
865     my $v = $_[0]->{input_encoding};
866     return $v if defined $v;
867    
868     my $uri = $_[0]->{charset};
869     if (defined $uri) {
870     return Whatpm::Charset::DecodeHandle->uri_to_name (xml => $uri);
871     }
872    
873     return undef;
874     } # input_encoding
875    
876     sub onerror ($;$) {
877     if (@_ > 1) {
878 wakaba 1.18 if ($_[1]) {
879     $_[0]->{onerror} = $_[1];
880     $_[0]->{onerror_set} = 1;
881     } else {
882     $_[0]->{onerror} = sub { };
883     delete $_[0]->{onerror_set};
884     }
885 wakaba 1.15 }
886    
887 wakaba 1.18 return $_[0]->{onerror_set} ? $_[0]->{onerror} : undef;
888 wakaba 1.15 } # onerror
889    
890     sub ungetc ($$) {
891     unshift @{$_[0]->{character_queue}}, chr int ($_[1] or 0);
892     } # ungetc
893    
894 wakaba 1.1 package Whatpm::Charset::DecodeHandle::ISO2022JP;
895     push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode';
896    
897     sub getc ($) {
898     my $self = $_[0];
899     return shift @{$self->{character_queue}} if @{$self->{character_queue}};
900    
901     my $r;
902     A: {
903     my $error;
904     if ($self->{continue}) {
905 wakaba 1.3 if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
906     length $self->{byte_buffer})) {
907 wakaba 1.1 #
908     } else {
909     $error = 1;
910     }
911     $self->{continue} = 0;
912     } elsif (512 > length $self->{byte_buffer}) {
913 wakaba 1.3 $self->{filehandle}->read ($self->{byte_buffer}, 256,
914     length $self->{byte_buffer});
915 wakaba 1.1 }
916    
917     unless ($error) {
918     if ($self->{byte_buffer} =~ s/^\x1B(\x24[\x40\x42]|\x28[\x42\x4A])//) {
919     $self->{state} = {
920     "\x24\x40" => 'state_2440',
921     "\x24\x42" => 'state_2442',
922     "\x28\x42" => 'state_2842',
923     "\x28\x4A" => 'state_284A',
924     }->{$1};
925     redo A;
926     } elsif ($self->{state} eq 'state_2842') { # IRV
927     if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
928     push @{$self->{character_queue}}, split //, $1;
929     $r = shift @{$self->{character_queue}};
930     } else {
931     if (length $self->{byte_buffer}) {
932     $error = 1;
933     } else {
934     $r = undef;
935     }
936     }
937     } elsif ($self->{state} eq 'state_284A') { # 0201
938     if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) {
939     my $v = $1;
940     $v =~ tr/\x5C\x7E/\xA5\x{203E}/;
941     push @{$self->{character_queue}}, split //, $v;
942     $r = shift @{$self->{character_queue}};
943     } else {
944     if (length $self->{byte_buffer}) {
945     $error = 1;
946     } else {
947     $r = undef;
948     $self->{onerror}->($self, 'invalid-state-error',
949 wakaba 1.7 state => $self->{state},
950     level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}});
951 wakaba 1.1 }
952     }
953     } elsif ($self->{state} eq 'state_2442') { # 1983
954     my $v = Encode::decode ($self->{state_2442},
955     $self->{byte_buffer},
956     Encode::FB_QUIET ());
957     if (length $v) {
958     push @{$self->{character_queue}}, split //, $v;
959     $r = shift @{$self->{character_queue}};
960     } else {
961     if (length $self->{byte_buffer}) {
962     $error = 1;
963     } else {
964     $r = undef;
965     $self->{onerror}->($self, 'invalid-state-error',
966 wakaba 1.7 state => $self->{state},
967     level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}});
968 wakaba 1.1 }
969     }
970     } elsif ($self->{state} eq 'state_2440') { # 1978
971     my $v = Encode::decode ($self->{state_2440},
972     $self->{byte_buffer},
973     Encode::FB_QUIET ());
974     if (length $v) {
975     push @{$self->{character_queue}}, split //, $v;
976     $r = shift @{$self->{character_queue}};
977     } else {
978     if (length $self->{byte_buffer}) {
979     $error = 1;
980     } else {
981     $r = undef;
982     $self->{onerror}->($self, 'invalid-state-error',
983 wakaba 1.7 state => $self->{state},
984     level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}});
985 wakaba 1.1 }
986     }
987     } else {
988     $error = 1;
989     }
990     }
991    
992     if ($error) {
993     $r = substr $self->{byte_buffer}, 0, 1, '';
994     my $etype = 'illegal-octets-error';
995     if (($self->{state} eq 'state_2442' or
996     $self->{state} eq 'state_2440') and
997     $r =~ /^[\x21-\x7E]/ and
998     $self->{byte_buffer} =~ s/^([\x21-\x7E])//) {
999     $r .= $1;
1000     $etype = 'unassigned-code-point-error';
1001     } elsif ($r eq "\x1B" and
1002     $self->{byte_buffer} =~ s/^\(H//) { # Old 0201
1003     $r .= "(H";
1004     $self->{state} = 'state_284A';
1005     }
1006 wakaba 1.7 $self->{onerror}->($self, $etype, octets => \$r,
1007     level => $self->{level}->{$self->{error_level}->{$etype}});
1008 wakaba 1.1 }
1009     } # A
1010    
1011     return $r;
1012     } # getc
1013    
1014 wakaba 1.9 ## TODO: This is not good for performance. Should be replaced
1015     ## by read-centric implementation.
1016     sub read ($$$;$) {
1017     #my ($self, $scalar, $length, $offset) = @_;
1018     my $length = $_[2];
1019     my $r = '';
1020     while ($length > 0) {
1021     my $c = $_[0]->getc;
1022     last unless defined $c;
1023     $r .= $c;
1024     $length--;
1025     }
1026     substr ($_[1], $_[3]) = $r;
1027     ## NOTE: This would do different thing from what Perl's |read| do
1028     ## if $offset points beyond the end of the $scalar.
1029     return length $r;
1030     } # read
1031    
1032 wakaba 1.12 sub manakai_read_until ($$$;$) {
1033     #my ($self, $scalar, $pattern, $offset) = @_;
1034     my $self = $_[0];
1035 wakaba 1.11 my $c = $self->getc;
1036 wakaba 1.12 if ($c =~ /^$_[2]/) {
1037     substr ($_[1], $_[3]) = $c;
1038     return 1;
1039 wakaba 1.11 } elsif (defined $c) {
1040     $self->ungetc (ord $c);
1041 wakaba 1.12 return 0;
1042 wakaba 1.11 } else {
1043 wakaba 1.12 return 0;
1044 wakaba 1.11 }
1045 wakaba 1.12 } # manakai_read_until
1046 wakaba 1.11
1047 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us-ascii'} =
1048     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us'} =
1049     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso646-us'} =
1050     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:cp367'} =
1051     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ibm367'} =
1052     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1986'} =
1053     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1968'} =
1054     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-ir-6'} =
1055     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:csascii'} =
1056     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_646.irv:1991'} =
1057     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ascii'} = {ascii8 =>
1058     '1',
1059     is_block_safe =>
1060     '1',
1061     ietf_name =>
1062     ['ansi_x3.4-1968',
1063     'ansi_x3.4-1986',
1064     'ascii',
1065     'cp367',
1066     'csascii',
1067     'ibm367',
1068     'iso-ir-6',
1069     'iso646-us',
1070     'iso_646.irv:1991',
1071     'us',
1072     'us-ascii',
1073     'us-ascii'],
1074     mime_name =>
1075     'us-ascii',
1076     perl_name =>
1077     ['ascii',
1078     'iso-646-us',
1079     'us-ascii'],
1080     utf8_encoding_scheme =>
1081     '1',
1082     'uri',
1083     {'urn:x-suika-fam-cx:charset:ansi_x3.4-1968',
1084     '1',
1085     'urn:x-suika-fam-cx:charset:ansi_x3.4-1986',
1086     '1',
1087     'urn:x-suika-fam-cx:charset:ascii',
1088     '1',
1089     'urn:x-suika-fam-cx:charset:cp367',
1090     '1',
1091     'urn:x-suika-fam-cx:charset:csascii',
1092     '1',
1093     'urn:x-suika-fam-cx:charset:ibm367',
1094     '1',
1095     'urn:x-suika-fam-cx:charset:iso-ir-6',
1096     '1',
1097     'urn:x-suika-fam-cx:charset:iso646-us',
1098     '1',
1099     'urn:x-suika-fam-cx:charset:iso_646.irv:1991',
1100     '1',
1101     'urn:x-suika-fam-cx:charset:us',
1102     '1',
1103     'urn:x-suika-fam-cx:charset:us-ascii',
1104     '1'},
1105 wakaba 1.2 };
1106    
1107 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl'} = {perl_name =>
1108     ['ascii-ctrl'],
1109     'uri',
1110     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl',
1111     '1'}};
1112     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null'} = {perl_name =>
1113     ['null'],
1114     'uri',
1115     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null',
1116     '1'}};
1117     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8'} = {ascii8 =>
1118     '1',
1119     bom_allowed =>
1120     '1',
1121     no_bom_variant =>
1122     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1123     utf8_encoding_scheme =>
1124     '1',
1125     'uri',
1126     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8',
1127     '1'},
1128 wakaba 1.2 xml_name => 'UTF-8',
1129     };
1130    
1131 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279'} = {ascii8 =>
1132     '1',
1133     bom_allowed =>
1134     '1',
1135     no_bom_variant =>
1136     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1137     utf8_encoding_scheme =>
1138     '1',
1139     'uri',
1140     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279',
1141     '1'}};
1142     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8'} = {
1143     ascii8 => 1,
1144     is_block_safe =>
1145     '1',
1146     perl_name =>
1147     ['utf-8'],
1148     utf8_encoding_scheme =>
1149     '1',
1150     'uri',
1151     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
1152     '1'}};
1153     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-8'} = {
1154     ascii8 => 1,
1155     bom_allowed =>
1156     '1',
1157     no_bom_variant =>
1158     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8',
1159     ietf_name =>
1160     ['utf-8'],
1161     mime_name =>
1162     'utf-8',
1163     utf8_encoding_scheme =>
1164     '1',
1165     'uri',
1166     {'urn:x-suika-fam-cx:charset:utf-8',
1167     '1'},
1168 wakaba 1.2 };
1169    
1170 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8'} = {ascii8 =>
1171     '1',
1172     is_block_safe =>
1173     '1',
1174     perl_name =>
1175     ['utf8'],
1176     utf8_encoding_scheme =>
1177     '1',
1178     'uri',
1179     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8',
1180     '1'}};
1181     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16'} = {
1182     ascii16 => 1,
1183     bom_allowed =>
1184     '1',
1185     bom_required =>
1186     '1',
1187     no_bom_variant =>
1188     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1189     no_bom_variant16be =>
1190     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1191     no_bom_variant16le =>
1192     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1193     perl_name =>
1194     ['utf-16'],
1195     'uri',
1196     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16',
1197     '1'},
1198 wakaba 1.2 xml_name => 'UTF-16',
1199     };
1200    
1201 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16'} = {
1202     ascii16 => 1,
1203     bom_allowed =>
1204     '1',
1205     no_bom_variant =>
1206     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1207     no_bom_variant16be =>
1208     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1209     no_bom_variant16le =>
1210     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1211     ietf_name =>
1212     ['utf-16'],
1213     mime_name =>
1214     'utf-16',
1215     'uri',
1216     {'urn:x-suika-fam-cx:charset:utf-16',
1217     '1'},
1218 wakaba 1.2 };
1219    
1220 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16be'} = {
1221     ascii16 => 1,
1222     ascii16be => 1,
1223     bom_allowed =>
1224     '1',
1225     no_bom_variant =>
1226     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1227     no_bom_variant16be =>
1228     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1229     ietf_name =>
1230     ['utf-16be'],
1231     mime_name =>
1232     'utf-16be',
1233     'uri',
1234     {'urn:x-suika-fam-cx:charset:utf-16be',
1235     '1'},
1236 wakaba 1.2 };
1237    
1238 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16le'} = {
1239     ascii16 => 1,
1240     ascii16le => 1,
1241     bom_allowed =>
1242     '1',
1243     no_bom_variant =>
1244     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1245     no_bom_variant16le =>
1246     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1247     ietf_name =>
1248     ['utf-16le'],
1249     mime_name =>
1250     'utf-16le',
1251     'uri',
1252     {'urn:x-suika-fam-cx:charset:utf-16le',
1253     '1'},
1254 wakaba 1.2 };
1255    
1256 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be'} = {
1257     ascii16 => 1,
1258     ascii16be => 1,
1259     is_block_safe =>
1260     '1',
1261     perl_name =>
1262     ['utf-16be'],
1263     'uri',
1264     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be',
1265     '1'}};
1266     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le'} = {
1267     ascii16 => 1,
1268     ascii16le => 1,
1269     is_block_safe =>
1270     '1',
1271     perl_name =>
1272     ['utf-16le'],
1273     'uri',
1274     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le',
1275     '1'}};
1276     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-2'} = {
1277     ascii16 => 1,
1278     bom_allowed =>
1279     '1',
1280     no_bom_variant =>
1281     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1282     no_bom_variant16be =>
1283     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1284     no_bom_variant16le =>
1285     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1286     ietf_name =>
1287     ['csunicode',
1288     'iso-10646-ucs-2'],
1289     mime_name =>
1290     'iso-10646-ucs-2',
1291     'uri',
1292     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2',
1293     '1',
1294     'urn:x-suika-fam-cx:charset:iso-10646-ucs-2',
1295     '1'},
1296 wakaba 1.2 xml_name => 'ISO-10646-UCS-2',
1297     };
1298    
1299 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be'} = {
1300     ascii16 => 1,
1301     ascii16be => 1,
1302     is_block_safe =>
1303     '1',
1304     perl_name =>
1305     ['ucs-2be'],
1306     'uri',
1307     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be',
1308     '1'}};
1309     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le'} = {
1310     ascii16 => 1,
1311     ascii16le => 1,
1312     is_block_safe =>
1313     '1',
1314     perl_name =>
1315     ['ucs-2le'],
1316     'uri',
1317     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le',
1318     '1'}};
1319     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-4'} = {
1320     ascii32 => 1,
1321     bom_allowed =>
1322     '1',
1323     no_bom_variant =>
1324     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1325     no_bom_variant32endian1234 =>
1326     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1327     no_bom_variant32endian4321 =>
1328     'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1329     ietf_name =>
1330     ['csucs4',
1331     'iso-10646-ucs-4'],
1332     mime_name =>
1333     'iso-10646-ucs-4',
1334     'uri',
1335     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4',
1336     '1',
1337     'urn:x-suika-fam-cx:charset:iso-10646-ucs-4',
1338     '1'},
1339 wakaba 1.2 xml_name => 'ISO-10646-UCS-4',
1340     };
1341    
1342 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be'} = {
1343     ascii32 => 1,
1344     ascii32endian1234 => 1,
1345     is_block_safe =>
1346     '1',
1347     perl_name =>
1348     ['ucs-4be',
1349     'utf-32be'],
1350     'uri',
1351     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be',
1352     '1'}};
1353     $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le'} = {
1354     ascii32 => 1,
1355     ascii32endian4321 => 1,
1356     is_block_safe =>
1357     '1',
1358     perl_name =>
1359     ['ucs-4le',
1360     'utf-32le'],
1361     'uri',
1362     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le',
1363     '1'}};
1364     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_8859-1:1987'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1'} = {ascii8 =>
1365     '1',
1366     is_block_safe =>
1367     '1',
1368     ietf_name =>
1369     ['cp819',
1370     'csisolatin1',
1371     'ibm819',
1372     'iso-8859-1',
1373     'iso-8859-1',
1374     'iso-ir-100',
1375     'iso_8859-1',
1376     'iso_8859-1:1987',
1377     'l1',
1378     'latin1'],
1379     mime_name =>
1380     'iso-8859-1',
1381     perl_name =>
1382     ['iso-8859-1',
1383     'latin1'],
1384     'uri',
1385     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1',
1386     '1',
1387     'urn:x-suika-fam-cx:charset:iso_8859-1:1987',
1388     '1'},
1389 wakaba 1.2 xml_name => 'ISO-8859-1',
1390     };
1391    
1392 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2'} = {ascii8 =>
1393     '1',
1394     is_block_safe =>
1395     '1',
1396     'uri',
1397     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2',
1398     '1'},
1399 wakaba 1.2 xml_name => 'ISO-8859-2',
1400     };
1401    
1402 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3'} = {ascii8 =>
1403     '1',
1404     is_block_safe =>
1405     '1',
1406     'uri',
1407     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3',
1408     '1'},
1409 wakaba 1.2 xml_name => 'ISO-8859-3',
1410     };
1411    
1412 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4'} = {ascii8 =>
1413     '1',
1414     is_block_safe =>
1415     '1',
1416     'uri',
1417     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4',
1418     '1'},
1419 wakaba 1.2 xml_name => 'ISO-8859-4',
1420     };
1421    
1422 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5'} = {ascii8 =>
1423     '1',
1424     is_block_safe =>
1425     '1',
1426     'uri',
1427     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5',
1428     '1'},
1429 wakaba 1.2 xml_name => 'ISO-8859-5',
1430     };
1431    
1432 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6'} = {ascii8 =>
1433     '1',
1434     is_block_safe =>
1435     '1',
1436     'uri',
1437     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6',
1438     '1'},
1439 wakaba 1.2 xml_name => 'ISO-8859-6',
1440     };
1441    
1442 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7'} = {ascii8 =>
1443     '1',
1444     is_block_safe =>
1445     '1',
1446     'uri',
1447     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7',
1448     '1'},
1449 wakaba 1.2 xml_name => 'ISO-8859-7',
1450     };
1451    
1452 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8'} = {ascii8 =>
1453     '1',
1454     is_block_safe =>
1455     '1',
1456     'uri',
1457     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8',
1458     '1'},
1459 wakaba 1.2 xml_name => 'ISO-8859-8',
1460     };
1461    
1462 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9'} = {ascii8 =>
1463     '1',
1464     is_block_safe =>
1465     '1',
1466     'uri',
1467     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9',
1468     '1'},
1469 wakaba 1.2 xml_name => 'ISO-8859-9',
1470     };
1471    
1472 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10'} = {ascii8 =>
1473     '1',
1474     is_block_safe =>
1475     '1',
1476     'uri',
1477     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10',
1478     '1'},
1479 wakaba 1.2 xml_name => 'ISO-8859-10',
1480     };
1481    
1482 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11'} = {ascii8 =>
1483     '1',
1484     is_block_safe =>
1485     '1',
1486     'uri',
1487     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11',
1488     '1'},
1489 wakaba 1.2 xml_name => 'ISO-8859-11',
1490     };
1491    
1492 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13'} = {ascii8 =>
1493     '1',
1494     is_block_safe =>
1495     '1',
1496     'uri',
1497     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13',
1498     '1'},
1499 wakaba 1.2 xml_name => 'ISO-8859-13',
1500     };
1501    
1502 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14'} = {ascii8 =>
1503     '1',
1504     is_block_safe =>
1505     '1',
1506     'uri',
1507     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14',
1508     '1'},
1509 wakaba 1.2 xml_name => 'ISO-8859-14',
1510     };
1511    
1512 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15'} = {ascii8 =>
1513     '1',
1514     is_block_safe =>
1515     '1',
1516     'uri',
1517     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15',
1518     '1'},
1519 wakaba 1.2 xml_name => 'ISO-8859-15',
1520     };
1521    
1522 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16'} = {ascii8 =>
1523     '1',
1524     is_block_safe =>
1525     '1',
1526     'uri',
1527     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16',
1528     '1'},
1529 wakaba 1.2 xml_name => 'ISO-8859-16',
1530     };
1531    
1532 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp'} = {ascii8 =>
1533     '1',
1534     'uri',
1535     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp',
1536     '1'},
1537 wakaba 1.2 xml_name => 'ISO-2022-JP',
1538     };
1539    
1540 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-2022-jp'} = {ascii8 =>
1541     '1',
1542     ietf_name =>
1543     ['csiso2022jp',
1544     'iso-2022-jp',
1545     'iso-2022-jp'],
1546     mime_name =>
1547     'iso-2022-jp',
1548     'uri',
1549     {'urn:x-suika-fam-cx:charset:iso-2022-jp',
1550     '1'},
1551 wakaba 1.2 };
1552    
1553 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp'} = {ascii8 =>
1554     '1',
1555     perl_name =>
1556     ['iso-2022-jp'],
1557     'uri',
1558     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp',
1559     '1'}};
1560     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:shift_jis'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis'} = {ascii8 =>
1561     '1',
1562     is_block_safe =>
1563     '1',
1564     ietf_name =>
1565     ['csshiftjis',
1566     'ms_kanji',
1567     'shift_jis',
1568     'shift_jis'],
1569     mime_name =>
1570     'shift_jis',
1571     perl_name =>
1572     ['shift-jis-1997'],
1573     'uri',
1574     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis',
1575     '1',
1576     'urn:x-suika-fam-cx:charset:shift_jis',
1577     '1'},
1578 wakaba 1.2 xml_name => 'Shift_JIS',
1579     };
1580    
1581 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis'} = {ascii8 =>
1582     '1',
1583     is_block_safe =>
1584     '1',
1585     perl_name =>
1586     ['shiftjis',
1587     'sjis'],
1588     'uri',
1589     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis',
1590     '1'}};
1591     $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:euc-jp'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese'} = {ascii8 =>
1592     '1',
1593     is_block_safe =>
1594     '1',
1595     ietf_name =>
1596     ['cseucpkdfmtjapanese',
1597     'euc-jp',
1598     'euc-jp',
1599     'extended_unix_code_packed_format_for_japanese'],
1600     mime_name =>
1601     'euc-jp',
1602     perl_name =>
1603     ['euc-jp-1997'],
1604     'uri',
1605     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp',
1606     '1',
1607     'urn:x-suika-fam-cx:charset:euc-jp',
1608     '1',
1609     'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese',
1610     '1'},
1611 wakaba 1.2 xml_name => 'EUC-JP',
1612     };
1613    
1614 wakaba 1.1 $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp'} = {ascii8 =>
1615     '1',
1616     is_block_safe =>
1617     '1',
1618     perl_name =>
1619     ['euc-jp',
1620     'ujis'],
1621     'uri',
1622     {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp',
1623     '1'}};
1624    
1625     1;
1626 wakaba 1.18 ## $Date: 2008/09/14 11:57:41 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24