package Whatpm::Charset::DecodeHandle; use strict; ## NOTE: |Message::Charset::Info| uses this module without calling ## the constructor. use Message::Charset::Info; my $XML_AUTO_CHARSET = q; my $IANA_CHARSET = q; my $PERL_CHARSET = q; my $XML_CHARSET = q; ## ->create_decode_handle ($charset_uri, $byte_stream, $onerror) sub create_decode_handle ($$$;$) { my $csdef = $Whatpm::Charset::CharsetDef->{$_[1]}; my $obj = { category => 0, char_buffer => \(my $s = ''), char_buffer_pos => 0, character_queue => [], filehandle => $_[2], charset => $_[1], byte_buffer => '', onerror => $_[3] || sub {}, #onerror_set }; if ($csdef->{uri}->{$XML_AUTO_CHARSET} or $obj->{charset} eq $XML_AUTO_CHARSET) { my $b = ''; # UTF-8 w/o BOM $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; if (read $obj->{filehandle}, $b, 256) { no warnings "substr"; no warnings "uninitialized"; if (substr ($b, 0, 1) eq "<") { if (substr ($b, 1, 1) eq "?") { # ASCII8 if ($b =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $Whatpm::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii8} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; } if (defined $csdef->{no_bom_variant}) { $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}}; } } elsif (substr ($b, 1, 1) eq "\x00") { if (substr ($b, 2, 2) eq "?\x00") { # ASCII16LE my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $Whatpm::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii16} or $csdef->{ascii16be} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; } if (defined $csdef->{no_bom_variant16le}) { $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}}; } } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321 my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $Whatpm::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii32} or $csdef->{ascii32endian1234} or $csdef->{ascii32endian2143} or $csdef->{ascii32endian3412} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; } if (defined $csdef->{no_bom_variant32endian4321}) { $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}}; } } } } elsif (substr ($b, 0, 3) eq "\xEF\xBB\xBF") { # UTF8 $obj->{has_bom} = 1; substr ($b, 0, 3) = ''; my $c = $b; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $Whatpm::Charset::CharsetDef->{$uri}; if (not $csdef->{utf8_encoding_scheme} or not $csdef->{bom_allowed}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; } if (defined $csdef->{no_bom_variant}) { $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant}}; } } elsif (substr ($b, 0, 2) eq "\x00<") { if (substr ($b, 2, 2) eq "\x00?") { # ASCII16BE my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $Whatpm::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii16} or $csdef->{ascii16le} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; } if (defined $csdef->{no_bom_variant16be}) { $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}}; } } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412 my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $Whatpm::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii32} or $csdef->{ascii32endian1234} or $csdef->{ascii32endian2143} or $csdef->{ascii32endian4321} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; } if (defined $csdef->{no_bom_variant32endian3412}) { $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}}; } } } elsif (substr ($b, 0, 2) eq "\xFE\xFF") { if (substr ($b, 2, 2) eq "\x00<") { # ASCII16BE $obj->{has_bom} = 1; substr ($b, 0, 2) = ''; my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $Whatpm::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii16} or $csdef->{ascii16le} or not $csdef->{bom_allowed}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'}; $obj->{input_encoding} = 'UTF-16'; } if (defined $csdef->{no_bom_variant16be}) { $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16be}}; } } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412 $obj->{has_bom} = 1; substr ($b, 0, 4) = ''; my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $Whatpm::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii32} or $csdef->{ascii32endian1234} or $csdef->{ascii32endian2143} or $csdef->{ascii32endian4321} or not $csdef->{bom_allowed}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'}; $obj->{input_encoding} = 'UTF-16'; $obj->{byte_buffer} .= "\x00\x00"; } if (defined $csdef->{no_bom_variant32endian3412}) { $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}}; } } else { $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'}; $obj->{input_encoding} = 'UTF-16'; substr ($b, 0, 2) = ''; $obj->{has_bom} = 1; } } elsif (substr ($b, 0, 2) eq "\xFF\xFE") { if (substr ($b, 2, 2) eq "<\x00") { # ASCII16LE $obj->{has_bom} = 1; substr ($b, 0, 2) = ''; my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $Whatpm::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii16} or $csdef->{ascii16be} or not $csdef->{bom_allowed}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'}; $obj->{input_encoding} = 'UTF-16'; } if (defined $csdef->{no_bom_variant16le}) { $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant16le}}; } } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321 $obj->{has_bom} = 1; substr ($b, 0, 4) = ''; my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $Whatpm::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii32} or $csdef->{ascii32endian1234} or $csdef->{ascii32endian2143} or $csdef->{ascii32endian3412} or not $csdef->{bom_allowed}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'}; $obj->{input_encoding} = 'UTF-16'; $obj->{byte_buffer} .= "\x00\x00"; } if (defined $csdef->{no_bom_variant32endian4321}) { $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}}; } } else { $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'}; $obj->{input_encoding} = 'UTF-16'; substr ($b, 0, 2) = ''; $obj->{has_bom} = 1; } } elsif (substr ($b, 0, 2) eq "\x00\x00") { if (substr ($b, 2, 2) eq "\x00<") { # ASCII32Endian1234 my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $Whatpm::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii32} or $csdef->{ascii32endian2143} or $csdef->{ascii32endian3412} or $csdef->{ascii32endian4321} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; } if (defined $csdef->{no_bom_variant32endian1234}) { $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}}; } } elsif (substr ($b, 2, 2) eq "<\x00") { # ASCII32Endian2143 my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $Whatpm::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii32} or $csdef->{ascii32endian1234} or $csdef->{ascii32endian3412} or $csdef->{ascii32endian4321} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; } if (defined $csdef->{no_bom_variant32endian2143}) { $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}}; } } elsif (substr ($b, 2, 2) eq "\xFE\xFF") { # ASCII32Endian1234 $obj->{has_bom} = 1; substr ($b, 0, 4) = ''; my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $Whatpm::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii32} or $csdef->{ascii32endian2143} or $csdef->{ascii32endian3412} or $csdef->{ascii32endian4321} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; $obj->{has_bom} = 0; $obj->{byte_buffer} .= "\x00\x00\xFE\xFF"; } if (defined $csdef->{no_bom_variant32endian1234}) { $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}}; } } elsif (substr ($b, 2, 2) eq "\xFF\xFE") { # ASCII32Endian2143 $obj->{has_bom} = 1; substr ($b, 0, 4) = ''; my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $Whatpm::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii32} or $csdef->{ascii32endian1234} or $csdef->{ascii32endian3412} or $csdef->{ascii32endian4321} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; $obj->{has_bom} = 0; $obj->{byte_buffer} .= "\x00\x00\xFF\xFE"; } if (defined $csdef->{no_bom_variant32endian2143}) { $csdef = $Whatpm::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}}; } } # \x4C\x6F\xA7\x94 EBCDIC } # buffer $obj->{byte_buffer} .= $b; } # read } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-8'}) { ## BOM is optional. my $b = ''; if (read $obj->{filehandle}, $b, 3) { if ($b eq "\xEF\xBB\xBF") { $obj->{has_bom} = 1; } else { $obj->{byte_buffer} .= $b; } } $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; # UTF-8 w/o BOM } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-16'}) { ## BOM is mandated. my $b = ''; if (read $obj->{filehandle}, $b, 2) { if ($b eq "\xFE\xFF") { $obj->{has_bom} = 1; # UTF-16BE w/o BOM $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'}; } elsif ($b eq "\xFF\xFE") { $obj->{has_bom} = 1; # UTF-16LE w/o BOM $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'}; } else { $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset}); $obj->{has_bom} = 0; $obj->{byte_buffer} .= $b; # UTF-16BE w/o BOM $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'}; } } else { $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset}); $obj->{has_bom} = 0; # UTF-16BE w/o BOM $csdef = $Whatpm::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'}; } } if ($csdef->{uri}->{$XML_CHARSET.'iso-2022-jp'}) { $obj->{state_2440} = 'gl-jis-1997-swapped'; $obj->{state_2442} = 'gl-jis-1997'; $obj->{state} = 'state_2842'; require Encode::GLJIS1997Swapped; require Encode::GLJIS1997; if (Encode::find_encoding ($obj->{state_2440}) and Encode::find_encoding ($obj->{state_2442})) { return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP'; } } elsif ($csdef->{uri}->{$IANA_CHARSET.'iso-2022-jp'}) { $obj->{state_2440} = 'gl-jis-1978'; $obj->{state_2442} = 'gl-jis-1983'; $obj->{state} = 'state_2842'; require Encode::GLJIS1978; require Encode::GLJIS1983; if (Encode::find_encoding ($obj->{state_2440}) and Encode::find_encoding ($obj->{state_2442})) { return bless $obj, 'Whatpm::Charset::DecodeHandle::ISO2022JP'; } } elsif (defined $csdef->{perl_name}->[0]) { if ($csdef->{uri}->{$XML_CHARSET.'euc-jp'} or $csdef->{uri}->{$IANA_CHARSET.'euc-jp'}) { $obj->{perl_encoding_name} = $csdef->{perl_name}->[0]; require Encode::EUCJP1997; if (Encode::find_encoding ($obj->{perl_encoding_name})) { $obj->{category} |= Message::Charset::Info::CHARSET_CATEGORY_EUCJP; return bless $obj, 'Whatpm::Charset::DecodeHandle::Encode'; } } elsif ($csdef->{uri}->{$XML_CHARSET.'shift_jis'} or $csdef->{uri}->{$IANA_CHARSET.'shift_jis'}) { $obj->{perl_encoding_name} = $csdef->{perl_name}->[0]; require Encode::ShiftJIS1997; if (Encode::find_encoding ($obj->{perl_encoding_name})) { return bless $obj, 'Whatpm::Charset::DecodeHandle::Encode'; } } elsif ($csdef->{is_block_safe}) { $obj->{perl_encoding_name} = $csdef->{perl_name}->[0]; require Encode; if (Encode::find_encoding ($obj->{perl_encoding_name})) { return bless $obj, 'Whatpm::Charset::DecodeHandle::Encode'; } } } $obj->{onerror}->(undef, 'charset-not-supported-error', charset_uri => $obj->{charset}); return undef; } # create_decode_handle sub name_to_uri ($$$) { my $domain = $_[1]; my $name = lc $_[2]; if ($domain eq 'ietf') { return $IANA_CHARSET . $name; } elsif ($domain eq 'xml') { if ({ 'utf-8' => 1, 'utf-16' => 1, 'iso-10646-ucs-2' => 1, 'iso-10646-ucs-4' => 1, 'iso-8859-1' => 1, 'iso-8859-2' => 1, 'iso-8859-3' => 1, 'iso-8859-4' => 1, 'iso-8859-5' => 1, 'iso-8859-6' => 1, 'iso-8859-7' => 1, 'iso-8859-8' => 1, 'iso-8859-9' => 1, 'iso-8859-10' => 1, 'iso-8859-11' => 1, 'iso-8859-13' => 1, 'iso-8859-14' => 1, 'iso-8859-15' => 1, 'iso-8859-16' => 1, 'iso-2022-jp' => 1, 'shift_jis' => 1, 'euc-jp' => 1, }->{$name}) { return $XML_CHARSET . $name; } my $uri = $IANA_CHARSET . $name; return $uri if $Whatpm::Charset::CharsetDef->{$uri}; return $XML_CHARSET . $name; } else { return undef; } } # name_to_uri sub uri_to_name ($$$) { my (undef, $domain, $uri) = @_; if ($domain eq 'xml') { my $v = $Whatpm::Charset::CharsetDef->{$uri}->{xml_name}; return $v if defined $v; if (substr ($uri, 0, length $XML_CHARSET) eq $XML_CHARSET) { return substr ($uri, length $XML_CHARSET); } $domain = 'ietf'; ## TODO: XML encoding name has smaller range } if ($domain eq 'ietf') { my $v = $Whatpm::Charset::CharsetDef->{$uri}->{iana_name}; return $v->[0] if defined $v; if (substr ($uri, 0, length $IANA_CHARSET) eq $IANA_CHARSET) { return substr ($uri, length $IANA_CHARSET); } } return undef; } # uri_to_name require IO::Handle; package Whatpm::Charset::DecodeHandle::ByteBuffer; ## NOTE: Provides a byte buffer wrapper object. sub new ($$) { my $self = bless { buffer => '', }, shift; $self->{filehandle} = shift; return $self; } # new sub read { my $self = shift; my $pos = length $self->{buffer}; my $r = $self->{filehandle}->read ($self->{buffer}, $_[1], $pos); substr ($_[0], $_[2]) = substr ($self->{buffer}, $pos); ## NOTE: This would do different behavior from Perl's standard ## |read| when $pos points beyond the end of the string. return $r; } # read sub close { $_[0]->{filehandle}->close } package Whatpm::Charset::DecodeHandle::CharString; ## NOTE: Same as Perl's standard |open $handle, '<', \$char_string|, ## but supports |ungetc| and other extensions. sub new ($$) { my $self = bless {pos => 0}, shift; $self->{string} = shift; # must be a scalar ref return $self; } # new sub getc ($) { my $self = shift; if ($self->{pos} < length ${$self->{string}}) { return substr ${$self->{string}}, $self->{pos}++, 1; } else { return undef; } } # getc sub read ($$$$) { #my ($self, $scalar, $length, $offset) = @_; my $self = $_[0]; my $length = $_[2] || 0; my $offset = $_[3]; ## NOTE: We don't support standard Perl semantics if $offset is ## greater than the length of $scalar. substr ($_[1], $offset) = substr (${$self->{string}}, $self->{pos}, $length); my $count = (length $_[1]) - $offset; $self->{pos} += $count; return $count; } # read sub manakai_read_until ($$$;$) { #my ($self, $scalar, $pattern, $offset) = @_; my $self = $_[0]; pos (${$self->{string}}) = $self->{pos}; if (${$self->{string}} =~ /\G(?>$_[2])+/) { substr ($_[1], $_[3]) = substr (${$self->{string}}, $-[0], $+[0] - $-[0]); $self->{pos} += $+[0] - $-[0]; return $+[0] - $-[0]; } else { return 0; } } # manakai_read_until sub ungetc ($$) { my $self = shift; ## Ignore second parameter. $self->{pos}-- if $self->{pos} > 0; } # ungetc sub close ($) { } sub onerror ($;$) { } package Whatpm::Charset::DecodeHandle::Encode; ## NOTE: Provides a Perl |Encode| module wrapper object. sub charset ($) { $_[0]->{charset} } sub close ($) { $_[0]->{filehandle}->close } sub getc ($) { my $c = ''; my $l = $_[0]->read ($c, 1); if ($l) { return $c; } else { return undef; } } # getc sub read ($$$;$) { my $self = $_[0]; #my $scalar = $_[1]; my $length = $_[2]; my $offset = $_[3] || 0; my $count = 0; my $eof; ## NOTE: It is incompatible with the standard Perl semantics ## if $offset is greater than the length of $scalar. A: { return $count if $length < 1; if (my $l = (length ${$self->{char_buffer}}) - $self->{char_buffer_pos}) { if ($l >= $length) { substr ($_[1], $offset) = substr (${$self->{char_buffer}}, $self->{char_buffer_pos}, $length); $count += $length; $self->{char_buffer_pos} += $length; $length = 0; return $count; } else { substr ($_[1], $offset) = substr (${$self->{char_buffer}}, $self->{char_buffer_pos}); $count += $l; $length -= $l; ${$self->{char_buffer}} = ''; $self->{char_buffer_pos} = 0; } $offset = length $_[1]; } if ($eof) { return $count; } my $error; if ($self->{continue}) { if ($self->{filehandle}->read ($self->{byte_buffer}, 256, length $self->{byte_buffer})) { # } else { $error = 1; } $self->{continue} = 0; } elsif (512 > length $self->{byte_buffer}) { if ($self->{filehandle}->read ($self->{byte_buffer}, 256, length $self->{byte_buffer})) { # } else { $eof = 1; } } unless ($error) { if (not $self->{bom_checked}) { if (defined $self->{bom_pattern}) { if ($self->{byte_buffer} =~ s/^$self->{bom_pattern}//) { $self->{has_bom} = 1; } } $self->{bom_checked} = 1; } my $string = Encode::decode ($self->{perl_encoding_name}, $self->{byte_buffer}, Encode::FB_QUIET ()); if (length $string) { $self->{char_buffer} = \$string; $self->{char_buffer_pos} = 0; if (length $self->{byte_buffer}) { $self->{continue} = 1; } } else { if (length $self->{byte_buffer}) { $error = 1; } else { ## NOTE: No further input. redo A; } } } if ($error) { my $r = substr $self->{byte_buffer}, 0, 1, ''; my $fallback; my $etype = 'illegal-octets-error'; my %earg; if ($self->{category} & Message::Charset::Info::CHARSET_CATEGORY_SJIS) { if ($r =~ /^[\x81-\x9F\xE0-\xFC]/) { if ($self->{byte_buffer} =~ s/(.)//s) { $r .= $1; # not limited to \x40-\xFC - \x7F $etype = 'unassigned-code-point-error'; } ## NOTE: Range [\xF0-\xFC] is unassigned and may be used as a ## single-byte character or as the first-byte of a double-byte ## character, according to JIS X 0208:1997 Appendix 1. However, the ## current practice is using the range as first-bytes of double-byte ## characters. } elsif ($r =~ /^[\x80\xA0\xFD-\xFF]/) { $etype = 'unassigned-code-point-error'; } } elsif ($self->{category} & Message::Charset::Info::CHARSET_CATEGORY_EUCJP) { if ($r =~ /^[\xA1-\xFE]/) { if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) { $r .= $1; $etype = 'unassigned-code-point-error'; } } elsif ($r eq "\x8F") { if ($self->{byte_buffer} =~ s/^([\xA1-\xFE][\xA1-\xFE]?)//) { $r .= $1; $etype = 'unassigned-code-point-error' if length $1 == 2; } } elsif ($r eq "\x8E") { if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) { $r .= $1; $etype = 'unassigned-code-point-error'; } } elsif ($r eq "\xA0" or $r eq "\xFF") { $etype = 'unassigned-code-point-error'; } } else { $fallback = $self->{fallback}->{$r}; if (defined $fallback) { ## NOTE: This is an HTML5 parse error. $etype = 'fallback-char-error'; $earg{char} = \$fallback; } elsif (exists $self->{fallback}->{$r}) { ## NOTE: This is an HTML5 parse error. In addition, the octet ## is not assigned with a character. $etype = 'fallback-unassigned-error'; } } ## NOTE: Fixup line/column number by counting the number of ## lines/columns in the string that is to be retuend by this ## method call. my $line_diff = 0; my $col_diff = 0; my $set_col; for (my $i = 0; $i < $count; $i++) { my $s = substr $_[1], $i - $count, 1; if ($s eq "\x0D") { $line_diff++; $col_diff = 0; $set_col = 1; $i++ if substr ($_[1], $i - $count + 1, 1) eq "\x0A"; } elsif ($s eq "\x0A") { $line_diff++; $col_diff = 0; $set_col = 1; } else { $col_diff++; } } my $i = $self->{char_buffer_pos}; if ($count and substr (${$self->{char_buffer}}, -1, 1) eq "\x0D") { if (substr (${$self->{char_buffer}}, $i, 1) eq "\x0A") { $i++; } } my $cb_length = length ${$self->{char_buffer}}; for (; $i < $cb_length; $i++) { my $s = substr $_[1], $i, 1; if ($s eq "\x0D") { $line_diff++; $col_diff = 0; $set_col = 1; $i++ if substr ($_[1], $i + 1, 1) eq "\x0A"; } elsif ($s eq "\x0A") { $line_diff++; $col_diff = 0; $set_col = 1; } else { $col_diff++; } } $self->{onerror}->($self, $etype, octets => \$r, %earg, level => $self->{level}->{$self->{error_level}->{$etype}}, line_diff => $line_diff, ($set_col ? (column => 1) : ()), column_diff => $col_diff); ## NOTE: Error handler may modify |octets| parameter, which ## would be returned as part of the output. Note that what ## is returned would affect what |manakai_read_until| returns. ${$self->{char_buffer}} .= defined $fallback ? $fallback : $r; } redo A; } # A } # read sub manakai_read_until ($$$;$) { #my ($self, $scalar, $pattern, $offset) = @_; my $self = $_[0]; my $s = ''; $self->read ($s, 255); if ($s =~ /^(?>$_[2])+/) { my $rem_length = (length $s) - $+[0]; if ($rem_length) { if ($self->{char_buffer_pos} > $rem_length) { $self->{char_buffer_pos} -= $rem_length; } else { substr (${$self->{char_buffer}}, 0, $self->{char_buffer_pos}) = substr ($s, $+[0]); $self->{char_buffer_pos} = 0; } } substr ($_[1], $_[3]) = substr ($s, $-[0], $+[0] - $-[0]); return $+[0]; } elsif (length $s) { if ($self->{char_buffer_pos} > length $s) { $self->{char_buffer_pos} -= length $s; } else { substr (${$self->{char_buffer}}, 0, $self->{char_buffer_pos}) = $s; $self->{char_buffer_pos} = 0; } } return 0; } # manakai_read_until sub has_bom ($) { $_[0]->{has_bom} } sub input_encoding ($) { my $v = $_[0]->{input_encoding}; return $v if defined $v; my $uri = $_[0]->{charset}; if (defined $uri) { return Whatpm::Charset::DecodeHandle->uri_to_name (xml => $uri); } return undef; } # input_encoding sub onerror ($;$) { if (@_ > 1) { if ($_[1]) { $_[0]->{onerror} = $_[1]; $_[0]->{onerror_set} = 1; } else { $_[0]->{onerror} = sub { }; delete $_[0]->{onerror_set}; } } return $_[0]->{onerror_set} ? $_[0]->{onerror} : undef; } # onerror sub ungetc ($$) { unshift @{$_[0]->{character_queue}}, chr int ($_[1] or 0); } # ungetc package Whatpm::Charset::DecodeHandle::ISO2022JP; push our @ISA, 'Whatpm::Charset::DecodeHandle::Encode'; sub getc ($) { my $self = $_[0]; return shift @{$self->{character_queue}} if @{$self->{character_queue}}; my $r; A: { my $error; if ($self->{continue}) { if ($self->{filehandle}->read ($self->{byte_buffer}, 256, length $self->{byte_buffer})) { # } else { $error = 1; } $self->{continue} = 0; } elsif (512 > length $self->{byte_buffer}) { $self->{filehandle}->read ($self->{byte_buffer}, 256, length $self->{byte_buffer}); } unless ($error) { if ($self->{byte_buffer} =~ s/^\x1B(\x24[\x40\x42]|\x28[\x42\x4A])//) { $self->{state} = { "\x24\x40" => 'state_2440', "\x24\x42" => 'state_2442', "\x28\x42" => 'state_2842', "\x28\x4A" => 'state_284A', }->{$1}; redo A; } elsif ($self->{state} eq 'state_2842') { # IRV if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) { push @{$self->{character_queue}}, split //, $1; $r = shift @{$self->{character_queue}}; } else { if (length $self->{byte_buffer}) { $error = 1; } else { $r = undef; } } } elsif ($self->{state} eq 'state_284A') { # 0201 if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) { my $v = $1; $v =~ tr/\x5C\x7E/\xA5\x{203E}/; push @{$self->{character_queue}}, split //, $v; $r = shift @{$self->{character_queue}}; } else { if (length $self->{byte_buffer}) { $error = 1; } else { $r = undef; $self->{onerror}->($self, 'invalid-state-error', state => $self->{state}, level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}}); } } } elsif ($self->{state} eq 'state_2442') { # 1983 my $v = Encode::decode ($self->{state_2442}, $self->{byte_buffer}, Encode::FB_QUIET ()); if (length $v) { push @{$self->{character_queue}}, split //, $v; $r = shift @{$self->{character_queue}}; } else { if (length $self->{byte_buffer}) { $error = 1; } else { $r = undef; $self->{onerror}->($self, 'invalid-state-error', state => $self->{state}, level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}}); } } } elsif ($self->{state} eq 'state_2440') { # 1978 my $v = Encode::decode ($self->{state_2440}, $self->{byte_buffer}, Encode::FB_QUIET ()); if (length $v) { push @{$self->{character_queue}}, split //, $v; $r = shift @{$self->{character_queue}}; } else { if (length $self->{byte_buffer}) { $error = 1; } else { $r = undef; $self->{onerror}->($self, 'invalid-state-error', state => $self->{state}, level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}}); } } } else { $error = 1; } } if ($error) { $r = substr $self->{byte_buffer}, 0, 1, ''; my $etype = 'illegal-octets-error'; if (($self->{state} eq 'state_2442' or $self->{state} eq 'state_2440') and $r =~ /^[\x21-\x7E]/ and $self->{byte_buffer} =~ s/^([\x21-\x7E])//) { $r .= $1; $etype = 'unassigned-code-point-error'; } elsif ($r eq "\x1B" and $self->{byte_buffer} =~ s/^\(H//) { # Old 0201 $r .= "(H"; $self->{state} = 'state_284A'; } $self->{onerror}->($self, $etype, octets => \$r, level => $self->{level}->{$self->{error_level}->{$etype}}); } } # A return $r; } # getc ## TODO: This is not good for performance. Should be replaced ## by read-centric implementation. sub read ($$$;$) { #my ($self, $scalar, $length, $offset) = @_; my $length = $_[2]; my $r = ''; while ($length > 0) { my $c = $_[0]->getc; last unless defined $c; $r .= $c; $length--; } substr ($_[1], $_[3]) = $r; ## NOTE: This would do different thing from what Perl's |read| do ## if $offset points beyond the end of the $scalar. return length $r; } # read sub manakai_read_until ($$$;$) { #my ($self, $scalar, $pattern, $offset) = @_; my $self = $_[0]; my $c = $self->getc; if ($c =~ /^$_[2]/) { substr ($_[1], $_[3]) = $c; return 1; } elsif (defined $c) { $self->ungetc (ord $c); return 0; } else { return 0; } } # manakai_read_until $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us-ascii'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso646-us'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:cp367'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ibm367'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1986'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1968'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-ir-6'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:csascii'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_646.irv:1991'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ascii'} = {ascii8 => '1', is_block_safe => '1', ietf_name => ['ansi_x3.4-1968', 'ansi_x3.4-1986', 'ascii', 'cp367', 'csascii', 'ibm367', 'iso-ir-6', 'iso646-us', 'iso_646.irv:1991', 'us', 'us-ascii', 'us-ascii'], mime_name => 'us-ascii', perl_name => ['ascii', 'iso-646-us', 'us-ascii'], utf8_encoding_scheme => '1', 'uri', {'urn:x-suika-fam-cx:charset:ansi_x3.4-1968', '1', 'urn:x-suika-fam-cx:charset:ansi_x3.4-1986', '1', 'urn:x-suika-fam-cx:charset:ascii', '1', 'urn:x-suika-fam-cx:charset:cp367', '1', 'urn:x-suika-fam-cx:charset:csascii', '1', 'urn:x-suika-fam-cx:charset:ibm367', '1', 'urn:x-suika-fam-cx:charset:iso-ir-6', '1', 'urn:x-suika-fam-cx:charset:iso646-us', '1', 'urn:x-suika-fam-cx:charset:iso_646.irv:1991', '1', 'urn:x-suika-fam-cx:charset:us', '1', 'urn:x-suika-fam-cx:charset:us-ascii', '1'}, }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl'} = {perl_name => ['ascii-ctrl'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl', '1'}}; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null'} = {perl_name => ['null'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null', '1'}}; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8'} = {ascii8 => '1', bom_allowed => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8', utf8_encoding_scheme => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8', '1'}, xml_name => 'UTF-8', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279'} = {ascii8 => '1', bom_allowed => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8', utf8_encoding_scheme => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279', '1'}}; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8'} = { ascii8 => 1, is_block_safe => '1', perl_name => ['utf-8'], utf8_encoding_scheme => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8', '1'}}; $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-8'} = { ascii8 => 1, bom_allowed => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8', ietf_name => ['utf-8'], mime_name => 'utf-8', utf8_encoding_scheme => '1', 'uri', {'urn:x-suika-fam-cx:charset:utf-8', '1'}, }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8'} = {ascii8 => '1', is_block_safe => '1', perl_name => ['utf8'], utf8_encoding_scheme => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8', '1'}}; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16'} = { ascii16 => 1, bom_allowed => '1', bom_required => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le', no_bom_variant16be => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be', no_bom_variant16le => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le', perl_name => ['utf-16'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16', '1'}, xml_name => 'UTF-16', }; $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16'} = { ascii16 => 1, bom_allowed => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le', no_bom_variant16be => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be', no_bom_variant16le => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le', ietf_name => ['utf-16'], mime_name => 'utf-16', 'uri', {'urn:x-suika-fam-cx:charset:utf-16', '1'}, }; $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16be'} = { ascii16 => 1, ascii16be => 1, bom_allowed => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be', no_bom_variant16be => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be', ietf_name => ['utf-16be'], mime_name => 'utf-16be', 'uri', {'urn:x-suika-fam-cx:charset:utf-16be', '1'}, }; $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16le'} = { ascii16 => 1, ascii16le => 1, bom_allowed => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le', no_bom_variant16le => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le', ietf_name => ['utf-16le'], mime_name => 'utf-16le', 'uri', {'urn:x-suika-fam-cx:charset:utf-16le', '1'}, }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be'} = { ascii16 => 1, ascii16be => 1, is_block_safe => '1', perl_name => ['utf-16be'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be', '1'}}; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le'} = { ascii16 => 1, ascii16le => 1, is_block_safe => '1', perl_name => ['utf-16le'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le', '1'}}; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-2'} = { ascii16 => 1, bom_allowed => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le', no_bom_variant16be => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be', no_bom_variant16le => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le', ietf_name => ['csunicode', 'iso-10646-ucs-2'], mime_name => 'iso-10646-ucs-2', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2', '1', 'urn:x-suika-fam-cx:charset:iso-10646-ucs-2', '1'}, xml_name => 'ISO-10646-UCS-2', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be'} = { ascii16 => 1, ascii16be => 1, is_block_safe => '1', perl_name => ['ucs-2be'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be', '1'}}; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le'} = { ascii16 => 1, ascii16le => 1, is_block_safe => '1', perl_name => ['ucs-2le'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le', '1'}}; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-4'} = { ascii32 => 1, bom_allowed => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le', no_bom_variant32endian1234 => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be', no_bom_variant32endian4321 => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le', ietf_name => ['csucs4', 'iso-10646-ucs-4'], mime_name => 'iso-10646-ucs-4', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4', '1', 'urn:x-suika-fam-cx:charset:iso-10646-ucs-4', '1'}, xml_name => 'ISO-10646-UCS-4', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be'} = { ascii32 => 1, ascii32endian1234 => 1, is_block_safe => '1', perl_name => ['ucs-4be', 'utf-32be'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be', '1'}}; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le'} = { ascii32 => 1, ascii32endian4321 => 1, is_block_safe => '1', perl_name => ['ucs-4le', 'utf-32le'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le', '1'}}; $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_8859-1:1987'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1'} = {ascii8 => '1', is_block_safe => '1', ietf_name => ['cp819', 'csisolatin1', 'ibm819', 'iso-8859-1', 'iso-8859-1', 'iso-ir-100', 'iso_8859-1', 'iso_8859-1:1987', 'l1', 'latin1'], mime_name => 'iso-8859-1', perl_name => ['iso-8859-1', 'latin1'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1', '1', 'urn:x-suika-fam-cx:charset:iso_8859-1:1987', '1'}, xml_name => 'ISO-8859-1', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2', '1'}, xml_name => 'ISO-8859-2', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3', '1'}, xml_name => 'ISO-8859-3', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4', '1'}, xml_name => 'ISO-8859-4', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5', '1'}, xml_name => 'ISO-8859-5', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6', '1'}, xml_name => 'ISO-8859-6', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7', '1'}, xml_name => 'ISO-8859-7', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8', '1'}, xml_name => 'ISO-8859-8', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9', '1'}, xml_name => 'ISO-8859-9', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10', '1'}, xml_name => 'ISO-8859-10', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11', '1'}, xml_name => 'ISO-8859-11', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13', '1'}, xml_name => 'ISO-8859-13', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14', '1'}, xml_name => 'ISO-8859-14', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15', '1'}, xml_name => 'ISO-8859-15', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16', '1'}, xml_name => 'ISO-8859-16', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp'} = {ascii8 => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp', '1'}, xml_name => 'ISO-2022-JP', }; $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-2022-jp'} = {ascii8 => '1', ietf_name => ['csiso2022jp', 'iso-2022-jp', 'iso-2022-jp'], mime_name => 'iso-2022-jp', 'uri', {'urn:x-suika-fam-cx:charset:iso-2022-jp', '1'}, }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp'} = {ascii8 => '1', perl_name => ['iso-2022-jp'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp', '1'}}; $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:shift_jis'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis'} = {ascii8 => '1', is_block_safe => '1', ietf_name => ['csshiftjis', 'ms_kanji', 'shift_jis', 'shift_jis'], mime_name => 'shift_jis', perl_name => ['shift-jis-1997'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis', '1', 'urn:x-suika-fam-cx:charset:shift_jis', '1'}, xml_name => 'Shift_JIS', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis'} = {ascii8 => '1', is_block_safe => '1', perl_name => ['shiftjis', 'sjis'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis', '1'}}; $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:euc-jp'} = $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp'} = $Whatpm::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese'} = {ascii8 => '1', is_block_safe => '1', ietf_name => ['cseucpkdfmtjapanese', 'euc-jp', 'euc-jp', 'extended_unix_code_packed_format_for_japanese'], mime_name => 'euc-jp', perl_name => ['euc-jp-1997'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp', '1', 'urn:x-suika-fam-cx:charset:euc-jp', '1', 'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese', '1'}, xml_name => 'EUC-JP', }; $Whatpm::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp'} = {ascii8 => '1', is_block_safe => '1', perl_name => ['euc-jp', 'ujis'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp', '1'}}; 1; ## $Date: 2008/09/15 07:19:03 $