/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch

-revision 1.132 by wakaba,
Sun Apr 13 10:36:40 2008 UTC
+revision 1.133 by wakaba,
Sat May 17 04:54:11 2008 UTC
 Line 333 
 my $c1_entity_char = {
  sub parse_byte_string ($$$$;$) {
    my $self = ref $_[0] ? shift : shift->new;
-   my $charset = shift;
+   my $charset_name = shift;
    my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
    my $s;
-   if (defined $charset) {
+   ## HTML5 encoding sniffing algorithm
-     require Encode; ## TODO: decode(utf8) don't delete BOM
+   require Message::Charset::Info;
-     $s = \ (Encode::decode ($charset, $$bytes_s));
+   my $charset;
-     $self->{input_encoding} = lc $charset; ## TODO: normalize name
+   my ($e, $e_status);
-     $self->{confident} = 1;
-   } else {
+   SNIFFING: {
-     ## TODO: Implement HTML5 detection algorithm
+     ## Step 1
+     if (defined $charset_name) {
+       $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
+       ## ISSUE: Unsupported encoding is not ignored according to the spec.
+       ($e, $e_status) = $charset->get_perl_encoding
+           (allow_error_reporting => 1,
+            allow_fallback => 1);
+       if ($e) {
+         $self->{confident} = 1;
+         last SNIFFING;
+       }
+     }
+     ## Step 2
+     # wait
+     ## Step 3
+     my $head = substr ($$bytes_s, 0, 3);
+     if ($head =~ /^\xFE\xFF/) {
+       $charset = Message::Charset::Info->get_by_iana_name ('utf-16be');
+       ($e, $e_status) = $charset->get_perl_encoding
+           (allow_error_reporting => 1,
+            allow_fallback => 1);
+       $self->{confident} = 1;
+       last SNIFFING;
+     } elsif ($head =~ /^\xFF\xFE/) {
+       $charset = Message::Charset::Info->get_by_iana_name ('utf-16le');
+       ($e, $e_status) = $charset->get_perl_encoding
+           (allow_error_reporting => 1,
+            allow_fallback => 1);
+       $self->{confident} = 1;
+       last SNIFFING;
+     } elsif ($head eq "\xEF\xBB\xBF") {
+       $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
+       ($e, $e_status) = $charset->get_perl_encoding
+           (allow_error_reporting => 1,
+            allow_fallback => 1);
+       $self->{confident} = 1;
+       last SNIFFING;
+     }
+     ## Step 4
+     ## TODO: <meta charset>
+     ## Step 5
+     ## TODO: from history
+     ## Step 6
      require Whatpm::Charset::UniversalCharDet;
-     $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
+     $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
          (substr ($$bytes_s, 0, 1024));
-     $charset ||= 'windows-1252';
+     if (defined $charset_name) {
-     $s = \ (Encode::decode ($charset, $$bytes_s));
+       $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
-     $self->{input_encoding} = $charset;
+       ## ISSUE: Unsupported encoding is not ignored according to the spec.
+       ($e, $e_status) = $charset->get_perl_encoding
+           (allow_error_reporting => 1,
+            allow_fallback => 1);
+       if ($e) {
+         $self->{confident} = 0;
+         last SNIFFING;
+       }
+     }
+     ## Step 7: default
+     ## TODO: Make this configurable.
+     $charset = Message::Charset::Info->get_by_iana_name ('windows-1252');
+         ## NOTE: We choose |windows-1252| here, since |utf-8| should be
+         ## detectable in the step 6.
+     ($e, $e_status) = $charset->get_perl_encoding (allow_error_reporting => 1,
+                                                    allow_fallback => 1);
      $self->{confident} = 0;
+   } # SNIFFING
+   if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
+   } elsif (not ($e_status &
+                 Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
    }
+   $s = \ $e->decode ($$bytes_s);
+   $self->{input_encoding} = $charset->get_iana_name;
    $self->{change_encoding} = sub {
      my $self = shift;
-     my $charset = lc shift;
+     my $charset_name = lc shift;
      my $token = shift;
-     ## TODO: if $charset is supported
+     ## TODO: if $charset_name is supported
      ## TODO: normalize charset name
      ## "Change the encoding" algorithm:
      ## Step 1
-     if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
+     if ($charset_name eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
-       $charset = 'utf-8';
+       $charset_name = 'utf-8';
      }
      ## Step 2
      if (defined $self->{input_encoding} and
-         $self->{input_encoding} eq $charset) {
+         $self->{input_encoding} eq $charset_name) {
        $self->{confident} = 1;
        return;
      }
      !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
-         ':'.$charset, level => 'w', token => $token);
+         ':'.$charset_name, level => 'w', token => $token);
      ## Step 3
      # if (can) {
-Line 385 
 sub parse_byte_string ($$$$;$) {
+Line 460 
 sub parse_byte_string ($$$$;$) {
      # }
      ## Step 4
-     throw Whatpm::HTML::RestartParser (charset => $charset);
+     throw Whatpm::HTML::RestartParser (charset => $charset_name);
    }; # $self->{change_encoding}
    my @args = @_; shift @args; # $s
-Line 393 
 sub parse_byte_string ($$$$;$) {
+Line 468 
 sub parse_byte_string ($$$$;$) {
    try {
      $return = $self->parse_char_string ($s, @args);
    } catch Whatpm::HTML::RestartParser with {
-     my $charset = shift->{charset};
+     my $charset_name = shift->{charset};
-     $s = \ (Encode::decode ($charset, $$bytes_s));
+     $s = \ (Encode::decode ($charset_name, $$bytes_s));
-     $self->{input_encoding} = $charset; ## TODO: normalize
+     $self->{input_encoding} = $charset_name; ## TODO: normalize
      $self->{confident} = 1;
      $return = $self->parse_char_string ($s, @args);
    };

 Legend:
-Removed from v.1.132
 changed lines
+Added in v.1.133

admin@suikawiki.org	ViewVC Help
Powered by ViewVC 1.1.24