/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch

-revision 1.133 by wakaba,
Sat May 17 04:54:11 2008 UTC
+revision 1.145 by wakaba,
Sat May 24 11:57:47 2008 UTC
 Line 8 
 use Error qw(:try);
  ## doc.write ('');
  ## alert (doc.compatMode);
- ## TODO: 1252 parse error (revision 1264)
+ require IO::Handle;
- ## TODO: 8859-11 = 874 (revision 1271)
  my $HTML_NS = q<http://www.w3.org/1999/xhtml>;
  my $MML_NS = q<http://www.w3.org/1998/Math/MathML>;
-Line 332 
 my $c1_entity_char = {
+Line 331 
 my $c1_entity_char = {
  }; # $c1_entity_char
  sub parse_byte_string ($$$$;$) {
+   my $self = shift;
+   my $charset_name = shift;
+   open my $input, '<', ref $_[0] ? $_[0] : \($_[0]);
+   return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
+ } # parse_byte_string
+ sub parse_byte_stream ($$$$;$) {
    my $self = ref $_[0] ? shift : shift->new;
    my $charset_name = shift;
-   my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
+   my $byte_stream = $_[0];
-   my $s;
+   my $onerror = $_[2] || sub {
+     my (%opt) = @_;
+     warn "Parse error ($opt{type})\n";
+   };
+   $self->{parse_error} = $onerror; # updated later by parse_char_string
    ## HTML5 encoding sniffing algorithm
    require Message::Charset::Info;
    my $charset;
-   my ($e, $e_status);
+   my $buffer;
+   my ($char_stream, $e_status);
    SNIFFING: {
-Line 349 
 sub parse_byte_string ($$$$;$) {
+Line 361 
 sub parse_byte_string ($$$$;$) {
        $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
        ## ISSUE: Unsupported encoding is not ignored according to the spec.
-       ($e, $e_status) = $charset->get_perl_encoding
+       ($char_stream, $e_status) = $charset->get_decode_handle
-           (allow_error_reporting => 1,
+           ($byte_stream, allow_error_reporting => 1,
             allow_fallback => 1);
-       if ($e) {
+       if ($char_stream) {
          $self->{confident} = 1;
          last SNIFFING;
+       } else {
+         ## TODO: unsupported error
        }
      }
      ## Step 2
-     # wait
+     my $byte_buffer = '';
+     for (1..1024) {
+       my $char = $byte_stream->getc;
+       last unless defined $char;
+       $byte_buffer .= $char;
+     } ## TODO: timeout
      ## Step 3
-     my $head = substr ($$bytes_s, 0, 3);
+     if ($byte_buffer =~ /^\xFE\xFF/) {
-     if ($head =~ /^\xFE\xFF/) {
        $charset = Message::Charset::Info->get_by_iana_name ('utf-16be');
-       ($e, $e_status) = $charset->get_perl_encoding
+       ($char_stream, $e_status) = $charset->get_decode_handle
-           (allow_error_reporting => 1,
+           ($byte_stream, allow_error_reporting => 1,
-            allow_fallback => 1);
+            allow_fallback => 1, byte_buffer => \$byte_buffer);
        $self->{confident} = 1;
        last SNIFFING;
-     } elsif ($head =~ /^\xFF\xFE/) {
+     } elsif ($byte_buffer =~ /^\xFF\xFE/) {
        $charset = Message::Charset::Info->get_by_iana_name ('utf-16le');
-       ($e, $e_status) = $charset->get_perl_encoding
+       ($char_stream, $e_status) = $charset->get_decode_handle
-           (allow_error_reporting => 1,
+           ($byte_stream, allow_error_reporting => 1,
-            allow_fallback => 1);
+            allow_fallback => 1, byte_buffer => \$byte_buffer);
        $self->{confident} = 1;
        last SNIFFING;
-     } elsif ($head eq "\xEF\xBB\xBF") {
+     } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) {
        $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
-       ($e, $e_status) = $charset->get_perl_encoding
+       ($char_stream, $e_status) = $charset->get_decode_handle
-           (allow_error_reporting => 1,
+           ($byte_stream, allow_error_reporting => 1,
-            allow_fallback => 1);
+            allow_fallback => 1, byte_buffer => \$byte_buffer);
        $self->{confident} = 1;
        last SNIFFING;
      }
-Line 395 
 sub parse_byte_string ($$$$;$) {
+Line 413 
 sub parse_byte_string ($$$$;$) {
      ## Step 6
      require Whatpm::Charset::UniversalCharDet;
      $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string
-         (substr ($$bytes_s, 0, 1024));
+         ($byte_buffer);
      if (defined $charset_name) {
        $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
        ## ISSUE: Unsupported encoding is not ignored according to the spec.
-       ($e, $e_status) = $charset->get_perl_encoding
+       require Whatpm::Charset::DecodeHandle;
-           (allow_error_reporting => 1,
+       $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
-            allow_fallback => 1);
+           ($byte_stream);
-       if ($e) {
+       ($char_stream, $e_status) = $charset->get_decode_handle
+           ($buffer, allow_error_reporting => 1,
+            allow_fallback => 1, byte_buffer => \$byte_buffer);
+       if ($char_stream) {
+         $buffer->{buffer} = $byte_buffer;
+         !!!parse-error (type => 'sniffing:chardet', ## TODO: type name
+                         value => $charset_name,
+                         level => $self->{info_level},
+                         line => 1, column => 1);
          $self->{confident} = 0;
          last SNIFFING;
        }
-Line 414 
 sub parse_byte_string ($$$$;$) {
+Line 440 
 sub parse_byte_string ($$$$;$) {
      $charset = Message::Charset::Info->get_by_iana_name ('windows-1252');
          ## NOTE: We choose |windows-1252| here, since |utf-8| should be
          ## detectable in the step 6.
-     ($e, $e_status) = $charset->get_perl_encoding (allow_error_reporting => 1,
+     require Whatpm::Charset::DecodeHandle;
-                                                    allow_fallback => 1);
+     $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new
+         ($byte_stream);
+     ($char_stream, $e_status)
+         = $charset->get_decode_handle ($buffer,
+                                        allow_error_reporting => 1,
+                                        allow_fallback => 1,
+                                        byte_buffer => \$byte_buffer);
+     $buffer->{buffer} = $byte_buffer;
+     !!!parse-error (type => 'sniffing:default', ## TODO: type name
+                     value => 'windows-1252',
+                     level => $self->{info_level},
+                     line => 1, column => 1);
      $self->{confident} = 0;
    } # SNIFFING
+   $self->{input_encoding} = $charset->get_iana_name;
    if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
+     !!!parse-error (type => 'chardecode:fallback', ## TODO: type name
+                     value => $self->{input_encoding},
+                     level => $self->{unsupported_level},
+                     line => 1, column => 1);
    } elsif (not ($e_status &
                  Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
+     !!!parse-error (type => 'chardecode:no error', ## TODO: type name
+                     value => $self->{input_encoding},
+                     level => $self->{unsupported_level},
+                     line => 1, column => 1);
    }
-   $s = \ $e->decode ($$bytes_s);
-   $self->{input_encoding} = $charset->get_iana_name;
    $self->{change_encoding} = sub {
      my $self = shift;
-     my $charset_name = lc shift;
+     $charset_name = shift;
      my $token = shift;
-     ## TODO: if $charset_name is supported
-     ## TODO: normalize charset name
-     ## "Change the encoding" algorithm:
+     $charset = Message::Charset::Info->get_by_iana_name ($charset_name);
+     ($char_stream, $e_status) = $charset->get_decode_handle
+         ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
+          byte_buffer => \ $buffer->{buffer});
+     if ($char_stream) { # if supported
+       ## "Change the encoding" algorithm:
-     ## Step 1
+       ## Step 1
-     if ($charset_name eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
+       if ($charset->{iana_names}->{'utf-16'}) { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
-       $charset_name = 'utf-8';
+         $charset = Message::Charset::Info->get_by_iana_name ('utf-8');
-     }
+         ($char_stream, $e_status) = $charset->get_decode_handle
+             ($byte_stream,
+              byte_buffer => \ $buffer->{buffer});
+       }
+       $charset_name = $charset->get_iana_name;
+       ## Step 2
+       if (defined $self->{input_encoding} and
+           $self->{input_encoding} eq $charset_name) {
+         !!!parse-error (type => 'charset label:matching', ## TODO: type
+                         value => $charset_name,
+                         level => $self->{info_level});
+         $self->{confident} = 1;
+         return;
+       }
-     ## Step 2
+       !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
-     if (defined $self->{input_encoding} and
+           ':'.$charset_name, level => 'w', token => $token);
-         $self->{input_encoding} eq $charset_name) {
-       $self->{confident} = 1;
+       ## Step 3
-       return;
+       # if (can) {
+         ## change the encoding on the fly.
+         #$self->{confident} = 1;
+         #return;
+       # }
+       ## Step 4
+       throw Whatpm::HTML::RestartParser ();
      }
-     !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
-         ':'.$charset_name, level => 'w', token => $token);
-     ## Step 3
-     # if (can) {
-       ## change the encoding on the fly.
-       #$self->{confident} = 1;
-       #return;
-     # }
-     ## Step 4
-     throw Whatpm::HTML::RestartParser (charset => $charset_name);
    }; # $self->{change_encoding}
+   my $char_onerror = sub {
+     my (undef, $type, %opt) = @_;
+     !!!parse-error (%opt, type => $type,
+                     line => $self->{line}, column => $self->{column} + 1);
+     if ($opt{octets}) {
+       ${$opt{octets}} = "\x{FFFD}"; # replacement character
+     }
+   };
+   $char_stream->onerror ($char_onerror);
    my @args = @_; shift @args; # $s
    my $return;
    try {
-     $return = $self->parse_char_string ($s, @args);
+     $return = $self->parse_char_stream ($char_stream, @args);
    } catch Whatpm::HTML::RestartParser with {
-     my $charset_name = shift->{charset};
+     ## NOTE: Invoked after {change_encoding}.
-     $s = \ (Encode::decode ($charset_name, $$bytes_s));
-     $self->{input_encoding} = $charset_name; ## TODO: normalize
+     $self->{input_encoding} = $charset->get_iana_name;
+     if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
+       !!!parse-error (type => 'chardecode:fallback', ## TODO: type name
+                       value => $self->{input_encoding},
+                       level => $self->{unsupported_level},
+                       line => 1, column => 1);
+     } elsif (not ($e_status &
+                   Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {
+       !!!parse-error (type => 'chardecode:no error', ## TODO: type name
+                       value => $self->{input_encoding},
+                       level => $self->{unsupported_level},
+                       line => 1, column => 1);
+     }
      $self->{confident} = 1;
-     $return = $self->parse_char_string ($s, @args);
+     $char_stream->onerror ($char_onerror);
+     $return = $self->parse_char_stream ($char_stream, @args);
    };
    return $return;
- } # parse_byte_string
+ } # parse_byte_stream
  ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
  ## and the HTML layer MUST ignore it.  However, we do strip BOM in
-Line 486 
 sub parse_byte_string ($$$$;$) {
+Line 563 
 sub parse_byte_string ($$$$;$) {
  ## such as |parse_byte_string| in this module, must ensure that it does
  ## strip the BOM and never strip any ZWNBSP.
- *parse_char_string = \&parse_string;
+ sub parse_char_string ($$$;$) {
+   my $self = shift;
+   require utf8;
+   my $s = ref $_[0] ? $_[0] : \($_[0]);
+   open my $input, '<' . (utf8::is_utf8 ($$s) ? ':utf8' : ''), $s;
+   return $self->parse_char_stream ($input, @_[1..$#_]);
+ } # parse_char_string
+ *parse_string = \&parse_char_string;
- sub parse_string ($$$;$) {
+ sub parse_char_stream ($$$;$) {
    my $self = ref $_[0] ? shift : shift->new;
-   my $s = ref $_[0] ? $_[0] : \($_[0]);
+   my $input = $_[0];
    $self->{document} = $_[1];
    @{$self->{document}->child_nodes} = ();
-Line 509 
 sub parse_string ($$$;$) {
+Line 593 
 sub parse_string ($$$;$) {
      pop @{$self->{prev_char}};
      unshift @{$self->{prev_char}}, $self->{next_char};
-     $self->{next_char} = -1 and return if $i >= length $$s;
+     my $char;
-     $self->{next_char} = ord substr $$s, $i++, 1;
+     if (defined $self->{next_next_char}) {
+       $char = $self->{next_next_char};
+       delete $self->{next_next_char};
+     } else {
+       $char = $input->getc;
+     }
+     $self->{next_char} = -1 and return unless defined $char;
+     $self->{next_char} = ord $char;
      ($self->{line_prev}, $self->{column_prev})
          = ($self->{line}, $self->{column});
-Line 522 
 sub parse_string ($$$;$) {
+Line 613 
 sub parse_string ($$$;$) {
        $self->{column} = 0;
      } elsif ($self->{next_char} == 0x000D) { # CR
        !!!cp ('j2');
-       $i++ if substr ($$s, $i, 1) eq "\x0A";
+       my $next = $input->getc;
+       if (defined $next and $next ne "\x0A") {
+         $self->{next_next_char} = $next;
+       }
        $self->{next_char} = 0x000A; # LF # MUST
        $self->{line}++;
        $self->{column} = 0;
-Line 575 
 sub parse_string ($$$;$) {
+Line 669 
 sub parse_string ($$$;$) {
    delete $self->{parse_error}; # remove loop
    return $self->{document};
- } # parse_string
+ } # parse_char_stream
  sub new ($) {
    my $class = shift;
-   my $self = bless {}, $class;
+   my $self = bless {
+     must_level => 'm',
+     should_level => 's',
+     good_level => 'w',
+     warn_level => 'w',
+     info_level => 'i',
+     unsupported_level => 'u',
+   }, $class;
    $self->{set_next_char} = sub {
      $self->{next_char} = -1;
    };
-Line 944 
 sub _get_next_token ($) {
+Line 1045 
 sub _get_next_token ($) {
            redo A;
          } else {
            !!!cp (23);
-           !!!parse-error (type => 'bare stago');
+           !!!parse-error (type => 'bare stago',
+                           line => $self->{line_prev},
+                           column => $self->{column_prev});
            $self->{state} = DATA_STATE;
            ## reconsume
-Line 1721 
 sub _get_next_token ($) {
+Line 1824 
 sub _get_next_token ($) {
          $self->{state} = SELF_CLOSING_START_TAG_STATE;
          !!!next-input-character;
          redo A;
+       } elsif ($self->{next_char} == -1) {
+         !!!parse-error (type => 'unclosed tag');
+         if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           !!!cp (122.3);
+           $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
+         } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
+           if ($self->{current_token}->{attributes}) {
+             !!!cp (122.1);
+             !!!parse-error (type => 'end tag attribute');
+           } else {
+             ## NOTE: This state should never be reached.
+             !!!cp (122.2);
+           }
+         } else {
+           die "$0: $self->{current_token}->{type}: Unknown token type";
+         }
+         $self->{state} = DATA_STATE;
+         ## Reconsume.
+         !!!emit ($self->{current_token}); # start tag or end tag
+         redo A;
        } else {
          !!!cp ('124.1');
          !!!parse-error (type => 'no space between attributes');
-Line 1753 
 sub _get_next_token ($) {
+Line 1876 
 sub _get_next_token ($) {
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
+       } elsif ($self->{next_char} == -1) {
+         !!!parse-error (type => 'unclosed tag');
+         if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           !!!cp (124.7);
+           $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
+         } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
+           if ($self->{current_token}->{attributes}) {
+             !!!cp (124.5);
+             !!!parse-error (type => 'end tag attribute');
+           } else {
+             ## NOTE: This state should never be reached.
+             !!!cp (124.6);
+           }
+         } else {
+           die "$0: $self->{current_token}->{type}: Unknown token type";
+         }
+         $self->{state} = DATA_STATE;
+         ## Reconsume.
+         !!!emit ($self->{current_token}); # start tag or end tag
+         redo A;
        } else {
          !!!cp ('124.4');
          !!!parse-error (type => 'nestc');
-Line 2593 
 sub _get_next_token ($) {
+Line 2736 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_char} == -1) {
          !!!cp (217);
-         !!!parse-error (type => 'unclosed DOCTYPE');
          $self->{state} = DATA_STATE;
          ## reconsume
-Line 2997 
 sub _tree_construction_initial ($) {
+Line 3139 
 sub _tree_construction_initial ($) {
        } elsif (defined $token->{public_identifier}) {
          my $pubid = $token->{public_identifier};
          $pubid =~ tr/a-z/A-z/;
-         if ({
+         my $prefix = [
-           "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
+           "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
-           "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
+           "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
-           "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
+           "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
-           "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML 2.0 LEVEL 1//",
-           "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML 2.0 LEVEL 2//",
-           "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//",
-           "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//",
-           "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
+           "-//IETF//DTD HTML 2.0 STRICT//",
-           "-//IETF//DTD HTML 2.0//EN" => 1,
+           "-//IETF//DTD HTML 2.0//",
-           "-//IETF//DTD HTML 2.1E//EN" => 1,
+           "-//IETF//DTD HTML 2.1E//",
-           "-//IETF//DTD HTML 3.0//EN" => 1,
+           "-//IETF//DTD HTML 3.0//",
-           "-//IETF//DTD HTML 3.0//EN//" => 1,
+           "-//IETF//DTD HTML 3.2 FINAL//",
-           "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
+           "-//IETF//DTD HTML 3.2//",
-           "-//IETF//DTD HTML 3.2//EN" => 1,
+           "-//IETF//DTD HTML 3//",
-           "-//IETF//DTD HTML 3//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 0//",
-           "-//IETF//DTD HTML LEVEL 0//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 1//",
-           "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
+           "-//IETF//DTD HTML LEVEL 2//",
-           "-//IETF//DTD HTML LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 3//",
-           "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 0//",
-           "-//IETF//DTD HTML LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 1//",
-           "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 2//",
-           "-//IETF//DTD HTML LEVEL 3//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 3//",
-           "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
+           "-//IETF//DTD HTML STRICT//",
-           "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
+           "-//IETF//DTD HTML//",
-           "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
+           "-//METRIUS//DTD METRIUS PRESENTATIONAL//",
-           "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//",
-           "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//",
-           "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//",
-           "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//",
-           "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//",
-           "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//",
-           "-//IETF//DTD HTML STRICT//EN" => 1,
+           "-//NETSCAPE COMM. CORP.//DTD HTML//",
-           "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
+           "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//",
-           "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
+           "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//",
-           "-//IETF//DTD HTML//EN" => 1,
+           "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//",
-           "-//IETF//DTD HTML//EN//2.0" => 1,
+           "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//",
-           "-//IETF//DTD HTML//EN//3.0" => 1,
+           "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//",
-           "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
+           "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//",
-           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
+           "-//SPYGLASS//DTD HTML 2.0 EXTENDED//",
-           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
+           "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//",
-           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
+           "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//",
-           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
+           "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//",
-           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
+           "-//W3C//DTD HTML 3 1995-03-24//",
-           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
+           "-//W3C//DTD HTML 3.2 DRAFT//",
-           "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
+           "-//W3C//DTD HTML 3.2 FINAL//",
-           "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
+           "-//W3C//DTD HTML 3.2//",
-           "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
+           "-//W3C//DTD HTML 3.2S DRAFT//",
-           "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
+           "-//W3C//DTD HTML 4.0 FRAMESET//",
-           "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,
+           "-//W3C//DTD HTML 4.0 TRANSITIONAL//",
-           "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,
+           "-//W3C//DTD HTML EXPERIMETNAL 19960712//",
-           "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,
+           "-//W3C//DTD HTML EXPERIMENTAL 970421//",
-           "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
+           "-//W3C//DTD W3 HTML//",
-           "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
+           "-//W3O//DTD W3 HTML 3.0//",
-           "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
+           "-//WEBTECHS//DTD MOZILLA HTML 2.0//",
-           "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
+           "-//WEBTECHS//DTD MOZILLA HTML//",
-           "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
+         ]; # $prefix
-           "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
+         my $match;
-           "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
+         for (@$prefix) {
-           "-//W3C//DTD HTML 3.2//EN" => 1,
+           if (substr ($prefix, 0, length $_) eq $_) {
-           "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
+             $match = 1;
-           "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
+             last;
-           "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
+           }
-           "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
+         }
-           "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
+         if ($match or
-           "-//W3C//DTD W3 HTML//EN" => 1,
+             $pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or
-           "-//W3O//DTD W3 HTML 3.0//EN" => 1,
+             $pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or
-           "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
+             $pubid eq "HTML") {
-           "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
-           "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
-           "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
-           "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
-           "HTML" => 1,
-         }->{$pubid}) {
            !!!cp ('t5');
            $self->{document}->manakai_compat_mode ('quirks');
-         } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
+         } elsif ($pubid =~ m[^-//W3C//DTD HTML 4.01 FRAMESET//] or
-                  $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
+                  $pubid =~ m[^-//W3C//DTD HTML 4.01 TRANSITIONAL//]) {
            if (defined $token->{system_identifier}) {
              !!!cp ('t6');
              $self->{document}->manakai_compat_mode ('quirks');
-Line 3082 
 sub _tree_construction_initial ($) {
+Line 3218 
 sub _tree_construction_initial ($) {
              !!!cp ('t7');
              $self->{document}->manakai_compat_mode ('limited quirks');
            }
-         } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
+         } elsif ($pubid =~ m[^-//W3C//DTD XHTML 1.0 FRAMESET//] or
-                  $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
+                  $pubid =~ m[^-//W3C//DTD XHTML 1.0 TRANSITIONAL//]) {
            !!!cp ('t8');
            $self->{document}->manakai_compat_mode ('limited quirks');
          } else {
-Line 3096 
 sub _tree_construction_initial ($) {
+Line 3232 
 sub _tree_construction_initial ($) {
          my $sysid = $token->{system_identifier};
          $sysid =~ tr/A-Z/a-z/;
          if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
-           ## TODO: Check the spec: PUBLIC "(limited quirks)" "(quirks)"
+           ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is
+           ## marked as quirks.
            $self->{document}->manakai_compat_mode ('quirks');
            !!!cp ('t11');
          } else {
-Line 3268 
 sub _reset_insertion_mode ($) {
+Line 3405 
 sub _reset_insertion_mode ($) {
        if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
          $last = 1;
          if (defined $self->{inner_html_node}) {
-           if ($self->{inner_html_node}->[1] & TABLE_CELL_EL) {
+           !!!cp ('t28');
-             !!!cp ('t27');
+           $node = $self->{inner_html_node};
-             #
+         } else {
-           } else {
+           die "_reset_insertion_mode: t27";
-             !!!cp ('t28');
-             $node = $self->{inner_html_node};
-           }
          }
        }
-     ## Step 4..14
+       ## Step 4..14
-     my $new_mode;
+       my $new_mode;
-     if ($node->[1] & FOREIGN_EL) {
+       if ($node->[1] & FOREIGN_EL) {
-       ## NOTE: Strictly spaking, the line below only applies to MathML and
+         !!!cp ('t28.1');
-       ## SVG elements.  Currently the HTML syntax supports only MathML and
+         ## NOTE: Strictly speaking, the line below only applies to MathML and
-       ## SVG elements as foreigners.
+         ## SVG elements.  Currently the HTML syntax supports only MathML and
-       $new_mode = $self->{insertion_mode} | IN_FOREIGN_CONTENT_IM;
+         ## SVG elements as foreigners.
-       ## ISSUE: What is set as the secondary insertion mode?
+         $new_mode = $self->{insertion_mode} | IN_FOREIGN_CONTENT_IM;
-     } else {
+         ## ISSUE: What is set as the secondary insertion mode?
-       $new_mode = {
+       } elsif ($node->[1] & TABLE_CELL_EL) {
+         if ($last) {
+           !!!cp ('t28.2');
+           #
+         } else {
+           !!!cp ('t28.3');
+           $new_mode = IN_CELL_IM;
+         }
+       } else {
+         !!!cp ('t28.4');
+         $new_mode = {
                        select => IN_SELECT_IM,
                        ## NOTE: |option| and |optgroup| do not set
                        ## insertion mode to "in select" by themselves.
-                       td => IN_CELL_IM,
-                       th => IN_CELL_IM,
                        tr => IN_ROW_IM,
                        tbody => IN_TABLE_BODY_IM,
                        thead => IN_TABLE_BODY_IM,
-Line 3304 
 sub _reset_insertion_mode ($) {
+Line 3446 
 sub _reset_insertion_mode ($) {
                        body => IN_BODY_IM,
                        frameset => IN_FRAMESET_IM,
                       }->{$node->[0]->manakai_local_name};
-     }
+       }
-     $self->{insertion_mode} = $new_mode and return if defined $new_mode;
+       $self->{insertion_mode} = $new_mode and return if defined $new_mode;
        ## Step 15
        if ($node->[1] & HTML_EL) {
-Line 4048 
 sub _tree_construction_main ($) {
+Line 4190 
 sub _tree_construction_main ($) {
              !!!next-token;
              next B;
            } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
-             !!!cp ('t94');
+             !!!cp ('t93.2');
-             #
+             !!!parse-error (type => 'after head:head', token => $token); ## TODO: error type
+             ## Ignore the token
+             !!!nack ('t93.3');
+             !!!next-token;
+             next B;
            } else {
              !!!cp ('t95');
              !!!parse-error (type => 'in head:head', token => $token); # or in head noscript
-Line 4132 
 sub _tree_construction_main ($) {
+Line 4278 
 sub _tree_construction_main ($) {
                my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
                unless ($self->{confident}) {
-                 if ($token->{attributes}->{charset}) { ## TODO: And if supported
+                 if ($token->{attributes}->{charset}) {
                    !!!cp ('t106');
+                   ## NOTE: Whether the encoding is supported or not is handled
+                   ## in the {change_encoding} callback.
                    $self->{change_encoding}
                        ->($self, $token->{attributes}->{charset}->{value},
                           $token);
-Line 4143 
 sub _tree_construction_main ($) {
+Line 4291 
 sub _tree_construction_main ($) {
                                             $token->{attributes}->{charset}
                                                 ->{has_reference});
                  } elsif ($token->{attributes}->{content}) {
-                   ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
                    if ($token->{attributes}->{content}->{value}
-                       =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
+                       =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
                            [\x09-\x0D\x20]*=
                            [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
-                           ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
+                           ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
                      !!!cp ('t107');
+                     ## NOTE: Whether the encoding is supported or not is handled
+                     ## in the {change_encoding} callback.
                      $self->{change_encoding}
                          ->($self, defined $1 ? $1 : defined $2 ? $2 : $3,
                             $token);
-Line 4368 
 sub _tree_construction_main ($) {
+Line 4517 
 sub _tree_construction_main ($) {
                  $self->{insertion_mode} = AFTER_HEAD_IM;
                  !!!next-token;
                  next B;
+               } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
+                 !!!cp ('t134.1');
+                 !!!parse-error (type => 'unmatched end tag:head', token => $token);
+                 ## Ignore the token
+                 !!!next-token;
+                 next B;
                } else {
-                 !!!cp ('t135');
+                 die "$0: $self->{insertion_mode}: Unknown insertion mode";
-                 #
                }
              } elsif ($token->{tag_name} eq 'noscript') {
                if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
-Line 4379 
 sub _tree_construction_main ($) {
+Line 4533 
 sub _tree_construction_main ($) {
                  $self->{insertion_mode} = IN_HEAD_IM;
                  !!!next-token;
                  next B;
-               } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) {
+               } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM or
+                        $self->{insertion_mode} == AFTER_HEAD_IM) {
                  !!!cp ('t137');
                  !!!parse-error (type => 'unmatched end tag:noscript', token => $token);
                  ## Ignore the token ## ISSUE: An issue in the spec.
-Line 4392 
 sub _tree_construction_main ($) {
+Line 4547 
 sub _tree_construction_main ($) {
              } elsif ({
                        body => 1, html => 1,
                       }->{$token->{tag_name}}) {
-               if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
+               if ($self->{insertion_mode} == BEFORE_HEAD_IM or
-                 !!!cp ('t139');
+                   $self->{insertion_mode} == IN_HEAD_IM or
-                 ## As if <head>
+                   $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
-                 !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
-                 $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
-                 push @{$self->{open_elements}},
-                     [$self->{head_element}, $el_category->{head}];
-                 $self->{insertion_mode} = IN_HEAD_IM;
-                 ## Reprocess in the "in head" insertion mode...
-               } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
                  !!!cp ('t140');
                  !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
                  ## Ignore the token
                  !!!next-token;
                  next B;
+               } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
+                 !!!cp ('t140.1');
+                 !!!parse-error (type => 'unmatched end tag:' . $token->{tag_name}, token => $token);
+                 ## Ignore the token
+                 !!!next-token;
+                 next B;
                } else {
-                 !!!cp ('t141');
+                 die "$0: $self->{insertion_mode}: Unknown insertion mode";
                }
+             } elsif ($token->{tag_name} eq 'p') {
-               #
+               !!!cp ('t142');
-             } elsif ({
+               !!!parse-error (type => 'unmatched end tag:p', token => $token);
-                       p => 1, br => 1,
+               ## Ignore the token
-                      }->{$token->{tag_name}}) {
+               !!!next-token;
+               next B;
+             } elsif ($token->{tag_name} eq 'br') {
                if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
-                 !!!cp ('t142');
+                 !!!cp ('t142.2');
-                 ## As if <head>
+                 ## (before head) as if <head>, (in head) as if </head>
                  !!!create-element ($self->{head_element}, $HTML_NS, 'head',, $token);
                  $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
-                 push @{$self->{open_elements}},
+                 $self->{insertion_mode} = AFTER_HEAD_IM;
-                     [$self->{head_element}, $el_category->{head}];
+                 ## Reprocess in the "after head" insertion mode...
+               } elsif ($self->{insertion_mode} == IN_HEAD_IM) {
+                 !!!cp ('t143.2');
+                 ## As if </head>
+                 pop @{$self->{open_elements}};
+                 $self->{insertion_mode} = AFTER_HEAD_IM;
+                 ## Reprocess in the "after head" insertion mode...
+               } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
+                 !!!cp ('t143.3');
+                 ## ISSUE: Two parse errors for <head><noscript></br>
+                 !!!parse-error (type => 'unmatched end tag:br', token => $token);
+                 ## As if </noscript>
+                 pop @{$self->{open_elements}};
                  $self->{insertion_mode} = IN_HEAD_IM;
                  ## Reprocess in the "in head" insertion mode...
-               } else {
+                 ## As if </head>
-                 !!!cp ('t143');
+                 pop @{$self->{open_elements}};
-               }
+                 $self->{insertion_mode} = AFTER_HEAD_IM;
-               #
+                 ## Reprocess in the "after head" insertion mode...
-             } else {
+               } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) {
-               if ($self->{insertion_mode} == AFTER_HEAD_IM) {
+                 !!!cp ('t143.4');
-                 !!!cp ('t144');
                  #
                } else {
-                 !!!cp ('t145');
+                 die "$0: $self->{insertion_mode}: Unknown insertion mode";
-                 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
-                 ## Ignore the token
-                 !!!next-token;
-                 next B;
                }
+               ## ISSUE: does not agree with IE7 - it doesn't ignore </br>.
+               !!!parse-error (type => 'unmatched end tag:br', token => $token);
+               ## Ignore the token
+               !!!next-token;
+               next B;
+             } else {
+               !!!cp ('t145');
+               !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name}, token => $token);
+               ## Ignore the token
+               !!!next-token;
+               next B;
              }
              if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) {
-Line 6166 
 sub _tree_construction_main ($) {
+Line 6342 
 sub _tree_construction_main ($) {
          my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
          unless ($self->{confident}) {
-           if ($token->{attributes}->{charset}) { ## TODO: And if supported
+           if ($token->{attributes}->{charset}) {
              !!!cp ('t335');
+             ## NOTE: Whether the encoding is supported or not is handled
+             ## in the {change_encoding} callback.
              $self->{change_encoding}
                  ->($self, $token->{attributes}->{charset}->{value}, $token);
-Line 6176 
 sub _tree_construction_main ($) {
+Line 6354 
 sub _tree_construction_main ($) {
                                       $token->{attributes}->{charset}
                                           ->{has_reference});
            } elsif ($token->{attributes}->{content}) {
-             ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
              if ($token->{attributes}->{content}->{value}
-                 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
+                 =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
                      [\x09-\x0D\x20]*=
                      [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
-                     ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
+                     ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {
                !!!cp ('t336');
+               ## NOTE: Whether the encoding is supported or not is handled
+               ## in the {change_encoding} callback.
                $self->{change_encoding}
                    ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token);
                $meta_el->[0]->get_attribute_node_ns (undef, 'content')

 Legend:
-Removed from v.1.133
 changed lines
+Added in v.1.145

admin@suikawiki.org	ViewVC Help
Powered by ViewVC 1.1.24