/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch

-revision 1.11 by wakaba,
Sat Jun 23 03:53:35 2007 UTC
+revision 1.35 by wakaba,
Mon Jul 16 03:21:04 2007 UTC
 Line 2 
 package Whatpm::HTML;
  use strict;
  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
- ## This is an early version of an HTML parser.
+ ## ISSUE:
+ ## var doc = implementation.createDocument (null, null, null);
+ ## doc.write ('');
+ ## alert (doc.compatMode);
+ ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT
+ ## strip BOM and the HTML layer MUST ignore it.  Whether we can do it
+ ## is not yet clear.
+ ## Does "{U+FEFF}..." in UTF-16BE/UTF-16LE count as three or four characters?
+ ## And what about "{U+FEFF}..." in GB18030?
  my $permitted_slash_tag_name = {
    base => 1,
-Line 18 
 my $permitted_slash_tag_name = {
+Line 27 
 my $permitted_slash_tag_name = {
    input => 1,
  };
- my $entity_char = {
-   AElig => "\x{00C6}",
-   Aacute => "\x{00C1}",
-   Acirc => "\x{00C2}",
-   Agrave => "\x{00C0}",
-   Alpha => "\x{0391}",
-   Aring => "\x{00C5}",
-   Atilde => "\x{00C3}",
-   Auml => "\x{00C4}",
-   Beta => "\x{0392}",
-   Ccedil => "\x{00C7}",
-   Chi => "\x{03A7}",
-   Dagger => "\x{2021}",
-   Delta => "\x{0394}",
-   ETH => "\x{00D0}",
-   Eacute => "\x{00C9}",
-   Ecirc => "\x{00CA}",
-   Egrave => "\x{00C8}",
-   Epsilon => "\x{0395}",
-   Eta => "\x{0397}",
-   Euml => "\x{00CB}",
-   Gamma => "\x{0393}",
-   Iacute => "\x{00CD}",
-   Icirc => "\x{00CE}",
-   Igrave => "\x{00CC}",
-   Iota => "\x{0399}",
-   Iuml => "\x{00CF}",
-   Kappa => "\x{039A}",
-   Lambda => "\x{039B}",
-   Mu => "\x{039C}",
-   Ntilde => "\x{00D1}",
-   Nu => "\x{039D}",
-   OElig => "\x{0152}",
-   Oacute => "\x{00D3}",
-   Ocirc => "\x{00D4}",
-   Ograve => "\x{00D2}",
-   Omega => "\x{03A9}",
-   Omicron => "\x{039F}",
-   Oslash => "\x{00D8}",
-   Otilde => "\x{00D5}",
-   Ouml => "\x{00D6}",
-   Phi => "\x{03A6}",
-   Pi => "\x{03A0}",
-   Prime => "\x{2033}",
-   Psi => "\x{03A8}",
-   Rho => "\x{03A1}",
-   Scaron => "\x{0160}",
-   Sigma => "\x{03A3}",
-   THORN => "\x{00DE}",
-   Tau => "\x{03A4}",
-   Theta => "\x{0398}",
-   Uacute => "\x{00DA}",
-   Ucirc => "\x{00DB}",
-   Ugrave => "\x{00D9}",
-   Upsilon => "\x{03A5}",
-   Uuml => "\x{00DC}",
-   Xi => "\x{039E}",
-   Yacute => "\x{00DD}",
-   Yuml => "\x{0178}",
-   Zeta => "\x{0396}",
-   aacute => "\x{00E1}",
-   acirc => "\x{00E2}",
-   acute => "\x{00B4}",
-   aelig => "\x{00E6}",
-   agrave => "\x{00E0}",
-   alefsym => "\x{2135}",
-   alpha => "\x{03B1}",
-   amp => "\x{0026}",
-   AMP => "\x{0026}",
-   and => "\x{2227}",
-   ang => "\x{2220}",
-   apos => "\x{0027}",
-   aring => "\x{00E5}",
-   asymp => "\x{2248}",
-   atilde => "\x{00E3}",
-   auml => "\x{00E4}",
-   bdquo => "\x{201E}",
-   beta => "\x{03B2}",
-   brvbar => "\x{00A6}",
-   bull => "\x{2022}",
-   cap => "\x{2229}",
-   ccedil => "\x{00E7}",
-   cedil => "\x{00B8}",
-   cent => "\x{00A2}",
-   chi => "\x{03C7}",
-   circ => "\x{02C6}",
-   clubs => "\x{2663}",
-   cong => "\x{2245}",
-   copy => "\x{00A9}",
-   COPY => "\x{00A9}",
-   crarr => "\x{21B5}",
-   cup => "\x{222A}",
-   curren => "\x{00A4}",
-   dArr => "\x{21D3}",
-   dagger => "\x{2020}",
-   darr => "\x{2193}",
-   deg => "\x{00B0}",
-   delta => "\x{03B4}",
-   diams => "\x{2666}",
-   divide => "\x{00F7}",
-   eacute => "\x{00E9}",
-   ecirc => "\x{00EA}",
-   egrave => "\x{00E8}",
-   empty => "\x{2205}",
-   emsp => "\x{2003}",
-   ensp => "\x{2002}",
-   epsilon => "\x{03B5}",
-   equiv => "\x{2261}",
-   eta => "\x{03B7}",
-   eth => "\x{00F0}",
-   euml => "\x{00EB}",
-   euro => "\x{20AC}",
-   exist => "\x{2203}",
-   fnof => "\x{0192}",
-   forall => "\x{2200}",
-   frac12 => "\x{00BD}",
-   frac14 => "\x{00BC}",
-   frac34 => "\x{00BE}",
-   frasl => "\x{2044}",
-   gamma => "\x{03B3}",
-   ge => "\x{2265}",
-   gt => "\x{003E}",
-   GT => "\x{003E}",
-   hArr => "\x{21D4}",
-   harr => "\x{2194}",
-   hearts => "\x{2665}",
-   hellip => "\x{2026}",
-   iacute => "\x{00ED}",
-   icirc => "\x{00EE}",
-   iexcl => "\x{00A1}",
-   igrave => "\x{00EC}",
-   image => "\x{2111}",
-   infin => "\x{221E}",
-   int => "\x{222B}",
-   iota => "\x{03B9}",
-   iquest => "\x{00BF}",
-   isin => "\x{2208}",
-   iuml => "\x{00EF}",
-   kappa => "\x{03BA}",
-   lArr => "\x{21D0}",
-   lambda => "\x{03BB}",
-   lang => "\x{2329}",
-   laquo => "\x{00AB}",
-   larr => "\x{2190}",
-   lceil => "\x{2308}",
-   ldquo => "\x{201C}",
-   le => "\x{2264}",
-   lfloor => "\x{230A}",
-   lowast => "\x{2217}",
-   loz => "\x{25CA}",
-   lrm => "\x{200E}",
-   lsaquo => "\x{2039}",
-   lsquo => "\x{2018}",
-   lt => "\x{003C}",
-   LT => "\x{003C}",
-   macr => "\x{00AF}",
-   mdash => "\x{2014}",
-   micro => "\x{00B5}",
-   middot => "\x{00B7}",
-   minus => "\x{2212}",
-   mu => "\x{03BC}",
-   nabla => "\x{2207}",
-   nbsp => "\x{00A0}",
-   ndash => "\x{2013}",
-   ne => "\x{2260}",
-   ni => "\x{220B}",
-   not => "\x{00AC}",
-   notin => "\x{2209}",
-   nsub => "\x{2284}",
-   ntilde => "\x{00F1}",
-   nu => "\x{03BD}",
-   oacute => "\x{00F3}",
-   ocirc => "\x{00F4}",
-   oelig => "\x{0153}",
-   ograve => "\x{00F2}",
-   oline => "\x{203E}",
-   omega => "\x{03C9}",
-   omicron => "\x{03BF}",
-   oplus => "\x{2295}",
-   or => "\x{2228}",
-   ordf => "\x{00AA}",
-   ordm => "\x{00BA}",
-   oslash => "\x{00F8}",
-   otilde => "\x{00F5}",
-   otimes => "\x{2297}",
-   ouml => "\x{00F6}",
-   para => "\x{00B6}",
-   part => "\x{2202}",
-   permil => "\x{2030}",
-   perp => "\x{22A5}",
-   phi => "\x{03C6}",
-   pi => "\x{03C0}",
-   piv => "\x{03D6}",
-   plusmn => "\x{00B1}",
-   pound => "\x{00A3}",
-   prime => "\x{2032}",
-   prod => "\x{220F}",
-   prop => "\x{221D}",
-   psi => "\x{03C8}",
-   quot => "\x{0022}",
-   QUOT => "\x{0022}",
-   rArr => "\x{21D2}",
-   radic => "\x{221A}",
-   rang => "\x{232A}",
-   raquo => "\x{00BB}",
-   rarr => "\x{2192}",
-   rceil => "\x{2309}",
-   rdquo => "\x{201D}",
-   real => "\x{211C}",
-   reg => "\x{00AE}",
-   REG => "\x{00AE}",
-   rfloor => "\x{230B}",
-   rho => "\x{03C1}",
-   rlm => "\x{200F}",
-   rsaquo => "\x{203A}",
-   rsquo => "\x{2019}",
-   sbquo => "\x{201A}",
-   scaron => "\x{0161}",
-   sdot => "\x{22C5}",
-   sect => "\x{00A7}",
-   shy => "\x{00AD}",
-   sigma => "\x{03C3}",
-   sigmaf => "\x{03C2}",
-   sim => "\x{223C}",
-   spades => "\x{2660}",
-   sub => "\x{2282}",
-   sube => "\x{2286}",
-   sum => "\x{2211}",
-   sup => "\x{2283}",
-   sup1 => "\x{00B9}",
-   sup2 => "\x{00B2}",
-   sup3 => "\x{00B3}",
-   supe => "\x{2287}",
-   szlig => "\x{00DF}",
-   tau => "\x{03C4}",
-   there4 => "\x{2234}",
-   theta => "\x{03B8}",
-   thetasym => "\x{03D1}",
-   thinsp => "\x{2009}",
-   thorn => "\x{00FE}",
-   tilde => "\x{02DC}",
-   times => "\x{00D7}",
-   trade => "\x{2122}",
-   uArr => "\x{21D1}",
-   uacute => "\x{00FA}",
-   uarr => "\x{2191}",
-   ucirc => "\x{00FB}",
-   ugrave => "\x{00F9}",
-   uml => "\x{00A8}",
-   upsih => "\x{03D2}",
-   upsilon => "\x{03C5}",
-   uuml => "\x{00FC}",
-   weierp => "\x{2118}",
-   xi => "\x{03BE}",
-   yacute => "\x{00FD}",
-   yen => "\x{00A5}",
-   yuml => "\x{00FF}",
-   zeta => "\x{03B6}",
-   zwj => "\x{200D}",
-   zwnj => "\x{200C}",
- }; # $entity_char
  my $c1_entity_char = {
 x80 => 0x20AC,
 x81 => 0xFFFD,
-Line 349 
 sub parse_string ($$$;$) {
+Line 96 
 sub parse_string ($$$;$) {
    my $column = 0;
    $self->{set_next_input_character} = sub {
      my $self = shift;
+     pop @{$self->{prev_input_character}};
+     unshift @{$self->{prev_input_character}}, $self->{next_input_character};
      $self->{next_input_character} = -1 and return if $i >= length $$s;
      $self->{next_input_character} = ord substr $$s, $i++, 1;
      $column++;
-Line 357 
 sub parse_string ($$$;$) {
+Line 108 
 sub parse_string ($$$;$) {
        $line++;
        $column = 0;
      } elsif ($self->{next_input_character} == 0x000D) { # CR
-       if ($i >= length $$s) {
+       $i++ if substr ($$s, $i, 1) eq "\x0A";
-         #
-       } else {
-         my $next_char = ord substr $$s, $i++, 1;
-         if ($next_char == 0x000A) { # LF
-           #
-         } else {
-           push @{$self->{char}}, $next_char;
-         }
-       }
        $self->{next_input_character} = 0x000A; # LF # MUST
        $line++;
        $column = 0;
-Line 377 
 sub parse_string ($$$;$) {
+Line 119 
 sub parse_string ($$$;$) {
        $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      }
    };
+   $self->{prev_input_character} = [-1, -1, -1];
+   $self->{next_input_character} = -1;
    my $onerror = $_[2] || sub {
      my (%opt) = @_;
-Line 420 
 sub _initialize_tokenizer ($) {
+Line 164 
 sub _initialize_tokenizer ($) {
    # $self->{next_input_character}
    !!!next-input-character;
    $self->{token} = [];
+   # $self->{escape}
  } # _initialize_tokenizer
  ## A token has:
  ##   ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
  ##       'character', or 'end-of-file'
- ##   ->{name} (DOCTYPE, start tag (tagname), end tag (tagname))
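+ ##   ->{name} (DOCTYPE, start tag (tag name), end tag (tag name))
+ ##       NOTE: the tokenizer code in this file reads the tag name of a
+ ##       start tag token from ->{tag_name}, not ->{name}.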
-     ## ISSUE: the spec need s/tagname/tag name/
+ ##   ->{public_identifier} (DOCTYPE)
- ##   ->{error} == 1 or 0 (DOCTYPE)
+ ##   ->{system_identifier} (DOCTYPE)
+ ##   ->{correct} == 1 or 0 (DOCTYPE)
  ##   ->{attributes} isa HASH (start tag, end tag)
  ##   ->{data} (comment, character)
- ## Macros
- ##   Macros MUST be preceded by three EXCLAMATION MARKs.
- ##   emit ($token)
- ##     Emits the specified token.
  ## Emitted token MUST immediately be handled by the tree construction state.
  ## Before each step, UA MAY check to see if either one of the scripts in
-Line 461 
 sub _get_next_token ($) {
+Line 202 
 sub _get_next_token ($) {
          } else {
            #
          }
+       } elsif ($self->{next_input_character} == 0x002D) { # -
+         if ($self->{content_model_flag} eq 'RCDATA' or
+             $self->{content_model_flag} eq 'CDATA') {
+           unless ($self->{escape}) {
+             if ($self->{prev_input_character}->[0] == 0x002D and # -
+                 $self->{prev_input_character}->[1] == 0x0021 and # !
+                 $self->{prev_input_character}->[2] == 0x003C) { # <
+               $self->{escape} = 1;
+             }
+           }
+         }
+         #
        } elsif ($self->{next_input_character} == 0x003C) { # <
-         if ($self->{content_model_flag} ne 'PLAINTEXT') {
+         if ($self->{content_model_flag} eq 'PCDATA' or
+             (($self->{content_model_flag} eq 'CDATA' or
+               $self->{content_model_flag} eq 'RCDATA') and
+              not $self->{escape})) {
            $self->{state} = 'tag open';
            !!!next-input-character;
            redo A;
          } else {
            #
          }
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         if ($self->{escape} and
+             ($self->{content_model_flag} eq 'RCDATA' or
+              $self->{content_model_flag} eq 'CDATA')) {
+           if ($self->{prev_input_character}->[0] == 0x002D and # -
+               $self->{prev_input_character}->[1] == 0x002D) { # -
+             delete $self->{escape};
+           }
+         }
+         #
        } elsif ($self->{next_input_character} == -1) {
          !!!emit ({type => 'end-of-file'});
          last A; ## TODO: ok?
-Line 485 
 sub _get_next_token ($) {
+Line 253 
 sub _get_next_token ($) {
      } elsif ($self->{state} eq 'entity data') {
        ## (cannot happen in CDATA state)
-       my $token = $self->_tokenize_attempt_to_consume_an_entity;
+       my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
        $self->{state} = 'data';
        # next-input-character is already done
-Line 564 
 sub _get_next_token ($) {
+Line 332 
 sub _get_next_token ($) {
      } elsif ($self->{state} eq 'close tag open') {
        if ($self->{content_model_flag} eq 'RCDATA' or
            $self->{content_model_flag} eq 'CDATA') {
-         my @next_char;
+         if (defined $self->{last_emitted_start_tag_name}) {
-         TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
+           ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
+           my @next_char;
+           TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
+             push @next_char, $self->{next_input_character};
+             my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
+             my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
+             if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
+               !!!next-input-character;
+               next TAGNAME;
+             } else {
+               $self->{next_input_character} = shift @next_char; # reconsume
+               !!!back-next-input-character (@next_char);
+               $self->{state} = 'data';
+               !!!emit ({type => 'character', data => '</'});
+               redo A;
+             }
+           }
            push @next_char, $self->{next_input_character};
-           my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
-           my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
+           unless ($self->{next_input_character} == 0x0009 or # HT
-           if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
+                   $self->{next_input_character} == 0x000A or # LF
-             !!!next-input-character;
+                   $self->{next_input_character} == 0x000B or # VT
-             next TAGNAME;
+                   $self->{next_input_character} == 0x000C or # FF
-           } else {
+                   $self->{next_input_character} == 0x0020 or # SP
-             !!!parse-error (type => 'unmatched end tag');
+                   $self->{next_input_character} == 0x003E or # >
+                   $self->{next_input_character} == 0x002F or # /
+                   $self->{next_input_character} == -1) {
              $self->{next_input_character} = shift @next_char; # reconsume
              !!!back-next-input-character (@next_char);
              $self->{state} = 'data';
              !!!emit ({type => 'character', data => '</'});
              redo A;
+           } else {
+             $self->{next_input_character} = shift @next_char;
+             !!!back-next-input-character (@next_char);
+             # and consume...
            }
-         }
+         } else {
-         push @next_char, $self->{next_input_character};
+           ## No start tag token has ever been emitted
+           # next-input-character is already done
-         unless ($self->{next_input_character} == 0x0009 or # HT
-                 $self->{next_input_character} == 0x000A or # LF
-                 $self->{next_input_character} == 0x000B or # VT
-                 $self->{next_input_character} == 0x000C or # FF
-                 $self->{next_input_character} == 0x0020 or # SP
-                 $self->{next_input_character} == 0x003E or # >
-                 $self->{next_input_character} == 0x002F or # /
-                 $self->{next_input_character} == 0x003C or # <
-                 $self->{next_input_character} == -1) {
-           !!!parse-error (type => 'unmatched end tag');
-           $self->{next_input_character} = shift @next_char; # reconsume
-           !!!back-next-input-character (@next_char);
            $self->{state} = 'data';
            !!!emit ({type => 'character', data => '</'});
            redo A;
-         } else {
-           $self->{next_input_character} = shift @next_char;
-           !!!back-next-input-character (@next_char);
-           # and consume...
          }
        }
-Line 653 
 sub _get_next_token ($) {
+Line 425 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
            $self->{content_model_flag} = 'PCDATA'; # MUST
-Line 666 
 sub _get_next_token ($) {
+Line 440 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } elsif (0x0041 <= $self->{next_input_character} and
-Line 676 
 sub _get_next_token ($) {
+Line 449 
 sub _get_next_token ($) {
          ## Stay in this state
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
            $self->{content_model_flag} = 'PCDATA'; # MUST
-Line 693 
 sub _get_next_token ($) {
+Line 467 
 sub _get_next_token ($) {
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } elsif ($self->{next_input_character} == 0x002F) { # /
-Line 727 
 sub _get_next_token ($) {
+Line 500 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
            $self->{content_model_flag} = 'PCDATA'; # MUST
-Line 740 
 sub _get_next_token ($) {
+Line 515 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } elsif (0x0041 <= $self->{next_input_character} and
-Line 763 
 sub _get_next_token ($) {
+Line 537 
 sub _get_next_token ($) {
          ## Stay in the state
          # next-input-character is already done
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
            $self->{content_model_flag} = 'PCDATA'; # MUST
-Line 780 
 sub _get_next_token ($) {
+Line 555 
 sub _get_next_token ($) {
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 819 
 sub _get_next_token ($) {
+Line 593 
 sub _get_next_token ($) {
        } elsif ($self->{next_input_character} == 0x003E) { # >
          $before_leave->();
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
            $self->{content_model_flag} = 'PCDATA'; # MUST
-Line 832 
 sub _get_next_token ($) {
+Line 608 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } elsif (0x0041 <= $self->{next_input_character} and
-Line 855 
 sub _get_next_token ($) {
+Line 630 
 sub _get_next_token ($) {
          $self->{state} = 'before attribute name';
          # next-input-character is already done
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          $before_leave->();
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
            $self->{content_model_flag} = 'PCDATA'; # MUST
-Line 873 
 sub _get_next_token ($) {
+Line 649 
 sub _get_next_token ($) {
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 897 
 sub _get_next_token ($) {
+Line 672 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
            $self->{content_model_flag} = 'PCDATA'; # MUST
-Line 910 
 sub _get_next_token ($) {
+Line 687 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } elsif (0x0041 <= $self->{next_input_character} and
-Line 929 
 sub _get_next_token ($) {
+Line 705 
 sub _get_next_token ($) {
            #
          } else {
            !!!parse-error (type => 'nestc');
+           ## TODO: Different error type for <aa / bb> than <aa/>
          }
          $self->{state} = 'before attribute name';
          # next-input-character is already done
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
            $self->{content_model_flag} = 'PCDATA'; # MUST
-Line 950 
 sub _get_next_token ($) {
+Line 728 
 sub _get_next_token ($) {
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 983 
 sub _get_next_token ($) {
+Line 760 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
            $self->{content_model_flag} = 'PCDATA'; # MUST
-Line 996 
 sub _get_next_token ($) {
+Line 775 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
            $self->{content_model_flag} = 'PCDATA'; # MUST
-Line 1016 
 sub _get_next_token ($) {
+Line 795 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1038 
 sub _get_next_token ($) {
+Line 816 
 sub _get_next_token ($) {
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed attribute value');
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
            $self->{content_model_flag} = 'PCDATA'; # MUST
-Line 1051 
 sub _get_next_token ($) {
+Line 831 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1073 
 sub _get_next_token ($) {
+Line 852 
 sub _get_next_token ($) {
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed attribute value');
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
            $self->{content_model_flag} = 'PCDATA'; # MUST
-Line 1086 
 sub _get_next_token ($) {
+Line 867 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1111 
 sub _get_next_token ($) {
+Line 891 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
            $self->{content_model_flag} = 'PCDATA'; # MUST
-Line 1124 
 sub _get_next_token ($) {
+Line 906 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
            $self->{content_model_flag} = 'PCDATA'; # MUST
-Line 1144 
 sub _get_next_token ($) {
+Line 926 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1154 
 sub _get_next_token ($) {
+Line 935 
 sub _get_next_token ($) {
          redo A;
        }
      } elsif ($self->{state} eq 'entity in attribute value') {
-       my $token = $self->_tokenize_attempt_to_consume_an_entity;
+       my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
        unless (defined $token) {
          $self->{current_attribute}->{value} .= '&';
-Line 1203 
 sub _get_next_token ($) {
+Line 984 
 sub _get_next_token ($) {
          push @next_char, $self->{next_input_character};
          if ($self->{next_input_character} == 0x002D) { # -
            $self->{current_token} = {type => 'comment', data => ''};
-           $self->{state} = 'comment';
+           $self->{state} = 'comment start';
            !!!next-input-character;
            redo A;
          }
-Line 1245 
 sub _get_next_token ($) {
+Line 1026 
 sub _get_next_token ($) {
          }
        }
-       !!!parse-error (type => 'bogus comment open');
+       !!!parse-error (type => 'bogus comment');
        $self->{next_input_character} = shift @next_char;
        !!!back-next-input-character (@next_char);
        $self->{state} = 'bogus comment';
-Line 1253 
 sub _get_next_token ($) {
+Line 1034 
 sub _get_next_token ($) {
        ## ISSUE: typos in spec: chacacters, is is a parse error
        ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
+     } elsif ($self->{state} eq 'comment start') {
+       if ($self->{next_input_character} == 0x002D) { # -
+         $self->{state} = 'comment start dash';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         !!!parse-error (type => 'bogus comment');
+         $self->{state} = 'data';
+         !!!next-input-character;
+         !!!emit ($self->{current_token}); # comment
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed comment');
+         $self->{state} = 'data';
+         ## reconsume
+         !!!emit ($self->{current_token}); # comment
+         redo A;
+       } else {
+         $self->{current_token}->{data} # comment
+             .= chr ($self->{next_input_character});
+         $self->{state} = 'comment';
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'comment start dash') {
+       if ($self->{next_input_character} == 0x002D) { # -
+         $self->{state} = 'comment end';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         !!!parse-error (type => 'bogus comment');
+         $self->{state} = 'data';
+         !!!next-input-character;
+         !!!emit ($self->{current_token}); # comment
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed comment');
+         $self->{state} = 'data';
+         ## reconsume
+         !!!emit ($self->{current_token}); # comment
+         redo A;
+       } else {
+         $self->{current_token}->{data} # comment
+             .= '-' . chr ($self->{next_input_character});
+         $self->{state} = 'comment';
+         !!!next-input-character;
+         redo A;
+       }
      } elsif ($self->{state} eq 'comment') {
        if ($self->{next_input_character} == 0x002D) { # -
-         $self->{state} = 'comment dash';
+         $self->{state} = 'comment end dash';
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 1264 
 sub _get_next_token ($) {
+Line 1101 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # comment
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1273 
 sub _get_next_token ($) {
+Line 1109 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'comment dash') {
+     } elsif ($self->{state} eq 'comment end dash') {
        if ($self->{next_input_character} == 0x002D) { # -
          $self->{state} = 'comment end';
          !!!next-input-character;
-Line 1284 
 sub _get_next_token ($) {
+Line 1120 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # comment
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1299 
 sub _get_next_token ($) {
+Line 1134 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # comment
-         undef $self->{current_token};
          redo A;
        } elsif ($self->{next_input_character} == 0x002D) { # -
-Line 1314 
 sub _get_next_token ($) {
+Line 1148 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # comment
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1348 
 sub _get_next_token ($) {
+Line 1181 
 sub _get_next_token ($) {
          ## Stay in the state
          !!!next-input-character;
          redo A;
-       } elsif (0x0061 <= $self->{next_input_character} and
-                $self->{next_input_character} <= 0x007A) { # a..z
- ## ISSUE: "Set the token's name name to the" in the spec
-         $self->{current_token} = {type => 'DOCTYPE',
-                           name => chr ($self->{next_input_character} - 0x0020),
-                           error => 1};
-         $self->{state} = 'DOCTYPE name';
-         !!!next-input-character;
-         redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          !!!parse-error (type => 'no DOCTYPE name');
          $self->{state} = 'data';
          !!!next-input-character;
-         !!!emit ({type => 'DOCTYPE', name => '', error => 1});
+         !!!emit ({type => 'DOCTYPE'}); # incorrect
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 1370 
 sub _get_next_token ($) {
+Line 1194 
 sub _get_next_token ($) {
          $self->{state} = 'data';
          ## reconsume
-         !!!emit ({type => 'DOCTYPE', name => '', error => 1});
+         !!!emit ({type => 'DOCTYPE'}); # incorrect
          redo A;
        } else {
-         $self->{current_token} = {type => 'DOCTYPE',
+         $self->{current_token}
-                           name => chr ($self->{next_input_character}),
+             = {type => 'DOCTYPE',
-                           error => 1};
+                name => chr ($self->{next_input_character}),
+                correct => 1};
  ## ISSUE: "Set the token's name name to the" in the spec
          $self->{state} = 'DOCTYPE name';
          !!!next-input-character;
          redo A;
        }
      } elsif ($self->{state} eq 'DOCTYPE name') {
+ ## ISSUE: Redundant "First," in the spec.
        if ($self->{next_input_character} == 0x0009 or # HT
            $self->{next_input_character} == 0x000A or # LF
            $self->{next_input_character} == 0x000B or # VT
            $self->{next_input_character} == 0x000C or # FF
            $self->{next_input_character} == 0x0020) { # SP
-         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
          $self->{state} = 'after DOCTYPE name';
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
-         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
          $self->{state} = 'data';
          !!!next-input-character;
          !!!emit ($self->{current_token}); # DOCTYPE
-         undef $self->{current_token};
          redo A;
-       } elsif (0x0061 <= $self->{next_input_character} and
-                $self->{next_input_character} <= 0x007A) { # a..z
-         $self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE
-         #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
-         ## Stay in the state
-         !!!next-input-character;
-         redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
          $self->{state} = 'data';
          ## reconsume
-         !!!emit ($self->{current_token});
+         delete $self->{current_token}->{correct};
-         undef $self->{current_token};
+         !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
          $self->{current_token}->{name}
            .= chr ($self->{next_input_character}); # DOCTYPE
-         #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
          ## Stay in the state
          !!!next-input-character;
          redo A;
-Line 1440 
 sub _get_next_token ($) {
+Line 1254 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # DOCTYPE
-         undef $self->{current_token};
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 1448 
 sub _get_next_token ($) {
+Line 1261 
 sub _get_next_token ($) {
          $self->{state} = 'data';
          ## reconsume
+         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
-         undef $self->{current_token};
          redo A;
+       } elsif ($self->{next_input_character} == 0x0050 or # P
+                $self->{next_input_character} == 0x0070) { # p
+         !!!next-input-character;
+         if ($self->{next_input_character} == 0x0055 or # U
+             $self->{next_input_character} == 0x0075) { # u
+           !!!next-input-character;
+           if ($self->{next_input_character} == 0x0042 or # B
+               $self->{next_input_character} == 0x0062) { # b
+             !!!next-input-character;
+             if ($self->{next_input_character} == 0x004C or # L
+                 $self->{next_input_character} == 0x006C) { # l
+               !!!next-input-character;
+               if ($self->{next_input_character} == 0x0049 or # I
+                   $self->{next_input_character} == 0x0069) { # i
+                 !!!next-input-character;
+                 if ($self->{next_input_character} == 0x0043 or # C
+                     $self->{next_input_character} == 0x0063) { # c
+                   $self->{state} = 'before DOCTYPE public identifier';
+                   !!!next-input-character;
+                   redo A;
+                 }
+               }
+             }
+           }
+         }
+         #
+       } elsif ($self->{next_input_character} == 0x0053 or # S
+                $self->{next_input_character} == 0x0073) { # s
+         !!!next-input-character;
+         if ($self->{next_input_character} == 0x0059 or # Y
+             $self->{next_input_character} == 0x0079) { # y
+           !!!next-input-character;
+           if ($self->{next_input_character} == 0x0053 or # S
+               $self->{next_input_character} == 0x0073) { # s
+             !!!next-input-character;
+             if ($self->{next_input_character} == 0x0054 or # T
+                 $self->{next_input_character} == 0x0074) { # t
+               !!!next-input-character;
+               if ($self->{next_input_character} == 0x0045 or # E
+                   $self->{next_input_character} == 0x0065) { # e
+                 !!!next-input-character;
+                 if ($self->{next_input_character} == 0x004D or # M
+                     $self->{next_input_character} == 0x006D) { # m
+                   $self->{state} = 'before DOCTYPE system identifier';
+                   !!!next-input-character;
+                   redo A;
+                 }
+               }
+             }
+           }
+         }
+         #
        } else {
-         !!!parse-error (type => 'string after DOCTYPE name');
+         !!!next-input-character;
-         $self->{current_token}->{error} = 1; # DOCTYPE
+         #
+       }
+       !!!parse-error (type => 'string after DOCTYPE name');
+       $self->{state} = 'bogus DOCTYPE';
+       # next-input-character is already done
+       redo A;
+     } elsif ($self->{state} eq 'before DOCTYPE public identifier') {
+       if ({
+             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
+             #0x000D => 1, # HT, LF, VT, FF, SP, CR
+           }->{$self->{next_input_character}}) {
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} eq 0x0022) { # "
+         $self->{current_token}->{public_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE public identifier (double-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} eq 0x0027) { # '
+         $self->{current_token}->{public_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE public identifier (single-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} eq 0x003E) { # >
+         !!!parse-error (type => 'no PUBLIC literal');
+         $self->{state} = 'data';
+         !!!next-input-character;
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed DOCTYPE');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         !!!parse-error (type => 'string after PUBLIC');
+         $self->{state} = 'bogus DOCTYPE';
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'DOCTYPE public identifier (double-quoted)') {
+       if ($self->{next_input_character} == 0x0022) { # "
+         $self->{state} = 'after DOCTYPE public identifier';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed PUBLIC literal');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         $self->{current_token}->{public_identifier} # DOCTYPE
+             .= chr $self->{next_input_character};
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'DOCTYPE public identifier (single-quoted)') {
+       if ($self->{next_input_character} == 0x0027) { # '
+         $self->{state} = 'after DOCTYPE public identifier';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed PUBLIC literal');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         $self->{current_token}->{public_identifier} # DOCTYPE
+             .= chr $self->{next_input_character};
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'after DOCTYPE public identifier') {
+       if ({
+             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
+             #0x000D => 1, # HT, LF, VT, FF, SP, CR
+           }->{$self->{next_input_character}}) {
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0022) { # "
+         $self->{current_token}->{system_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE system identifier (double-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0027) { # '
+         $self->{current_token}->{system_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE system identifier (single-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         $self->{state} = 'data';
+         !!!next-input-character;
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed DOCTYPE');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         !!!parse-error (type => 'string after PUBLIC literal');
+         $self->{state} = 'bogus DOCTYPE';
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'before DOCTYPE system identifier') {
+       if ({
+             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
+             #0x000D => 1, # HT, LF, VT, FF, SP, CR
+           }->{$self->{next_input_character}}) {
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0022) { # "
+         $self->{current_token}->{system_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE system identifier (double-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0027) { # '
+         $self->{current_token}->{system_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE system identifier (single-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         !!!parse-error (type => 'no SYSTEM literal');
+         $self->{state} = 'data';
+         !!!next-input-character;
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed DOCTYPE');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         !!!parse-error (type => 'string after SYSTEM');
+         $self->{state} = 'bogus DOCTYPE';
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'DOCTYPE system identifier (double-quoted)') {
+       if ($self->{next_input_character} == 0x0022) { # "
+         $self->{state} = 'after DOCTYPE system identifier';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed SYSTEM literal');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         $self->{current_token}->{system_identifier} # DOCTYPE
+             .= chr $self->{next_input_character};
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'DOCTYPE system identifier (single-quoted)') {
+       if ($self->{next_input_character} == 0x0027) { # '
+         $self->{state} = 'after DOCTYPE system identifier';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed SYSTEM literal');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         $self->{current_token}->{system_identifier} # DOCTYPE
+             .= chr $self->{next_input_character};
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'after DOCTYPE system identifier') {
+       if ({
+             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
+             #0x000D => 1, # HT, LF, VT, FF, SP, CR
+           }->{$self->{next_input_character}}) {
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         $self->{state} = 'data';
+         !!!next-input-character;
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed DOCTYPE');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         !!!parse-error (type => 'string after SYSTEM literal');
          $self->{state} = 'bogus DOCTYPE';
          !!!next-input-character;
          redo A;
-Line 1464 
 sub _get_next_token ($) {
+Line 1580 
 sub _get_next_token ($) {
          $self->{state} = 'data';
          !!!next-input-character;
+         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
-         undef $self->{current_token};
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 1473 
 sub _get_next_token ($) {
+Line 1589 
 sub _get_next_token ($) {
          $self->{state} = 'data';
          ## reconsume
+         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1490 
 sub _get_next_token ($) {
+Line 1606 
 sub _get_next_token ($) {
    die "$0: _get_next_token: unexpected case";
  } # _get_next_token
- sub _tokenize_attempt_to_consume_an_entity ($) {
+ sub _tokenize_attempt_to_consume_an_entity ($$) {
-   my $self = shift;
+   my ($self, $in_attr) = @_;
-   if ($self->{next_input_character} == 0x0023) { # #
+   if ({
+         0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
+         0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
+       }->{$self->{next_input_character}}) {
+     ## Don't consume
+     ## No error
+     return undef;
+   } elsif ($self->{next_input_character} == 0x0023) { # #
      !!!next-input-character;
      if ($self->{next_input_character} == 0x0078 or # x
          $self->{next_input_character} == 0x0058) { # X
-       my $num;
+       my $code;
        X: {
          my $x_char = $self->{next_input_character};
          !!!next-input-character;
          if (0x0030 <= $self->{next_input_character} and
              $self->{next_input_character} <= 0x0039) { # 0..9
-           $num ||= 0;
+           $code ||= 0;
-           $num *= 0x10;
+           $code *= 0x10;
-           $num += $self->{next_input_character} - 0x0030;
+           $code += $self->{next_input_character} - 0x0030;
            redo X;
          } elsif (0x0061 <= $self->{next_input_character} and
                   $self->{next_input_character} <= 0x0066) { # a..f
-           ## ISSUE: the spec says U+0078, which is apparently incorrect
+           $code ||= 0;
-           $num ||= 0;
+           $code *= 0x10;
-           $num *= 0x10;
+           $code += $self->{next_input_character} - 0x0060 + 9;
-           $num += $self->{next_input_character} - 0x0060 + 9;
            redo X;
          } elsif (0x0041 <= $self->{next_input_character} and
                   $self->{next_input_character} <= 0x0046) { # A..F
-           ## ISSUE: the spec says U+0058, which is apparently incorrect
+           $code ||= 0;
-           $num ||= 0;
+           $code *= 0x10;
-           $num *= 0x10;
+           $code += $self->{next_input_character} - 0x0040 + 9;
-           $num += $self->{next_input_character} - 0x0040 + 9;
            redo X;
-         } elsif (not defined $num) { # no hexadecimal digit
+         } elsif (not defined $code) { # no hexadecimal digit
            !!!parse-error (type => 'bare hcro');
            $self->{next_input_character} = 0x0023; # #
            !!!back-next-input-character ($x_char);
-Line 1532 
 sub _tokenize_attempt_to_consume_an_enti
+Line 1653 
 sub _tokenize_attempt_to_consume_an_enti
            !!!parse-error (type => 'no refc');
          }
-         ## TODO: check the definition for |a valid Unicode character|.
+         if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
-         ## <http://lists.whatwg.org/pipermail/whatwg-whatwg.org/2006-December/thread.html#8189>
+           !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
-         if ($num > 1114111 or $num == 0) {
+           $code = 0xFFFD;
-           $num = 0xFFFD; # REPLACEMENT CHARACTER
+         } elsif ($code > 0x10FFFF) {
-           ## ISSUE: Why this is not an error?
+           !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
-         } elsif (0x80 <= $num and $num <= 0x9F) {
+           $code = 0xFFFD;
-           !!!parse-error (type => sprintf 'c1 entity:U+%04X', $num);
+         } elsif ($code == 0x000D) {
-           $num = $c1_entity_char->{$num};
+           !!!parse-error (type => 'CR character reference');
+           $code = 0x000A;
+         } elsif (0x80 <= $code and $code <= 0x9F) {
+           !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
+           $code = $c1_entity_char->{$code};
          }
-         return {type => 'character', data => chr $num};
+         return {type => 'character', data => chr $code};
        } # X
      } elsif (0x0030 <= $self->{next_input_character} and
               $self->{next_input_character} <= 0x0039) { # 0..9
-Line 1563 
 sub _tokenize_attempt_to_consume_an_enti
+Line 1688 
 sub _tokenize_attempt_to_consume_an_enti
          !!!parse-error (type => 'no refc');
        }
-       ## TODO: check the definition for |a valid Unicode character|.
+       if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
-       if ($code > 1114111 or $code == 0) {
+         !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
-         $code = 0xFFFD; # REPLACEMENT CHARACTER
+         $code = 0xFFFD;
-         ## ISSUE: Why this is not an error?
+       } elsif ($code > 0x10FFFF) {
+         !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
+         $code = 0xFFFD;
+       } elsif ($code == 0x000D) {
+         !!!parse-error (type => 'CR character reference');
+         $code = 0x000A;
        } elsif (0x80 <= $code and $code <= 0x9F) {
-         !!!parse-error (type => sprintf 'c1 entity:U+%04X', $code);
+         !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
          $code = $c1_entity_char->{$code};
        }
-Line 1588 
 sub _tokenize_attempt_to_consume_an_enti
+Line 1718 
 sub _tokenize_attempt_to_consume_an_enti
      my $value = $entity_name;
      my $match;
+     require Whatpm::_NamedEntityList;
+     our $EntityChar;
      while (length $entity_name < 10 and
             ## NOTE: Some number greater than the maximum length of entity name
-            ((0x0041 <= $self->{next_input_character} and
+            ((0x0041 <= $self->{next_input_character} and # A
-              $self->{next_input_character} <= 0x005A) or
+              $self->{next_input_character} <= 0x005A) or # Z
-             (0x0061 <= $self->{next_input_character} and
+             (0x0061 <= $self->{next_input_character} and # a
-              $self->{next_input_character} <= 0x007A) or
+              $self->{next_input_character} <= 0x007A) or # z
-             (0x0030 <= $self->{next_input_character} and
+             (0x0030 <= $self->{next_input_character} and # 0
-              $self->{next_input_character} <= 0x0039))) {
+              $self->{next_input_character} <= 0x0039) or # 9
+             $self->{next_input_character} == 0x003B)) { # ;
        $entity_name .= chr $self->{next_input_character};
-       if (defined $entity_char->{$entity_name}) {
+       if (defined $EntityChar->{$entity_name}) {
-         $value = $entity_char->{$entity_name};
+         if ($self->{next_input_character} == 0x003B) { # ;
-         $match = 1;
+           $value = $EntityChar->{$entity_name};
+           $match = 1;
+           !!!next-input-character;
+           last;
+         } elsif (not $in_attr) {
+           $value = $EntityChar->{$entity_name};
+           $match = -1;
+         } else {
+           $value .= chr $self->{next_input_character};
+         }
        } else {
          $value .= chr $self->{next_input_character};
        }
        !!!next-input-character;
      }
-     if ($match) {
+     if ($match > 0) {
-       if ($self->{next_input_character} == 0x003B) { # ;
+       return {type => 'character', data => $value};
-         !!!next-input-character;
+     } elsif ($match < 0) {
-       } else {
+       !!!parse-error (type => 'no refc');
-         !!!parse-error (type => 'refc');
-       }
        return {type => 'character', data => $value};
      } else {
        !!!parse-error (type => 'bare ero');
        ## NOTE: No characters are consumed in the spec.
-       !!!back-token ({type => 'character', data => $value});
+       return {type => 'character', data => '&'.$value};
-       return undef;
      }
    } else {
      ## no characters are consumed
-Line 1634 
 sub _initialize_tree_constructor ($) {
+Line 1772 
 sub _initialize_tree_constructor ($) {
    $self->{document}->strict_error_checking (0);
    ## TODO: Turn mutation events off # MUST
    ## TODO: Turn loose Document option (manakai extension) on
-   ## TODO: Mark the Document as an HTML document # MUST
+   $self->{document}->manakai_is_html (1); # MUST
  } # _initialize_tree_constructor
  sub _terminate_tree_constructor ($) {
-Line 1674 
 sub _construct_tree ($) {
+Line 1812 
 sub _construct_tree ($) {
  sub _tree_construction_initial ($) {
    my $self = shift;
-   B: {
+   INITIAL: {
-       if ($token->{type} eq 'DOCTYPE') {
+     if ($token->{type} eq 'DOCTYPE') {
-         if ($token->{error}) {
+       ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
-           ## ISSUE: Spec currently left this case undefined.
+       ## error, switch to a conformance checking mode for another
-           !!!parse-error (type => 'bogus DOCTYPE');
+       ## language.
-         }
+       my $doctype_name = $token->{name};
-         my $doctype = $self->{document}->create_document_type_definition
+       $doctype_name = '' unless defined $doctype_name;
-           ($token->{name});
+       $doctype_name =~ tr/a-z/A-Z/;
-         $self->{document}->append_child ($doctype);
+       if (not defined $token->{name} or # <!DOCTYPE>
-         #$phase = 'root element';
+           defined $token->{public_identifier} or
-         !!!next-token;
+           defined $token->{system_identifier}) {
-         #redo B;
+         !!!parse-error (type => 'not HTML5');
-         return;
+       } elsif ($doctype_name ne 'HTML') {
-       } elsif ({
+         ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
-                 comment => 1,
+         !!!parse-error (type => 'not HTML5');
-                 'start tag' => 1,
+       }
-                 'end tag' => 1,
-                 'end-of-file' => 1,
+       my $doctype = $self->{document}->create_document_type_definition
-                }->{$token->{type}}) {
+         ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
-         ## ISSUE: Spec currently left this case undefined.
+       $doctype->public_id ($token->{public_identifier})
-         !!!parse-error (type => 'missing DOCTYPE');
+           if defined $token->{public_identifier};
-         #$phase = 'root element';
+       $doctype->system_id ($token->{system_identifier})
-         ## reprocess
+           if defined $token->{system_identifier};
-         #redo B;
+       ## NOTE: Other DocumentType attributes are null or empty lists.
-         return;
+       ## ISSUE: internalSubset = null??
-       } elsif ($token->{type} eq 'character') {
+       $self->{document}->append_child ($doctype);
-         if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
-           $self->{document}->manakai_append_text ($1);
+       if (not $token->{correct} or $doctype_name ne 'HTML') {
-           ## ISSUE: DOM3 Core does not allow Document > Text
+         $self->{document}->manakai_compat_mode ('quirks');
-           unless (length $token->{data}) {
+       } elsif (defined $token->{public_identifier}) {
-             ## Stay in the phase
+         my $pubid = $token->{public_identifier};
-             !!!next-token;
+         $pubid =~ tr/a-z/A-z/;
-             redo B;
+         if ({
+           "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
+           "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
+           "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
+           "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
+           "-//IETF//DTD HTML 2.0//EN" => 1,
+           "-//IETF//DTD HTML 2.1E//EN" => 1,
+           "-//IETF//DTD HTML 3.0//EN" => 1,
+           "-//IETF//DTD HTML 3.0//EN//" => 1,
+           "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
+           "-//IETF//DTD HTML 3.2//EN" => 1,
+           "-//IETF//DTD HTML 3//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 0//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
+           "-//IETF//DTD HTML LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
+           "-//IETF//DTD HTML LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
+           "-//IETF//DTD HTML LEVEL 3//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
+           "-//IETF//DTD HTML STRICT//EN" => 1,
+           "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
+           "-//IETF//DTD HTML//EN" => 1,
+           "-//IETF//DTD HTML//EN//2.0" => 1,
+           "-//IETF//DTD HTML//EN//3.0" => 1,
+           "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
+           "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
+           "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
+           "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
+           "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
+           "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
+           "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
+           "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
+           "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
+           "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
+           "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
+           "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
+           "-//W3C//DTD HTML 3.2//EN" => 1,
+           "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
+           "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
+           "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
+           "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
+           "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
+           "-//W3C//DTD W3 HTML//EN" => 1,
+           "-//W3O//DTD W3 HTML 3.0//EN" => 1,
+           "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
+           "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
+           "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
+           "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
+           "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
+           "HTML" => 1,
+         }->{$pubid}) {
+           $self->{document}->manakai_compat_mode ('quirks');
+         } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
+                  $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
+           if (defined $token->{system_identifier}) {
+             $self->{document}->manakai_compat_mode ('quirks');
+           } else {
+             $self->{document}->manakai_compat_mode ('limited quirks');
            }
+         } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 Frameset//EN" or
+                  $pubid eq "-//W3C//DTD XHTML 1.0 Transitional//EN") {
+           $self->{document}->manakai_compat_mode ('limited quirks');
+         }
+       }
+       if (defined $token->{system_identifier}) {
+         my $sysid = $token->{system_identifier};
+         $sysid =~ tr/A-Z/a-z/;
+         if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
+           $self->{document}->manakai_compat_mode ('quirks');
+         }
+       }
+       ## Go to the root element phase.
+       !!!next-token;
+       return;
+     } elsif ({
+               'start tag' => 1,
+               'end tag' => 1,
+               'end-of-file' => 1,
+              }->{$token->{type}}) {
+       !!!parse-error (type => 'no DOCTYPE');
+       $self->{document}->manakai_compat_mode ('quirks');
+       ## Go to the root element phase
+       ## reprocess
+       return;
+     } elsif ($token->{type} eq 'character') {
+       if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
+         ## Ignore the token
+         unless (length $token->{data}) {
+           ## Stay in the phase
+           !!!next-token;
+           redo INITIAL;
          }
-         ## ISSUE: Spec currently left this case undefined.
-         !!!parse-error (type => 'missing DOCTYPE');
-         #$phase = 'root element';
-         ## reprocess
-         #redo B;
-         return;
-       } else {
-         die "$0: $token->{type}: Unknown token";
        }
-     } # B
+       !!!parse-error (type => 'no DOCTYPE');
+       $self->{document}->manakai_compat_mode ('quirks');
+       ## Go to the root element phase
+       ## reprocess
+       return;
+     } elsif ($token->{type} eq 'comment') {
+       my $comment = $self->{document}->create_comment ($token->{data});
+       $self->{document}->append_child ($comment);
+       ## Stay in the phase.
+       !!!next-token;
+       redo INITIAL;
+     } else {
+       die "$0: $token->{type}: Unknown token";
+     }
+   } # INITIAL
  } # _tree_construction_initial
  sub _tree_construction_root_element ($) {
-Line 1738 
 sub _tree_construction_root_element ($)
+Line 1995 
 sub _tree_construction_root_element ($)
          !!!next-token;
          redo B;
        } elsif ($token->{type} eq 'character') {
-         if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
+         if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
-           $self->{document}->manakai_append_text ($1);
+           ## Ignore the token.
-           ## ISSUE: DOM3 Core does not allow Document > Text
            unless (length $token->{data}) {
              ## Stay in the phase
              !!!next-token;
-Line 1761 
 sub _tree_construction_root_element ($)
+Line 2018 
 sub _tree_construction_root_element ($)
        my $root_element; !!!create-element ($root_element, 'html');
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}}, [$root_element, 'html'];
-       #$phase = 'main';
        ## reprocess
        #redo B;
-       return;
+       return; ## Go to the main phase.
    } # B
  } # _tree_construction_root_element
-Line 1780 
 sub _reset_insertion_mode ($) {
+Line 2036 
 sub _reset_insertion_mode ($) {
      ## Step 3
      S3: {
-       $last = 1 if $self->{open_elements}->[0]->[0] eq $node->[0];
+       ## ISSUE: Oops! "If node is the first node in the stack of open
-       if (defined $self->{inner_html_node}) {
+       ## elements, then set last to true. If the context element of the
-         if ($self->{inner_html_node}->[1] eq 'td' or
+       ## HTML fragment parsing algorithm is neither a td element nor a
-             $self->{inner_html_node}->[1] eq 'th') {
+       ## th element, then set node to the context element. (fragment case)":
-           #
+       ## The second "if" is in the scope of the first "if"!?
-         } else {
+       if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
-           $node = $self->{inner_html_node};
+         $last = 1;
+         if (defined $self->{inner_html_node}) {
+           if ($self->{inner_html_node}->[1] eq 'td' or
+               $self->{inner_html_node}->[1] eq 'th') {
+             #
+           } else {
+             $node = $self->{inner_html_node};
+           }
          }
        }
-Line 1833 
 sub _reset_insertion_mode ($) {
+Line 2096 
 sub _reset_insertion_mode ($) {
  sub _tree_construction_main ($) {
    my $self = shift;
-   my $phase = 'main';
+   my $previous_insertion_mode;
    my $active_formatting_elements = [];
-Line 1917 
 sub _tree_construction_main ($) {
+Line 2180 
 sub _tree_construction_main ($) {
      }
    }; # $clear_up_to_marker
-   my $style_start_tag = sub {
+   my $parse_rcdata = sub ($$) {
-     my $style_el; !!!create-element ($style_el, 'style', $token->{attributes});
+     my ($content_model_flag, $insert) = @_;
-     ## $self->{insertion_mode} eq 'in head' and ... (always true)
-     (($self->{insertion_mode} eq 'in head' and defined $self->{head_element})
+     ## Step 1
-      ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
+     my $start_tag_name = $token->{tag_name};
-       ->append_child ($style_el);
+     my $el;
-     $self->{content_model_flag} = 'CDATA';
+     !!!create-element ($el, $start_tag_name, $token->{attributes});
+     ## Step 2
+     $insert->($el); # /context node/->append_child ($el)
+     ## Step 3
+     $self->{content_model_flag} = $content_model_flag; # CDATA or RCDATA
+     delete $self->{escape}; # MUST
+     ## Step 4
      my $text = '';
      !!!next-token;
-     while ($token->{type} eq 'character') {
+     while ($token->{type} eq 'character') { # or until stop tokenizing
        $text .= $token->{data};
        !!!next-token;
-     } # stop if non-character token or tokenizer stops tokenising
+     }
+     ## Step 5
      if (length $text) {
-       $style_el->manakai_append_text ($text);
+       my $text = $self->{document}->create_text_node ($text);
+       $el->append_child ($text);
      }
+     ## Step 6
      $self->{content_model_flag} = 'PCDATA';
-     if ($token->{type} eq 'end tag' and $token->{tag_name} eq 'style') {
+     ## Step 7
+     if ($token->{type} eq 'end tag' and $token->{tag_name} eq $start_tag_name) {
        ## Ignore the token
      } else {
-       !!!parse-error (type => 'in CDATA:#'.$token->{type});
+       !!!parse-error (type => 'in '.$content_model_flag.':#'.$token->{type});
-       ## ISSUE: And ignore?
      }
      !!!next-token;
-   }; # $style_start_tag
+   }; # $parse_rcdata
-   my $script_start_tag = sub {
+   my $script_start_tag = sub ($) {
+     my $insert = $_[0];
      my $script_el;
      !!!create-element ($script_el, 'script', $token->{attributes});
      ## TODO: mark as "parser-inserted"
      $self->{content_model_flag} = 'CDATA';
+     delete $self->{escape}; # MUST
      my $text = '';
      !!!next-token;
-Line 1979 
 sub _tree_construction_main ($) {
+Line 2256 
 sub _tree_construction_main ($) {
      } else {
        ## TODO: $old_insertion_point = current insertion point
        ## TODO: insertion point = just before the next input character
-       (($self->{insertion_mode} eq 'in head' and defined $self->{head_element})
+       $insert->($script_el);
-        ? $self->{head_element} : $self->{open_elements}->[-1]->[0])->append_child ($script_el);
        ## TODO: insertion point = $old_insertion_point (might be "undefined")
-Line 2175 
 sub _tree_construction_main ($) {
+Line 2451 
 sub _tree_construction_main ($) {
    }; # $formatting_end_tag
    my $insert_to_current = sub {
-     $self->{open_elements}->[-1]->[0]->append_child (shift);
+     $self->{open_elements}->[-1]->[0]->append_child ($_[0]);
    }; # $insert_to_current
    my $insert_to_foster = sub {
-Line 2213 
 sub _tree_construction_main ($) {
+Line 2489 
 sub _tree_construction_main ($) {
      my $insert = shift;
      if ($token->{type} eq 'start tag') {
        if ($token->{tag_name} eq 'script') {
-         $script_start_tag->();
+         ## NOTE: This is an "as if in head" code clone
+         $script_start_tag->($insert);
          return;
        } elsif ($token->{tag_name} eq 'style') {
-         $style_start_tag->();
+         ## NOTE: This is an "as if in head" code clone
+         $parse_rcdata->('CDATA', $insert);
          return;
        } elsif ({
-                 base => 1, link => 1, meta => 1,
+                 base => 1, link => 1,
                 }->{$token->{tag_name}}) {
-         !!!parse-error (type => 'in body:'.$token->{tag_name});
+         ## NOTE: This is an "as if in head" code clone, only "-t" differs
-         ## NOTE: This is an "as if in head" code clone
+         !!!insert-element-t ($token->{tag_name}, $token->{attributes});
-         my $el;
+         pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
-         !!!create-element ($el, $token->{tag_name}, $token->{attributes});
+         !!!next-token;
-         if (defined $self->{head_element}) {
+         return;
-           $self->{head_element}->append_child ($el);
+       } elsif ($token->{tag_name} eq 'meta') {
-         } else {
+         ## NOTE: This is an "as if in head" code clone, only "-t" differs
-           $insert->($el);
+         !!!insert-element-t ($token->{tag_name}, $token->{attributes});
+         pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
+         unless ($self->{confident}) {
+           my $charset;
+           if ($token->{attributes}->{charset}) { ## TODO: And if supported
+             $charset = $token->{attributes}->{charset}->{value};
+           }
+           if ($token->{attributes}->{'http-equiv'}) {
+             ## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to its definition.
+             if ($token->{attributes}->{'http-equiv'}->{value}
+                 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
+                     [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
+                     ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
+               $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
+             } ## TODO: And if supported
+           }
+           ## TODO: Change the encoding
          }
          !!!next-token;
          return;
        } elsif ($token->{tag_name} eq 'title') {
          !!!parse-error (type => 'in body:title');
-         ## NOTE: There is an "as if in head" code clone
+         ## NOTE: This is an "as if in head" code clone
-         my $title_el;
+         $parse_rcdata->('RCDATA', sub {
-         !!!create-element ($title_el, 'title', $token->{attributes});
+           if (defined $self->{head_element}) {
-         (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
+             $self->{head_element}->append_child ($_[0]);
-           ->append_child ($title_el);
+           } else {
-         $self->{content_model_flag} = 'RCDATA';
+             $insert->($_[0]);
+           }
-         my $text = '';
+         });
-         !!!next-token;
-         while ($token->{type} eq 'character') {
-           $text .= $token->{data};
-           !!!next-token;
-         }
-         if (length $text) {
-           $title_el->manakai_append_text ($text);
-         }
-         $self->{content_model_flag} = 'PCDATA';
-         if ($token->{type} eq 'end tag' and
-             $token->{tag_name} eq 'title') {
-           ## Ignore the token
-         } else {
-           !!!parse-error (type => 'in RCDATA:#'.$token->{type});
-           ## ISSUE: And ignore?
-         }
-         !!!next-token;
          return;
        } elsif ($token->{tag_name} eq 'body') {
          !!!parse-error (type => 'in body:body');
-Line 2364 
 sub _tree_construction_main ($) {
+Line 2640 
 sub _tree_construction_main ($) {
              if ($i != -1) {
                !!!parse-error (type => 'end tag missing:'.
                                $self->{open_elements}->[-1]->[1]);
-               ## TODO: test
              }
              splice @{$self->{open_elements}}, $i;
              last LI;
-Line 2412 
 sub _tree_construction_main ($) {
+Line 2687 
 sub _tree_construction_main ($) {
              if ($i != -1) {
                !!!parse-error (type => 'end tag missing:'.
                                $self->{open_elements}->[-1]->[1]);
-               ## TODO: test
              }
              splice @{$self->{open_elements}}, $i;
              last LI;
-Line 2475 
 sub _tree_construction_main ($) {
+Line 2749 
 sub _tree_construction_main ($) {
            }
          } # INSCOPE
+         ## NOTE: See <http://html5.org/tools/web-apps-tracker?from=925&to=926>
          ## has an element in scope
-         my $i;
+         #my $i;
-         INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
+         #INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
-           my $node = $self->{open_elements}->[$_];
+         #  my $node = $self->{open_elements}->[$_];
-           if ({
+         #  if ({
-                h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
+         #       h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
-               }->{$node->[1]}) {
+         #      }->{$node->[1]}) {
-             $i = $_;
+         #    $i = $_;
-             last INSCOPE;
+         #    last INSCOPE;
-           } elsif ({
+         #  } elsif ({
-                     table => 1, caption => 1, td => 1, th => 1,
+         #            table => 1, caption => 1, td => 1, th => 1,
-                     button => 1, marquee => 1, object => 1, html => 1,
+         #            button => 1, marquee => 1, object => 1, html => 1,
-                    }->{$node->[1]}) {
+         #           }->{$node->[1]}) {
-             last INSCOPE;
+         #    last INSCOPE;
-           }
+         #  }
-         } # INSCOPE
+         #} # INSCOPE
+         #
-         if (defined $i) {
+         #if (defined $i) {
-           !!!parse-error (type => 'in hn:hn');
+         #  !!! parse-error (type => 'in hn:hn');
-           splice @{$self->{open_elements}}, $i;
+         #  splice @{$self->{open_elements}}, $i;
-         }
+         #}
          !!!insert-element-t ($token->{tag_name}, $token->{attributes});
-Line 2538 
 sub _tree_construction_main ($) {
+Line 2813 
 sub _tree_construction_main ($) {
          return;
        } elsif ({
                  b => 1, big => 1, em => 1, font => 1, i => 1,
-                 nobr => 1, s => 1, small => 1, strile => 1,
+                 s => 1, small => 1, strike => 1, ## NOTE: |strike| is the formatting element name
                  strong => 1, tt => 1, u => 1,
                 }->{$token->{tag_name}}) {
          $reconstruct_active_formatting_elements->($insert_to_current);
-Line 2548 
 sub _tree_construction_main ($) {
+Line 2823 
 sub _tree_construction_main ($) {
          !!!next-token;
          return;
+       } elsif ($token->{tag_name} eq 'nobr') {
+         $reconstruct_active_formatting_elements->($insert_to_current);
+         ## has a |nobr| element in scope
+         INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
+           my $node = $self->{open_elements}->[$_];
+           if ($node->[1] eq 'nobr') {
+             !!!parse-error (type => 'not closed:nobr');
+             !!!back-token;
+             $token = {type => 'end tag', tag_name => 'nobr'};
+             return;
+           } elsif ({
+                     table => 1, caption => 1, td => 1, th => 1,
+                     button => 1, marquee => 1, object => 1, html => 1,
+                    }->{$node->[1]}) {
+             last INSCOPE;
+           }
+         } # INSCOPE
+         !!!insert-element-t ($token->{tag_name}, $token->{attributes});
+         push @$active_formatting_elements, $self->{open_elements}->[-1];
+         !!!next-token;
+         return;
        } elsif ($token->{tag_name} eq 'button') {
          ## has a button element in scope
          INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
-Line 2583 
 sub _tree_construction_main ($) {
+Line 2882 
 sub _tree_construction_main ($) {
          return;
        } elsif ($token->{tag_name} eq 'xmp') {
          $reconstruct_active_formatting_elements->($insert_to_current);
+         $parse_rcdata->('CDATA', $insert);
-         !!!insert-element-t ($token->{tag_name}, $token->{attributes});
-         $self->{content_model_flag} = 'CDATA';
-         !!!next-token;
          return;
        } elsif ($token->{tag_name} eq 'table') {
          ## has a p element in scope
-Line 2620 
 sub _tree_construction_main ($) {
+Line 2914 
 sub _tree_construction_main ($) {
            !!!parse-error (type => 'image');
            $token->{tag_name} = 'img';
          }
+         ## NOTE: There is an "as if <br>" code clone.
          $reconstruct_active_formatting_elements->($insert_to_current);
          !!!insert-element-t ($token->{tag_name}, $token->{attributes});
-Line 2666 
 sub _tree_construction_main ($) {
+Line 2961 
 sub _tree_construction_main ($) {
            return;
          } else {
            my $at = $token->{attributes};
+           my $form_attrs;
+           $form_attrs->{action} = $at->{action} if $at->{action};
+           my $prompt_attr = $at->{prompt};
            $at->{name} = {name => 'name', value => 'isindex'};
+           delete $at->{action};
+           delete $at->{prompt};
            my @tokens = (
-                         {type => 'start tag', tag_name => 'form'},
+                         {type => 'start tag', tag_name => 'form',
+                          attributes => $form_attrs},
                          {type => 'start tag', tag_name => 'hr'},
                          {type => 'start tag', tag_name => 'p'},
                          {type => 'start tag', tag_name => 'label'},
-                         {type => 'character',
+                        );
-                          data => 'This is a searchable index. Insert your search keywords here: '}, # SHOULD
+           if ($prompt_attr) {
-                         ## TODO: make this configurable
+             push @tokens, {type => 'character', data => $prompt_attr->{value}};
+           } else {
+             push @tokens, {type => 'character',
+                            data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
+             ## TODO: make this configurable
+           }
+           push @tokens,
                          {type => 'start tag', tag_name => 'input', attributes => $at},
                          #{type => 'character', data => ''}, # SHOULD
                          {type => 'end tag', tag_name => 'label'},
                          {type => 'end tag', tag_name => 'p'},
                          {type => 'start tag', tag_name => 'hr'},
-                         {type => 'end tag', tag_name => 'form'},
+                         {type => 'end tag', tag_name => 'form'};
-                        );
            $token = shift @tokens;
            !!!back-token (@tokens);
            return;
          }
-       } elsif ({
+       } elsif ($token->{tag_name} eq 'textarea') {
-                 textarea => 1,
-                 iframe => 1,
-                 noembed => 1,
-                 noframes => 1,
-                 noscript => 0, ## TODO: 1 if scripting is enabled
-                }->{$token->{tag_name}}) {
          my $tag_name = $token->{tag_name};
          my $el;
          !!!create-element ($el, $token->{tag_name}, $token->{attributes});
-         if ($token->{tag_name} eq 'textarea') {
+         ## TODO: $self->{form_element} if defined
-           ## TODO: $self->{form_element} if defined
+         $self->{content_model_flag} = 'RCDATA';
-           $self->{content_model_flag} = 'RCDATA';
+         delete $self->{escape}; # MUST
-         } else {
-           $self->{content_model_flag} = 'CDATA';
-         }
          $insert->($el);
          my $text = '';
-         if ($token->{tag_name} eq 'textarea') {
+         !!!next-token;
-           !!!next-token;
+         if ($token->{type} eq 'character') {
-           if ($token->{type} eq 'character') {
+           $token->{data} =~ s/^\x0A//;
-             $token->{data} =~ s/^\x0A//;
+           unless (length $token->{data}) {
-             unless (length $token->{data}) {
+             !!!next-token;
-               !!!next-token;
-             }
            }
-         } else {
-           !!!next-token;
          }
          while ($token->{type} eq 'character') {
            $text .= $token->{data};
-Line 2732 
 sub _tree_construction_main ($) {
+Line 3025 
 sub _tree_construction_main ($) {
              $token->{tag_name} eq $tag_name) {
            ## Ignore the token
          } else {
-           if ($token->{tag_name} eq 'textarea') {
+           !!!parse-error (type => 'in RCDATA:#'.$token->{type});
-             !!!parse-error (type => 'in RCDATA:#'.$token->{type});
-           } else {
-             !!!parse-error (type => 'in CDATA:#'.$token->{type});
-           }
-           ## ISSUE: And ignore?
          }
          !!!next-token;
          return;
+       } elsif ({
+                 iframe => 1,
+                 noembed => 1,
+                 noframes => 1,
+                 noscript => 0, ## TODO: 1 if scripting is enabled
+                }->{$token->{tag_name}}) {
+         $parse_rcdata->('CDATA', $insert);
+         return;
        } elsif ($token->{tag_name} eq 'select') {
          $reconstruct_active_formatting_elements->($insert_to_current);
-Line 2771 
 sub _tree_construction_main ($) {
+Line 3067 
 sub _tree_construction_main ($) {
        }
      } elsif ($token->{type} eq 'end tag') {
        if ($token->{tag_name} eq 'body') {
-         if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
+         if (@{$self->{open_elements}} > 1 and
-           ## ISSUE: There is an issue in the spec.
+             $self->{open_elements}->[1]->[1] eq 'body') {
-           if ($self->{open_elements}->[-1]->[1] ne 'body') {
+           for (@{$self->{open_elements}}) {
-             !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
+             unless ({
+                        dd => 1, dt => 1, li => 1, p => 1, td => 1,
+                        th => 1, tr => 1, body => 1, html => 1,
+                      tbody => 1, tfoot => 1, thead => 1,
+                     }->{$_->[1]}) {
+               !!!parse-error (type => 'not closed:'.$_->[1]);
+             }
            }
            $self->{insertion_mode} = 'after body';
            !!!next-token;
            return;
-Line 2804 
 sub _tree_construction_main ($) {
+Line 3107 
 sub _tree_construction_main ($) {
                  address => 1, blockquote => 1, center => 1, dir => 1,
                  div => 1, dl => 1, fieldset => 1, listing => 1,
                  menu => 1, ol => 1, pre => 1, ul => 1,
-                 form => 1,
                  p => 1,
                  dd => 1, dt => 1, li => 1,
                  button => 1, marquee => 1, object => 1,
-Line 2821 
 sub _tree_construction_main ($) {
+Line 3123 
 sub _tree_construction_main ($) {
                   li => ($token->{tag_name} ne 'li'),
                   p => ($token->{tag_name} ne 'p'),
                   td => 1, th => 1, tr => 1,
+                  tbody => 1, tfoot=> 1, thead => 1,
                  }->{$self->{open_elements}->[-1]->[1]}) {
                !!!back-token;
                $token = {type => 'end tag',
-Line 2838 
 sub _tree_construction_main ($) {
+Line 3141 
 sub _tree_construction_main ($) {
          } # INSCOPE
          if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
-           !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
+           if (defined $i) {
+             !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
+           } else {
+             !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
+           }
          }
-         splice @{$self->{open_elements}}, $i if defined $i;
+         if (defined $i) {
-         undef $self->{form_element} if $token->{tag_name} eq 'form';
+           splice @{$self->{open_elements}}, $i;
+         } elsif ($token->{tag_name} eq 'p') {
+           ## As if <p>, then reprocess the current token
+           my $el;
+           !!!create-element ($el, 'p');
+           $insert->($el);
+         }
          $clear_up_to_marker->()
            if {
              button => 1, marquee => 1, object => 1,
            }->{$token->{tag_name}};
          !!!next-token;
          return;
+       } elsif ($token->{tag_name} eq 'form') {
+         ## has an element in scope
+         INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
+           my $node = $self->{open_elements}->[$_];
+           if ($node->[1] eq $token->{tag_name}) {
+             ## generate implied end tags
+             if ({
+                  dd => 1, dt => 1, li => 1, p => 1,
+                  td => 1, th => 1, tr => 1,
+                  tbody => 1, tfoot=> 1, thead => 1,
+                 }->{$self->{open_elements}->[-1]->[1]}) {
+               !!!back-token;
+               $token = {type => 'end tag',
+                         tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
+               return;
+             }
+             last INSCOPE;
+           } elsif ({
+                     table => 1, caption => 1, td => 1, th => 1,
+                     button => 1, marquee => 1, object => 1, html => 1,
+                    }->{$node->[1]}) {
+             last INSCOPE;
+           }
+         } # INSCOPE
+         if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
+           pop @{$self->{open_elements}};
+         } else {
+           !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
+         }
+         undef $self->{form_element};
+         !!!next-token;
+         return;
        } elsif ({
                  h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
                 }->{$token->{tag_name}}) {
-Line 2863 
 sub _tree_construction_main ($) {
+Line 3210 
 sub _tree_construction_main ($) {
              if ({
                   dd => 1, dt => 1, li => 1, p => 1,
                   td => 1, th => 1, tr => 1,
+                  tbody => 1, tfoot=> 1, thead => 1,
                  }->{$self->{open_elements}->[-1]->[1]}) {
                !!!back-token;
                $token = {type => 'end tag',
-Line 2893 
 sub _tree_construction_main ($) {
+Line 3241 
 sub _tree_construction_main ($) {
                  strong => 1, tt => 1, u => 1,
                 }->{$token->{tag_name}}) {
          $formatting_end_tag->($token->{tag_name});
- ## TODO: <http://html5.org/tools/web-apps-tracker?from=883&to=884>
+         return;
+       } elsif ($token->{tag_name} eq 'br') {
+         !!!parse-error (type => 'unmatched end tag:br');
+         ## As if <br>
+         $reconstruct_active_formatting_elements->($insert_to_current);
+         my $el;
+         !!!create-element ($el, 'br');
+         $insert->($el);
+         ## Ignore the token.
+         !!!next-token;
          return;
        } elsif ({
                  caption => 1, col => 1, colgroup => 1, frame => 1,
                  frameset => 1, head => 1, option => 1, optgroup => 1,
                  tbody => 1, td => 1, tfoot => 1, th => 1,
                  thead => 1, tr => 1,
-                 area => 1, basefont => 1, bgsound => 1, br => 1,
+                 area => 1, basefont => 1, bgsound => 1,
                  embed => 1, hr => 1, iframe => 1, image => 1,
                  img => 1, input => 1, isindex => 1, noembed => 1,
                  noframes => 1, param => 1, select => 1, spacer => 1,
-Line 2927 
 sub _tree_construction_main ($) {
+Line 3287 
 sub _tree_construction_main ($) {
              if ({
                   dd => 1, dt => 1, li => 1, p => 1,
                   td => 1, th => 1, tr => 1,
+                  tbody => 1, tfoot=> 1, thead => 1,
                  }->{$self->{open_elements}->[-1]->[1]}) {
                !!!back-token;
                $token = {type => 'end tag',
-Line 2950 
 sub _tree_construction_main ($) {
+Line 3311 
 sub _tree_construction_main ($) {
                  #not $phrasing_category->{$node->[1]} and
                  ($special_category->{$node->[1]} or
                   $scoping_category->{$node->[1]})) {
-               !!!parse-error (type => 'not closed:'.$node->[1]);
+               !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
                ## Ignore the token
                !!!next-token;
                last S2;
-Line 2970 
 sub _tree_construction_main ($) {
+Line 3331 
 sub _tree_construction_main ($) {
    }; # $in_body
    B: {
-     if ($phase eq 'main') {
+     if ($self->{insertion_mode} ne 'trailing end') {
        if ($token->{type} eq 'DOCTYPE') {
          !!!parse-error (type => 'in html:#DOCTYPE');
          ## Ignore the token
-Line 2979 
 sub _tree_construction_main ($) {
+Line 3340 
 sub _tree_construction_main ($) {
          redo B;
        } elsif ($token->{type} eq 'start tag' and
                 $token->{tag_name} eq 'html') {
-         ## TODO: unless it is the first start tag token, parse-error
+ ## ISSUE: "aa<html>" is not a parse error.
+ ## ISSUE: "<html>" in fragment is not a parse error.
+         unless ($token->{first_start_tag}) {
+           !!!parse-error (type => 'not first start tag');
+         }
          my $top_el = $self->{open_elements}->[0]->[0];
          for my $attr_name (keys %{$token->{attributes}}) {
            unless ($top_el->has_attribute_ns (undef, $attr_name)) {
-Line 2994 
 sub _tree_construction_main ($) {
+Line 3359 
 sub _tree_construction_main ($) {
          ## Generate implied end tags
          if ({
               dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
+              tbody => 1, tfoot=> 1, thead => 1,
              }->{$self->{open_elements}->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag', tag_name => $self->{open_elements}->[-1]->[1]};
-Line 3053 
 sub _tree_construction_main ($) {
+Line 3419 
 sub _tree_construction_main ($) {
              }
              redo B;
            } elsif ($token->{type} eq 'end tag') {
-             if ($token->{tag_name} eq 'html') {
+             if ({
+                  head => 1, body => 1, html => 1,
+                  p => 1, br => 1,
+                 }->{$token->{tag_name}}) {
                ## As if <head>
                !!!create-element ($self->{head_element}, 'head');
                $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
-Line 3063 
 sub _tree_construction_main ($) {
+Line 3432 
 sub _tree_construction_main ($) {
                redo B;
              } else {
                !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
-               ## Ignore the token
+               ## Ignore the token ## ISSUE: An issue in the spec.
                !!!next-token;
                redo B;
              }
            } else {
              die "$0: $token->{type}: Unknown type";
            }
-         } elsif ($self->{insertion_mode} eq 'in head') {
+         } elsif ($self->{insertion_mode} eq 'in head' or
+                  $self->{insertion_mode} eq 'in head noscript' or
+                  $self->{insertion_mode} eq 'after head') {
            if ($token->{type} eq 'character') {
              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
                $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
-Line 3087 
 sub _tree_construction_main ($) {
+Line 3458 
 sub _tree_construction_main ($) {
              !!!next-token;
              redo B;
            } elsif ($token->{type} eq 'start tag') {
-             if ($token->{tag_name} eq 'title') {
+             if ({base => ($self->{insertion_mode} eq 'in head' or
-               ## NOTE: There is an "as if in head" code clone
+                           $self->{insertion_mode} eq 'after head'),
-               my $title_el;
+                  link => 1}->{$token->{tag_name}}) {
-               !!!create-element ($title_el, 'title', $token->{attributes});
+               ## NOTE: There is an "as if in head" code clone.
-               (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
+               if ($self->{insertion_mode} eq 'after head') {
-                 ->append_child ($title_el);
+                 !!!parse-error (type => 'after head:'.$token->{tag_name});
-               $self->{content_model_flag} = 'RCDATA';
+                 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
+               }
-               my $text = '';
+               !!!insert-element ($token->{tag_name}, $token->{attributes});
+               pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
+               pop @{$self->{open_elements}}
+                   if $self->{insertion_mode} eq 'after head';
                !!!next-token;
-               while ($token->{type} eq 'character') {
+               redo B;
-                 $text .= $token->{data};
+             } elsif ($token->{tag_name} eq 'meta') {
-                 !!!next-token;
+               ## NOTE: There is an "as if in head" code clone.
+               if ($self->{insertion_mode} eq 'after head') {
+                 !!!parse-error (type => 'after head:'.$token->{tag_name});
+                 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
                }
-               if (length $text) {
+               !!!insert-element ($token->{tag_name}, $token->{attributes});
-                 $title_el->manakai_append_text ($text);
+               pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
+               unless ($self->{confident}) {
+                 my $charset;
+                 if ($token->{attributes}->{charset}) { ## TODO: And if supported
+                   $charset = $token->{attributes}->{charset}->{value};
+                 }
+                 if ($token->{attributes}->{'http-equiv'}) {
+                   ## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to its definition.
+                   if ($token->{attributes}->{'http-equiv'}->{value}
+                       =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
+                           [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
+                           ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
+                     $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
+                   } ## TODO: And if supported
+                 }
+                 ## TODO: Change the encoding
                }
-               $self->{content_model_flag} = 'PCDATA';
+               ## TODO: Extracting |charset| from |meta|.
+               pop @{$self->{open_elements}}
-               if ($token->{type} eq 'end tag' and
+                   if $self->{insertion_mode} eq 'after head';
-                   $token->{tag_name} eq 'title') {
+               !!!next-token;
+               redo B;
+             } elsif ($token->{tag_name} eq 'title' and
+                      $self->{insertion_mode} eq 'in head') {
+               ## NOTE: There is an "as if in head" code clone.
+               if ($self->{insertion_mode} eq 'after head') {
+                 !!!parse-error (type => 'after head:'.$token->{tag_name});
+                 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
+               }
+               my $parent = defined $self->{head_element} ? $self->{head_element}
+                   : $self->{open_elements}->[-1]->[0];
+               $parse_rcdata->('RCDATA', sub { $parent->append_child ($_[0]) });
+               pop @{$self->{open_elements}}
+                   if $self->{insertion_mode} eq 'after head';
+               redo B;
+             } elsif ($token->{tag_name} eq 'style') {
+               ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
+               ## insertion mode 'in head')
+               ## NOTE: There is an "as if in head" code clone.
+               if ($self->{insertion_mode} eq 'after head') {
+                 !!!parse-error (type => 'after head:'.$token->{tag_name});
+                 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
+               }
+               $parse_rcdata->('CDATA', $insert_to_current);
+               pop @{$self->{open_elements}}
+                   if $self->{insertion_mode} eq 'after head';
+               redo B;
+             } elsif ($token->{tag_name} eq 'noscript') {
+               if ($self->{insertion_mode} eq 'in head') {
+                 ## NOTE: and scripting is disabled
+                 !!!insert-element ($token->{tag_name}, $token->{attributes});
+                 $self->{insertion_mode} = 'in head noscript';
+                 !!!next-token;
+                 redo B;
+               } elsif ($self->{insertion_mode} eq 'in head noscript') {
+                 !!!parse-error (type => 'in noscript:noscript');
                  ## Ignore the token
+                 redo B;
                } else {
-                 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
+                 #
-                 ## ISSUE: And ignore?
                }
+             } elsif ($token->{tag_name} eq 'head' and
+                      $self->{insertion_mode} ne 'after head') {
+               !!!parse-error (type => 'in head:head'); # or in head noscript
+               ## Ignore the token
                !!!next-token;
                redo B;
-             } elsif ($token->{tag_name} eq 'style') {
+             } elsif ($self->{insertion_mode} ne 'in head noscript' and
-               $style_start_tag->();
+                      $token->{tag_name} eq 'script') {
-               redo B;
+               if ($self->{insertion_mode} eq 'after head') {
-             } elsif ($token->{tag_name} eq 'script') {
+                 !!!parse-error (type => 'after head:'.$token->{tag_name});
-               $script_start_tag->();
+                 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
+               }
+               ## NOTE: There is an "as if in head" code clone.
+               $script_start_tag->($insert_to_current);
+               pop @{$self->{open_elements}}
+                   if $self->{insertion_mode} eq 'after head';
                redo B;
-             } elsif ({base => 1, link => 1, meta => 1}->{$token->{tag_name}}) {
+             } elsif ($self->{insertion_mode} eq 'after head' and
-               ## NOTE: There are "as if in head" code clones
+                      $token->{tag_name} eq 'body') {
-               my $el;
+               !!!insert-element ('body', $token->{attributes});
-               !!!create-element ($el, $token->{tag_name}, $token->{attributes});
+               $self->{insertion_mode} = 'in body';
-               (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
-                 ->append_child ($el);
                !!!next-token;
                redo B;
-             } elsif ($token->{tag_name} eq 'head') {
+             } elsif ($self->{insertion_mode} eq 'after head' and
-               !!!parse-error (type => 'in head:head');
+                      $token->{tag_name} eq 'frameset') {
-               ## Ignore the token
+               !!!insert-element ('frameset', $token->{attributes});
+               $self->{insertion_mode} = 'in frameset';
                !!!next-token;
                redo B;
              } else {
                #
              }
            } elsif ($token->{type} eq 'end tag') {
-             if ($token->{tag_name} eq 'head') {
+             if ($self->{insertion_mode} eq 'in head' and
-               if ($self->{open_elements}->[-1]->[1] eq 'head') {
+                 $token->{tag_name} eq 'head') {
-                 pop @{$self->{open_elements}};
+               pop @{$self->{open_elements}};
-               } else {
-                 !!!parse-error (type => 'unmatched end tag:head');
-               }
                $self->{insertion_mode} = 'after head';
                !!!next-token;
                redo B;
-             } elsif ($token->{tag_name} eq 'html') {
+             } elsif ($self->{insertion_mode} eq 'in head noscript' and
+                 $token->{tag_name} eq 'noscript') {
+               pop @{$self->{open_elements}};
+               $self->{insertion_mode} = 'in head';
+               !!!next-token;
+               redo B;
+             } elsif ($self->{insertion_mode} eq 'in head' and
+                      {
+                       body => 1, html => 1,
+                       p => 1, br => 1,
+                      }->{$token->{tag_name}}) {
                #
-             } else {
+             } elsif ($self->{insertion_mode} eq 'in head noscript' and
+                      {
+                       p => 1, br => 1,
+                      }->{$token->{tag_name}}) {
+               #
+             } elsif ($self->{insertion_mode} ne 'after head') {
                !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
                ## Ignore the token
                !!!next-token;
                redo B;
+             } else {
+               #
              }
            } else {
              #
            }
-           if ($self->{open_elements}->[-1]->[1] eq 'head') {
+           ## As if </head> or </noscript> or <body>
-             ## As if </head>
+           if ($self->{insertion_mode} eq 'in head') {
+             pop @{$self->{open_elements}};
+             $self->{insertion_mode} = 'after head';
+           } elsif ($self->{insertion_mode} eq 'in head noscript') {
              pop @{$self->{open_elements}};
+             !!!parse-error (type => 'in noscript:'.(defined $token->{tag_name} ? ($token->{type} eq 'end tag' ? '/' : '') . $token->{tag_name} : '#' . $token->{type}));
+             $self->{insertion_mode} = 'in head';
+           } else { # 'after head'
+             !!!insert-element ('body');
+             $self->{insertion_mode} = 'in body';
            }
-           $self->{insertion_mode} = 'after head';
            ## reprocess
            redo B;
            ## ISSUE: An issue in the spec.
-         } elsif ($self->{insertion_mode} eq 'after head') {
-           if ($token->{type} eq 'character') {
-             if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
-               $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
-               unless (length $token->{data}) {
-                 !!!next-token;
-                 redo B;
-               }
-             }
-             #
-           } elsif ($token->{type} eq 'comment') {
-             my $comment = $self->{document}->create_comment ($token->{data});
-             $self->{open_elements}->[-1]->[0]->append_child ($comment);
-             !!!next-token;
-             redo B;
-           } elsif ($token->{type} eq 'start tag') {
-             if ($token->{tag_name} eq 'body') {
-               !!!insert-element ('body', $token->{attributes});
-               $self->{insertion_mode} = 'in body';
-               !!!next-token;
-               redo B;
-             } elsif ($token->{tag_name} eq 'frameset') {
-               !!!insert-element ('frameset', $token->{attributes});
-               $self->{insertion_mode} = 'in frameset';
-               !!!next-token;
-               redo B;
-             } elsif ({
-                       base => 1, link => 1, meta => 1,
-                       script => 1, style => 1, title => 1,
-                      }->{$token->{tag_name}}) {
-               !!!parse-error (type => 'after head:'.$token->{tag_name});
-               $self->{insertion_mode} = 'in head';
-               ## reprocess
-               redo B;
-             } else {
-               #
-             }
-           } else {
-             #
-           }
-           ## As if <body>
-           !!!insert-element ('body');
-           $self->{insertion_mode} = 'in body';
-           ## reprocess
-           redo B;
          } elsif ($self->{insertion_mode} eq 'in body') {
            if ($token->{type} eq 'character') {
              ## NOTE: There is a code clone of "character in body".
-Line 3371 
 sub _tree_construction_main ($) {
+Line 3780 
 sub _tree_construction_main ($) {
                if ({
                     dd => 1, dt => 1, li => 1, p => 1,
                     td => 1, th => 1, tr => 1,
+                    tbody => 1, tfoot=> 1, thead => 1,
                    }->{$self->{open_elements}->[-1]->[1]}) {
                  !!!back-token; # <table>
                  $token = {type => 'end tag', tag_name => 'table'};
-Line 3419 
 sub _tree_construction_main ($) {
+Line 3829 
 sub _tree_construction_main ($) {
                if ({
                     dd => 1, dt => 1, li => 1, p => 1,
                     td => 1, th => 1, tr => 1,
+                    tbody => 1, tfoot=> 1, thead => 1,
                    }->{$self->{open_elements}->[-1]->[1]}) {
                  !!!back-token;
                  $token = {type => 'end tag',
-Line 3502 
 sub _tree_construction_main ($) {
+Line 3913 
 sub _tree_construction_main ($) {
                if ({
                     dd => 1, dt => 1, li => 1, p => 1,
                     td => 1, th => 1, tr => 1,
+                    tbody => 1, tfoot=> 1, thead => 1,
                    }->{$self->{open_elements}->[-1]->[1]}) {
                  !!!back-token; # <?>
                  $token = {type => 'end tag', tag_name => 'caption'};
-Line 3552 
 sub _tree_construction_main ($) {
+Line 3964 
 sub _tree_construction_main ($) {
                if ({
                     dd => 1, dt => 1, li => 1, p => 1,
                     td => 1, th => 1, tr => 1,
+                    tbody => 1, tfoot=> 1, thead => 1,
                    }->{$self->{open_elements}->[-1]->[1]}) {
                  !!!back-token;
                  $token = {type => 'end tag',
-Line 3599 
 sub _tree_construction_main ($) {
+Line 4012 
 sub _tree_construction_main ($) {
                if ({
                     dd => 1, dt => 1, li => 1, p => 1,
                     td => 1, th => 1, tr => 1,
+                    tbody => 1, tfoot=> 1, thead => 1,
                    }->{$self->{open_elements}->[-1]->[1]}) {
                  !!!back-token; # </table>
                  $token = {type => 'end tag', tag_name => 'caption'};
-Line 3864 
 sub _tree_construction_main ($) {
+Line 4278 
 sub _tree_construction_main ($) {
                if ({
                     dd => 1, dt => 1, li => 1, p => 1,
                     td => 1, th => 1, tr => 1,
+                    tbody => 1, tfoot=> 1, thead => 1,
                    }->{$self->{open_elements}->[-1]->[1]}) {
                  !!!back-token; # <table>
                  $token = {type => 'end tag', tag_name => 'table'};
-Line 4132 
 sub _tree_construction_main ($) {
+Line 4547 
 sub _tree_construction_main ($) {
                if ({
                     dd => 1, dt => 1, li => 1, p => 1,
                     td => 1, th => 1, tr => 1,
+                    tbody => 1, tfoot=> 1, thead => 1,
                    }->{$self->{open_elements}->[-1]->[1]}) {
                  !!!back-token; # <table>
                  $token = {type => 'end tag', tag_name => 'table'};
-Line 4373 
 sub _tree_construction_main ($) {
+Line 4789 
 sub _tree_construction_main ($) {
                     td => ($token->{tag_name} eq 'th'),
                     th => ($token->{tag_name} eq 'td'),
                     tr => 1,
+                    tbody => 1, tfoot=> 1, thead => 1,
                    }->{$self->{open_elements}->[-1]->[1]}) {
                  !!!back-token;
                  $token = {type => 'end tag',
-Line 4624 
 sub _tree_construction_main ($) {
+Line 5041 
 sub _tree_construction_main ($) {
          } elsif ($self->{insertion_mode} eq 'after body') {
            if ($token->{type} eq 'character') {
              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
+               my $data = $1;
                ## As if in body
                $reconstruct_active_formatting_elements->($insert_to_current);
-               $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
+               $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
                unless (length $token->{data}) {
                  !!!next-token;
-Line 4653 
 sub _tree_construction_main ($) {
+Line 5071 
 sub _tree_construction_main ($) {
                  !!!next-token;
                  redo B;
                } else {
-                 $phase = 'trailing end';
+                 $previous_insertion_mode = $self->{insertion_mode};
+                 $self->{insertion_mode} = 'trailing end';
                  !!!next-token;
                  redo B;
                }
-Line 4670 
 sub _tree_construction_main ($) {
+Line 5089 
 sub _tree_construction_main ($) {
          } elsif ($self->{insertion_mode} eq 'in frameset') {
            if ($token->{type} eq 'character') {
              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
-               $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
+               $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
                unless (length $token->{data}) {
                  !!!next-token;
-Line 4725 
 sub _tree_construction_main ($) {
+Line 5144 
 sub _tree_construction_main ($) {
            }
            if (defined $token->{tag_name}) {
-             !!!parse-error (type => 'in frameset:'.$token->{tag_name});
+             !!!parse-error (type => 'in frameset:'.($token->{type} eq 'end tag' ? '/' : '').$token->{tag_name});
            } else {
              !!!parse-error (type => 'in frameset:#'.$token->{type});
            }
-Line 4735 
 sub _tree_construction_main ($) {
+Line 5154 
 sub _tree_construction_main ($) {
          } elsif ($self->{insertion_mode} eq 'after frameset') {
            if ($token->{type} eq 'character') {
              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
-               $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
+               $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
                unless (length $token->{data}) {
                  !!!next-token;
-Line 4743 
 sub _tree_construction_main ($) {
+Line 5162 
 sub _tree_construction_main ($) {
                }
              }
-             #
+             if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
+               !!!parse-error (type => 'after frameset:#character');
+               ## Ignore the token.
+               if (length $token->{data}) {
+                 ## reprocess the rest of characters
+               } else {
+                 !!!next-token;
+               }
+               redo B;
+             }
            } elsif ($token->{type} eq 'comment') {
              my $comment = $self->{document}->create_comment ($token->{data});
              $self->{open_elements}->[-1]->[0]->append_child ($comment);
-Line 4758 
 sub _tree_construction_main ($) {
+Line 5187 
 sub _tree_construction_main ($) {
              }
            } elsif ($token->{type} eq 'end tag') {
              if ($token->{tag_name} eq 'html') {
-               $phase = 'trailing end';
+               $previous_insertion_mode = $self->{insertion_mode};
+               $self->{insertion_mode} = 'trailing end';
                !!!next-token;
                redo B;
              } else {
                #
              }
            } else {
-             #
+             die "$0: $token->{type}: Unknown token type";
            }
-           if (defined $token->{tag_name}) {
+           !!!parse-error (type => 'after frameset:'.($token->{tag_name} eq 'end tag' ? '/' : '').$token->{tag_name});
-             !!!parse-error (type => 'after frameset:'.$token->{tag_name});
-           } else {
-             !!!parse-error (type => 'after frameset:#'.$token->{type});
-           }
            ## Ignore the token
            !!!next-token;
            redo B;
-Line 4782 
 sub _tree_construction_main ($) {
+Line 5208 
 sub _tree_construction_main ($) {
            die "$0: $self->{insertion_mode}: Unknown insertion mode";
          }
        }
-     } elsif ($phase eq 'trailing end') {
+     } elsif ($self->{insertion_mode} eq 'trailing end') {
        ## states in the main stage is preserved yet # MUST
        if ($token->{type} eq 'DOCTYPE') {
-Line 4802 
 sub _tree_construction_main ($) {
+Line 5228 
 sub _tree_construction_main ($) {
            ## NOTE: The insertion mode in the main phase
            ## just before the phase has been changed to the trailing
            ## end phase is either "after body" or "after frameset".
-           $reconstruct_active_formatting_elements->($insert_to_current)
+           $reconstruct_active_formatting_elements->($insert_to_current);
-             if $phase eq 'main';
            $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
-Line 4814 
 sub _tree_construction_main ($) {
+Line 5239 
 sub _tree_construction_main ($) {
          }
          !!!parse-error (type => 'after html:#character');
-         $phase = 'main';
+         $self->{insertion_mode} = $previous_insertion_mode;
          ## reprocess
          redo B;
        } elsif ($token->{type} eq 'start tag' or
                 $token->{type} eq 'end tag') {
-         !!!parse-error (type => 'after html:'.$token->{tag_name});
+         !!!parse-error (type => 'after html:'.($token->{type} eq 'end tag' ? '/' : '').$token->{tag_name});
-         $phase = 'main';
+         $self->{insertion_mode} = $previous_insertion_mode;
          ## reprocess
          redo B;
        } elsif ($token->{type} eq 'end-of-file') {
-Line 4865 
 sub set_inner_html ($$$) {
+Line 5290 
 sub set_inner_html ($$$) {
      ## NOTE: Most of this code is copied from |parse_string|
      ## Step 1 # MUST
-     my $doc = $node->owner_document->implementation->create_document;
+     my $this_doc = $node->owner_document;
-     ## TODO: Mark as HTML document
+     my $doc = $this_doc->implementation->create_document;
+     $doc->manakai_is_html (1);
      my $p = $class->new;
      $p->{document} = $doc;
-Line 4876 
 sub set_inner_html ($$$) {
+Line 5302 
 sub set_inner_html ($$$) {
      my $column = 0;
      $p->{set_next_input_character} = sub {
        my $self = shift;
+       pop @{$self->{prev_input_character}};
+       unshift @{$self->{prev_input_character}}, $self->{next_input_character};
        $self->{next_input_character} = -1 and return if $i >= length $$s;
        $self->{next_input_character} = ord substr $$s, $i++, 1;
        $column++;
-Line 4884 
 sub set_inner_html ($$$) {
+Line 5314 
 sub set_inner_html ($$$) {
          $line++;
          $column = 0;
        } elsif ($self->{next_input_character} == 0x000D) { # CR
-         if ($i >= length $$s) {
+         $i++ if substr ($$s, $i, 1) eq "\x0A";
-           #
-         } else {
-           my $next_char = ord substr $$s, $i++, 1;
-           if ($next_char == 0x000A) { # LF
-             #
-           } else {
-             push @{$self->{char}}, $next_char;
-           }
-         }
          $self->{next_input_character} = 0x000A; # LF # MUST
          $line++;
          $column = 0;
        } elsif ($self->{next_input_character} > 0x10FFFF) {
          $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        } elsif ($self->{next_input_character} == 0x0000) { # NULL
+         !!!parse-error (type => 'NULL');
          $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        }
      };
+     $p->{prev_input_character} = [-1, -1, -1];
+     $p->{next_input_character} = -1;
      my $ponerror = $onerror || sub {
        my (%opt) = @_;
-Line 4981 
 sub set_inner_html ($$$) {
+Line 5405 
 sub set_inner_html ($$$) {
      ## Step 12 # MUST
      @cn = @{$root->child_nodes};
      for (@cn) {
+       $this_doc->adopt_node ($_);
        $node->append_child ($_);
      }
-     ## ISSUE: adopt_node? mutation events?
+     ## ISSUE: mutation events?
      $p->_terminate_tree_constructor;
    } else {
-Line 5028 
 sub get_inner_html ($$$) {
+Line 5453 
 sub get_inner_html ($$$) {
      my $nt = $child->node_type;
      if ($nt == 1) { # Element
-       my $tag_name = lc $child->tag_name; ## ISSUE: Definition of "lowercase"
+       my $tag_name = $child->tag_name; ## TODO: manakai_tag_name
        $s .= '<' . $tag_name;
+       ## NOTE: Non-HTML case:
-       ## ISSUE: Non-html elements
+       ## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191>
        my @attrs = @{$child->attributes}; # sort order MUST be stable
        for my $attr (@attrs) { # order is implementation dependent
-         my $attr_name = lc $attr->name; ## ISSUE: Definition of "lowercase"
+         my $attr_name = $attr->name; ## TODO: manakai_name
          $s .= ' ' . $attr_name . '="';
          my $attr_value = $attr->value;
          ## escape
-Line 5054 
 sub get_inner_html ($$$) {
+Line 5479 
 sub get_inner_html ($$$) {
          spacer => 1, wbr => 1,
        }->{$tag_name};
+       $s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea';
        if (not $in_cdata and {
          style => 1, script => 1, xmp => 1, iframe => 1,
          noembed => 1, noframes => 1, noscript => 1,
+         plaintext => 1,
        }->{$tag_name}) {
          unshift @node, 'cdata-out';
          $in_cdata = 1;

 Legend:
-Removed from v.1.11
 changed lines
+Added in v.1.35

admin@suikawiki.org	ViewVC Help
Powered by ViewVC 1.1.24