/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch Patch

-revision 1.15 by wakaba,
Sat Jun 23 06:48:24 2007 UTC
+revision 1.42 by wakaba,
Sat Jul 21 06:59:16 2007 UTC
 Line 2 
 package Whatpm::HTML;
  use strict;
  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
- ## This is an early version of an HTML parser.
+ ## ISSUE:
+ ## var doc = implementation.createDocument (null, null, null);
+ ## doc.write ('');
+ ## alert (doc.compatMode);
+ ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT
+ ## strip the BOM and that the HTML layer MUST ignore it.  Whether we
+ ## can do that is not yet clear.
+ ## Is "{U+FEFF}..." in UTF-16BE/UTF-16LE three or four characters?
+ ## What about "{U+FEFF}..." in GB18030?
  my $permitted_slash_tag_name = {
    base => 1,
-Line 18 
 my $permitted_slash_tag_name = {
+Line 27 
 my $permitted_slash_tag_name = {
    input => 1,
  };
- my $entity_char = {
-   AElig => "\x{00C6}",
-   Aacute => "\x{00C1}",
-   Acirc => "\x{00C2}",
-   Agrave => "\x{00C0}",
-   Alpha => "\x{0391}",
-   Aring => "\x{00C5}",
-   Atilde => "\x{00C3}",
-   Auml => "\x{00C4}",
-   Beta => "\x{0392}",
-   Ccedil => "\x{00C7}",
-   Chi => "\x{03A7}",
-   Dagger => "\x{2021}",
-   Delta => "\x{0394}",
-   ETH => "\x{00D0}",
-   Eacute => "\x{00C9}",
-   Ecirc => "\x{00CA}",
-   Egrave => "\x{00C8}",
-   Epsilon => "\x{0395}",
-   Eta => "\x{0397}",
-   Euml => "\x{00CB}",
-   Gamma => "\x{0393}",
-   Iacute => "\x{00CD}",
-   Icirc => "\x{00CE}",
-   Igrave => "\x{00CC}",
-   Iota => "\x{0399}",
-   Iuml => "\x{00CF}",
-   Kappa => "\x{039A}",
-   Lambda => "\x{039B}",
-   Mu => "\x{039C}",
-   Ntilde => "\x{00D1}",
-   Nu => "\x{039D}",
-   OElig => "\x{0152}",
-   Oacute => "\x{00D3}",
-   Ocirc => "\x{00D4}",
-   Ograve => "\x{00D2}",
-   Omega => "\x{03A9}",
-   Omicron => "\x{039F}",
-   Oslash => "\x{00D8}",
-   Otilde => "\x{00D5}",
-   Ouml => "\x{00D6}",
-   Phi => "\x{03A6}",
-   Pi => "\x{03A0}",
-   Prime => "\x{2033}",
-   Psi => "\x{03A8}",
-   Rho => "\x{03A1}",
-   Scaron => "\x{0160}",
-   Sigma => "\x{03A3}",
-   THORN => "\x{00DE}",
-   Tau => "\x{03A4}",
-   Theta => "\x{0398}",
-   Uacute => "\x{00DA}",
-   Ucirc => "\x{00DB}",
-   Ugrave => "\x{00D9}",
-   Upsilon => "\x{03A5}",
-   Uuml => "\x{00DC}",
-   Xi => "\x{039E}",
-   Yacute => "\x{00DD}",
-   Yuml => "\x{0178}",
-   Zeta => "\x{0396}",
-   aacute => "\x{00E1}",
-   acirc => "\x{00E2}",
-   acute => "\x{00B4}",
-   aelig => "\x{00E6}",
-   agrave => "\x{00E0}",
-   alefsym => "\x{2135}",
-   alpha => "\x{03B1}",
-   amp => "\x{0026}",
-   AMP => "\x{0026}",
-   and => "\x{2227}",
-   ang => "\x{2220}",
-   apos => "\x{0027}",
-   aring => "\x{00E5}",
-   asymp => "\x{2248}",
-   atilde => "\x{00E3}",
-   auml => "\x{00E4}",
-   bdquo => "\x{201E}",
-   beta => "\x{03B2}",
-   brvbar => "\x{00A6}",
-   bull => "\x{2022}",
-   cap => "\x{2229}",
-   ccedil => "\x{00E7}",
-   cedil => "\x{00B8}",
-   cent => "\x{00A2}",
-   chi => "\x{03C7}",
-   circ => "\x{02C6}",
-   clubs => "\x{2663}",
-   cong => "\x{2245}",
-   copy => "\x{00A9}",
-   COPY => "\x{00A9}",
-   crarr => "\x{21B5}",
-   cup => "\x{222A}",
-   curren => "\x{00A4}",
-   dArr => "\x{21D3}",
-   dagger => "\x{2020}",
-   darr => "\x{2193}",
-   deg => "\x{00B0}",
-   delta => "\x{03B4}",
-   diams => "\x{2666}",
-   divide => "\x{00F7}",
-   eacute => "\x{00E9}",
-   ecirc => "\x{00EA}",
-   egrave => "\x{00E8}",
-   empty => "\x{2205}",
-   emsp => "\x{2003}",
-   ensp => "\x{2002}",
-   epsilon => "\x{03B5}",
-   equiv => "\x{2261}",
-   eta => "\x{03B7}",
-   eth => "\x{00F0}",
-   euml => "\x{00EB}",
-   euro => "\x{20AC}",
-   exist => "\x{2203}",
-   fnof => "\x{0192}",
-   forall => "\x{2200}",
-   frac12 => "\x{00BD}",
-   frac14 => "\x{00BC}",
-   frac34 => "\x{00BE}",
-   frasl => "\x{2044}",
-   gamma => "\x{03B3}",
-   ge => "\x{2265}",
-   gt => "\x{003E}",
-   GT => "\x{003E}",
-   hArr => "\x{21D4}",
-   harr => "\x{2194}",
-   hearts => "\x{2665}",
-   hellip => "\x{2026}",
-   iacute => "\x{00ED}",
-   icirc => "\x{00EE}",
-   iexcl => "\x{00A1}",
-   igrave => "\x{00EC}",
-   image => "\x{2111}",
-   infin => "\x{221E}",
-   int => "\x{222B}",
-   iota => "\x{03B9}",
-   iquest => "\x{00BF}",
-   isin => "\x{2208}",
-   iuml => "\x{00EF}",
-   kappa => "\x{03BA}",
-   lArr => "\x{21D0}",
-   lambda => "\x{03BB}",
-   lang => "\x{2329}",
-   laquo => "\x{00AB}",
-   larr => "\x{2190}",
-   lceil => "\x{2308}",
-   ldquo => "\x{201C}",
-   le => "\x{2264}",
-   lfloor => "\x{230A}",
-   lowast => "\x{2217}",
-   loz => "\x{25CA}",
-   lrm => "\x{200E}",
-   lsaquo => "\x{2039}",
-   lsquo => "\x{2018}",
-   lt => "\x{003C}",
-   LT => "\x{003C}",
-   macr => "\x{00AF}",
-   mdash => "\x{2014}",
-   micro => "\x{00B5}",
-   middot => "\x{00B7}",
-   minus => "\x{2212}",
-   mu => "\x{03BC}",
-   nabla => "\x{2207}",
-   nbsp => "\x{00A0}",
-   ndash => "\x{2013}",
-   ne => "\x{2260}",
-   ni => "\x{220B}",
-   not => "\x{00AC}",
-   notin => "\x{2209}",
-   nsub => "\x{2284}",
-   ntilde => "\x{00F1}",
-   nu => "\x{03BD}",
-   oacute => "\x{00F3}",
-   ocirc => "\x{00F4}",
-   oelig => "\x{0153}",
-   ograve => "\x{00F2}",
-   oline => "\x{203E}",
-   omega => "\x{03C9}",
-   omicron => "\x{03BF}",
-   oplus => "\x{2295}",
-   or => "\x{2228}",
-   ordf => "\x{00AA}",
-   ordm => "\x{00BA}",
-   oslash => "\x{00F8}",
-   otilde => "\x{00F5}",
-   otimes => "\x{2297}",
-   ouml => "\x{00F6}",
-   para => "\x{00B6}",
-   part => "\x{2202}",
-   permil => "\x{2030}",
-   perp => "\x{22A5}",
-   phi => "\x{03C6}",
-   pi => "\x{03C0}",
-   piv => "\x{03D6}",
-   plusmn => "\x{00B1}",
-   pound => "\x{00A3}",
-   prime => "\x{2032}",
-   prod => "\x{220F}",
-   prop => "\x{221D}",
-   psi => "\x{03C8}",
-   quot => "\x{0022}",
-   QUOT => "\x{0022}",
-   rArr => "\x{21D2}",
-   radic => "\x{221A}",
-   rang => "\x{232A}",
-   raquo => "\x{00BB}",
-   rarr => "\x{2192}",
-   rceil => "\x{2309}",
-   rdquo => "\x{201D}",
-   real => "\x{211C}",
-   reg => "\x{00AE}",
-   REG => "\x{00AE}",
-   rfloor => "\x{230B}",
-   rho => "\x{03C1}",
-   rlm => "\x{200F}",
-   rsaquo => "\x{203A}",
-   rsquo => "\x{2019}",
-   sbquo => "\x{201A}",
-   scaron => "\x{0161}",
-   sdot => "\x{22C5}",
-   sect => "\x{00A7}",
-   shy => "\x{00AD}",
-   sigma => "\x{03C3}",
-   sigmaf => "\x{03C2}",
-   sim => "\x{223C}",
-   spades => "\x{2660}",
-   sub => "\x{2282}",
-   sube => "\x{2286}",
-   sum => "\x{2211}",
-   sup => "\x{2283}",
-   sup1 => "\x{00B9}",
-   sup2 => "\x{00B2}",
-   sup3 => "\x{00B3}",
-   supe => "\x{2287}",
-   szlig => "\x{00DF}",
-   tau => "\x{03C4}",
-   there4 => "\x{2234}",
-   theta => "\x{03B8}",
-   thetasym => "\x{03D1}",
-   thinsp => "\x{2009}",
-   thorn => "\x{00FE}",
-   tilde => "\x{02DC}",
-   times => "\x{00D7}",
-   trade => "\x{2122}",
-   uArr => "\x{21D1}",
-   uacute => "\x{00FA}",
-   uarr => "\x{2191}",
-   ucirc => "\x{00FB}",
-   ugrave => "\x{00F9}",
-   uml => "\x{00A8}",
-   upsih => "\x{03D2}",
-   upsilon => "\x{03C5}",
-   uuml => "\x{00FC}",
-   weierp => "\x{2118}",
-   xi => "\x{03BE}",
-   yacute => "\x{00FD}",
-   yen => "\x{00A5}",
-   yuml => "\x{00FF}",
-   zeta => "\x{03B6}",
-   zwj => "\x{200D}",
-   zwnj => "\x{200C}",
- }; # $entity_char
  my $c1_entity_char = {
 x80 => 0x20AC,
 x81 => 0xFFFD,
-Line 403 
 sub new ($) {
+Line 150 
 sub new ($) {
    return $self;
  } # new
+ sub CM_ENTITY () { 0b001 } # bit flag: "&" (entity) markup is recognized in data
+ sub CM_LIMITED_MARKUP () { 0b010 } # bit flag: "<" markup is recognized in data (limited)
+ sub CM_FULL_MARKUP () { 0b100 } # bit flag: "<" markup is recognized in data (any)
+ sub PLAINTEXT_CONTENT_MODEL () { 0 } # no flag set
+ sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } # limited "<" markup only
+ sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } # entities and limited "<" markup
+ sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } # entities and any "<" markup
  ## Implementations MUST act as if state machine in the spec
  sub _initialize_tokenizer ($) {
    my $self = shift;
    $self->{state} = 'data'; # MUST
-   $self->{content_model_flag} = 'PCDATA'; # be
+   $self->{content_model} = PCDATA_CONTENT_MODEL; # be
    undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE
    undef $self->{current_attribute};
    undef $self->{last_emitted_start_tag_name};
-Line 417 
 sub _initialize_tokenizer ($) {
+Line 173 
 sub _initialize_tokenizer ($) {
    # $self->{next_input_character}
    !!!next-input-character;
    $self->{token} = [];
+   # $self->{escape}
  } # _initialize_tokenizer
  ## A token has:
  ##   ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
  ##       'character', or 'end-of-file'
- ##   ->{name} (DOCTYPE, start tag (tagname), end tag (tagname))
+ ##   ->{name} (DOCTYPE, start tag (tag name), end tag (tag name))
-     ## ISSUE: the spec need s/tagname/tag name/
+ ##   ->{public_identifier} (DOCTYPE)
- ##   ->{error} == 1 or 0 (DOCTYPE)
+ ##   ->{system_identifier} (DOCTYPE)
+ ##   ->{correct} == 1 or 0 (DOCTYPE)
  ##   ->{attributes} isa HASH (start tag, end tag)
  ##   ->{data} (comment, character)
- ## Macros
- ##   Macros MUST be preceded by three EXCLAMATION MARKs.
- ##   emit ($token)
- ##     Emits the specified token.
  ## An emitted token MUST immediately be handled by the tree construction stage.
  ## Before each step, UA MAY check to see if either one of the scripts in
-Line 450 
 sub _get_next_token ($) {
+Line 203 
 sub _get_next_token ($) {
    A: {
      if ($self->{state} eq 'data') {
        if ($self->{next_input_character} == 0x0026) { # &
-         if ($self->{content_model_flag} eq 'PCDATA' or
+         if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA
-             $self->{content_model_flag} eq 'RCDATA') {
            $self->{state} = 'entity data';
            !!!next-input-character;
            redo A;
-Line 459 
 sub _get_next_token ($) {
+Line 211 
 sub _get_next_token ($) {
            #
          }
        } elsif ($self->{next_input_character} == 0x002D) { # -
-         if ($self->{content_model_flag} eq 'RCDATA' or
+         if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
-             $self->{content_model_flag} eq 'CDATA') {
            unless ($self->{escape}) {
              if ($self->{prev_input_character}->[0] == 0x002D and # -
                  $self->{prev_input_character}->[1] == 0x0021 and # !
-Line 472 
 sub _get_next_token ($) {
+Line 223 
 sub _get_next_token ($) {
          #
        } elsif ($self->{next_input_character} == 0x003C) { # <
-         if ($self->{content_model_flag} eq 'PCDATA' or
+         if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
-             (($self->{content_model_flag} eq 'CDATA' or
+             (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
-               $self->{content_model_flag} eq 'RCDATA') and
               not $self->{escape})) {
            $self->{state} = 'tag open';
            !!!next-input-character;
-Line 484 
 sub _get_next_token ($) {
+Line 234 
 sub _get_next_token ($) {
          }
        } elsif ($self->{next_input_character} == 0x003E) { # >
          if ($self->{escape} and
-             ($self->{content_model_flag} eq 'RCDATA' or
+             ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
-              $self->{content_model_flag} eq 'CDATA')) {
            if ($self->{prev_input_character}->[0] == 0x002D and # -
                $self->{prev_input_character}->[1] == 0x002D) { # -
              delete $self->{escape};
-Line 509 
 sub _get_next_token ($) {
+Line 258 
 sub _get_next_token ($) {
      } elsif ($self->{state} eq 'entity data') {
        ## (cannot happen in CDATA state)
-       my $token = $self->_tokenize_attempt_to_consume_an_entity;
+       my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
        $self->{state} = 'data';
        # next-input-character is already done
-Line 522 
 sub _get_next_token ($) {
+Line 271 
 sub _get_next_token ($) {
        redo A;
      } elsif ($self->{state} eq 'tag open') {
-       if ($self->{content_model_flag} eq 'RCDATA' or
+       if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
-           $self->{content_model_flag} eq 'CDATA') {
          if ($self->{next_input_character} == 0x002F) { # /
            !!!next-input-character;
            $self->{state} = 'close tag open';
-Line 536 
 sub _get_next_token ($) {
+Line 284 
 sub _get_next_token ($) {
            redo A;
          }
-       } elsif ($self->{content_model_flag} eq 'PCDATA') {
+       } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
          if ($self->{next_input_character} == 0x0021) { # !
            $self->{state} = 'markup declaration open';
            !!!next-input-character;
-Line 583 
 sub _get_next_token ($) {
+Line 331 
 sub _get_next_token ($) {
            redo A;
          }
        } else {
-         die "$0: $self->{content_model_flag}: Unknown content model flag";
+         die "$0: $self->{content_model} in tag open";
        }
      } elsif ($self->{state} eq 'close tag open') {
-       if ($self->{content_model_flag} eq 'RCDATA' or
+       if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
-           $self->{content_model_flag} eq 'CDATA') {
+         if (defined $self->{last_emitted_start_tag_name}) {
-         my @next_char;
+           ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
-         TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
+           my @next_char;
+           TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
+             push @next_char, $self->{next_input_character};
+             my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
+             my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
+             if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
+               !!!next-input-character;
+               next TAGNAME;
+             } else {
+               $self->{next_input_character} = shift @next_char; # reconsume
+               !!!back-next-input-character (@next_char);
+               $self->{state} = 'data';
+               !!!emit ({type => 'character', data => '</'});
+               redo A;
+             }
+           }
            push @next_char, $self->{next_input_character};
-           my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
-           my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
+           unless ($self->{next_input_character} == 0x0009 or # HT
-           if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
+                   $self->{next_input_character} == 0x000A or # LF
-             !!!next-input-character;
+                   $self->{next_input_character} == 0x000B or # VT
-             next TAGNAME;
+                   $self->{next_input_character} == 0x000C or # FF
-           } else {
+                   $self->{next_input_character} == 0x0020 or # SP
-             !!!parse-error (type => 'unmatched end tag');
+                   $self->{next_input_character} == 0x003E or # >
+                   $self->{next_input_character} == 0x002F or # /
+                   $self->{next_input_character} == -1) {
              $self->{next_input_character} = shift @next_char; # reconsume
              !!!back-next-input-character (@next_char);
              $self->{state} = 'data';
              !!!emit ({type => 'character', data => '</'});
              redo A;
+           } else {
+             $self->{next_input_character} = shift @next_char;
+             !!!back-next-input-character (@next_char);
+             # and consume...
            }
-         }
+         } else {
-         push @next_char, $self->{next_input_character};
+           ## No start tag token has ever been emitted
+           # next-input-character is already done
-         unless ($self->{next_input_character} == 0x0009 or # HT
-                 $self->{next_input_character} == 0x000A or # LF
-                 $self->{next_input_character} == 0x000B or # VT
-                 $self->{next_input_character} == 0x000C or # FF
-                 $self->{next_input_character} == 0x0020 or # SP
-                 $self->{next_input_character} == 0x003E or # >
-                 $self->{next_input_character} == 0x002F or # /
-                 $self->{next_input_character} == 0x003C or # <
-                 $self->{next_input_character} == -1) {
-           !!!parse-error (type => 'unmatched end tag');
-           $self->{next_input_character} = shift @next_char; # reconsume
-           !!!back-next-input-character (@next_char);
            $self->{state} = 'data';
            !!!emit ({type => 'character', data => '</'});
            redo A;
-         } else {
-           $self->{next_input_character} = shift @next_char;
-           !!!back-next-input-character (@next_char);
-           # and consume...
          }
        }
-Line 677 
 sub _get_next_token ($) {
+Line 428 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
-           $self->{content_model_flag} = 'PCDATA'; # MUST
+           $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
              !!!parse-error (type => 'end tag attribute');
            }
-Line 690 
 sub _get_next_token ($) {
+Line 443 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } elsif (0x0041 <= $self->{next_input_character} and
-Line 700 
 sub _get_next_token ($) {
+Line 452 
 sub _get_next_token ($) {
          ## Stay in this state
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
-           $self->{content_model_flag} = 'PCDATA'; # MUST
+           $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
              !!!parse-error (type => 'end tag attribute');
            }
-Line 717 
 sub _get_next_token ($) {
+Line 470 
 sub _get_next_token ($) {
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } elsif ($self->{next_input_character} == 0x002F) { # /
-Line 751 
 sub _get_next_token ($) {
+Line 503 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
-           $self->{content_model_flag} = 'PCDATA'; # MUST
+           $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
              !!!parse-error (type => 'end tag attribute');
            }
-Line 764 
 sub _get_next_token ($) {
+Line 518 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } elsif (0x0041 <= $self->{next_input_character} and
-Line 787 
 sub _get_next_token ($) {
+Line 540 
 sub _get_next_token ($) {
          ## Stay in the state
          # next-input-character is already done
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
-           $self->{content_model_flag} = 'PCDATA'; # MUST
+           $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
              !!!parse-error (type => 'end tag attribute');
            }
-Line 804 
 sub _get_next_token ($) {
+Line 558 
 sub _get_next_token ($) {
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 818 
 sub _get_next_token ($) {
+Line 571 
 sub _get_next_token ($) {
        my $before_leave = sub {
          if (exists $self->{current_token}->{attributes} # start tag or end tag
              ->{$self->{current_attribute}->{name}}) { # MUST
-           !!!parse-error (type => 'dupulicate attribute');
+           !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
            ## Discard $self->{current_attribute} # MUST
          } else {
            $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
-Line 843 
 sub _get_next_token ($) {
+Line 596 
 sub _get_next_token ($) {
        } elsif ($self->{next_input_character} == 0x003E) { # >
          $before_leave->();
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
-           $self->{content_model_flag} = 'PCDATA'; # MUST
+           $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
              !!!parse-error (type => 'end tag attribute');
            }
-Line 856 
 sub _get_next_token ($) {
+Line 611 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } elsif (0x0041 <= $self->{next_input_character} and
-Line 879 
 sub _get_next_token ($) {
+Line 633 
 sub _get_next_token ($) {
          $self->{state} = 'before attribute name';
          # next-input-character is already done
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          $before_leave->();
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
-           $self->{content_model_flag} = 'PCDATA'; # MUST
+           $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
              !!!parse-error (type => 'end tag attribute');
            }
-Line 897 
 sub _get_next_token ($) {
+Line 652 
 sub _get_next_token ($) {
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 921 
 sub _get_next_token ($) {
+Line 675 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
-           $self->{content_model_flag} = 'PCDATA'; # MUST
+           $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
              !!!parse-error (type => 'end tag attribute');
            }
-Line 934 
 sub _get_next_token ($) {
+Line 690 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } elsif (0x0041 <= $self->{next_input_character} and
-Line 953 
 sub _get_next_token ($) {
+Line 708 
 sub _get_next_token ($) {
            #
          } else {
            !!!parse-error (type => 'nestc');
+           ## TODO: Use a different error type for <aa / bb> than for <aa/>
          }
          $self->{state} = 'before attribute name';
          # next-input-character is already done
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
-           $self->{content_model_flag} = 'PCDATA'; # MUST
+           $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
              !!!parse-error (type => 'end tag attribute');
            }
-Line 974 
 sub _get_next_token ($) {
+Line 731 
 sub _get_next_token ($) {
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1007 
 sub _get_next_token ($) {
+Line 763 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
-           $self->{content_model_flag} = 'PCDATA'; # MUST
+           $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
              !!!parse-error (type => 'end tag attribute');
            }
-Line 1020 
 sub _get_next_token ($) {
+Line 778 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
-           $self->{content_model_flag} = 'PCDATA'; # MUST
+           $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
              !!!parse-error (type => 'end tag attribute');
            }
-Line 1040 
 sub _get_next_token ($) {
+Line 798 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1062 
 sub _get_next_token ($) {
+Line 819 
 sub _get_next_token ($) {
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed attribute value');
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
-           $self->{content_model_flag} = 'PCDATA'; # MUST
+           $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
              !!!parse-error (type => 'end tag attribute');
            }
-Line 1075 
 sub _get_next_token ($) {
+Line 834 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1097 
 sub _get_next_token ($) {
+Line 855 
 sub _get_next_token ($) {
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed attribute value');
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
-           $self->{content_model_flag} = 'PCDATA'; # MUST
+           $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
              !!!parse-error (type => 'end tag attribute');
            }
-Line 1110 
 sub _get_next_token ($) {
+Line 870 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1135 
 sub _get_next_token ($) {
+Line 894 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
-           $self->{content_model_flag} = 'PCDATA'; # MUST
+           $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
              !!!parse-error (type => 'end tag attribute');
            }
-Line 1148 
 sub _get_next_token ($) {
+Line 909 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} eq 'end tag') {
-           $self->{content_model_flag} = 'PCDATA'; # MUST
+           $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
              !!!parse-error (type => 'end tag attribute');
            }
-Line 1168 
 sub _get_next_token ($) {
+Line 929 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1178 
 sub _get_next_token ($) {
+Line 938 
 sub _get_next_token ($) {
          redo A;
        }
      } elsif ($self->{state} eq 'entity in attribute value') {
-       my $token = $self->_tokenize_attempt_to_consume_an_entity;
+       my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
        unless (defined $token) {
          $self->{current_attribute}->{value} .= '&';
-Line 1227 
 sub _get_next_token ($) {
+Line 987 
 sub _get_next_token ($) {
          push @next_char, $self->{next_input_character};
          if ($self->{next_input_character} == 0x002D) { # -
            $self->{current_token} = {type => 'comment', data => ''};
-           $self->{state} = 'comment';
+           $self->{state} = 'comment start';
            !!!next-input-character;
            redo A;
          }
-Line 1269 
 sub _get_next_token ($) {
+Line 1029 
 sub _get_next_token ($) {
          }
        }
-       !!!parse-error (type => 'bogus comment open');
+       !!!parse-error (type => 'bogus comment');
        $self->{next_input_character} = shift @next_char;
        !!!back-next-input-character (@next_char);
        $self->{state} = 'bogus comment';
-Line 1277 
 sub _get_next_token ($) {
+Line 1037 
 sub _get_next_token ($) {
        ## ISSUE: typos in spec: chacacters, is is a parse error
        ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
+     } elsif ($self->{state} eq 'comment start') {
+       if ($self->{next_input_character} == 0x002D) { # -
+         $self->{state} = 'comment start dash';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         !!!parse-error (type => 'bogus comment');
+         $self->{state} = 'data';
+         !!!next-input-character;
+         !!!emit ($self->{current_token}); # comment
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed comment');
+         $self->{state} = 'data';
+         ## reconsume
+         !!!emit ($self->{current_token}); # comment
+         redo A;
+       } else {
+         $self->{current_token}->{data} # comment
+             .= chr ($self->{next_input_character});
+         $self->{state} = 'comment';
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'comment start dash') {
+       if ($self->{next_input_character} == 0x002D) { # -
+         $self->{state} = 'comment end';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         !!!parse-error (type => 'bogus comment');
+         $self->{state} = 'data';
+         !!!next-input-character;
+         !!!emit ($self->{current_token}); # comment
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed comment');
+         $self->{state} = 'data';
+         ## reconsume
+         !!!emit ($self->{current_token}); # comment
+         redo A;
+       } else {
+         $self->{current_token}->{data} # comment
+             .= '-' . chr ($self->{next_input_character});
+         $self->{state} = 'comment';
+         !!!next-input-character;
+         redo A;
+       }
      } elsif ($self->{state} eq 'comment') {
        if ($self->{next_input_character} == 0x002D) { # -
-         $self->{state} = 'comment dash';
+         $self->{state} = 'comment end dash';
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 1288 
 sub _get_next_token ($) {
+Line 1104 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # comment
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1297 
 sub _get_next_token ($) {
+Line 1112 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'comment dash') {
+     } elsif ($self->{state} eq 'comment end dash') {
        if ($self->{next_input_character} == 0x002D) { # -
          $self->{state} = 'comment end';
          !!!next-input-character;
-Line 1308 
 sub _get_next_token ($) {
+Line 1123 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # comment
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1323 
 sub _get_next_token ($) {
+Line 1137 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # comment
-         undef $self->{current_token};
          redo A;
        } elsif ($self->{next_input_character} == 0x002D) { # -
-Line 1338 
 sub _get_next_token ($) {
+Line 1151 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # comment
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1372 
 sub _get_next_token ($) {
+Line 1184 
 sub _get_next_token ($) {
          ## Stay in the state
          !!!next-input-character;
          redo A;
-       } elsif (0x0061 <= $self->{next_input_character} and
-                $self->{next_input_character} <= 0x007A) { # a..z
- ## ISSUE: "Set the token's name name to the" in the spec
-         $self->{current_token} = {type => 'DOCTYPE',
-                           name => chr ($self->{next_input_character} - 0x0020),
-                           error => 1};
-         $self->{state} = 'DOCTYPE name';
-         !!!next-input-character;
-         redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          !!!parse-error (type => 'no DOCTYPE name');
          $self->{state} = 'data';
          !!!next-input-character;
-         !!!emit ({type => 'DOCTYPE', name => '', error => 1});
+         !!!emit ({type => 'DOCTYPE'}); # incorrect
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 1394 
 sub _get_next_token ($) {
+Line 1197 
 sub _get_next_token ($) {
          $self->{state} = 'data';
          ## reconsume
-         !!!emit ({type => 'DOCTYPE', name => '', error => 1});
+         !!!emit ({type => 'DOCTYPE'}); # incorrect
          redo A;
        } else {
-         $self->{current_token} = {type => 'DOCTYPE',
+         $self->{current_token}
-                           name => chr ($self->{next_input_character}),
+             = {type => 'DOCTYPE',
-                           error => 1};
+                name => chr ($self->{next_input_character}),
+                correct => 1};
  ## ISSUE: "Set the token's name name to the" in the spec
          $self->{state} = 'DOCTYPE name';
          !!!next-input-character;
          redo A;
        }
      } elsif ($self->{state} eq 'DOCTYPE name') {
+ ## ISSUE: Redundant "First," in the spec.
        if ($self->{next_input_character} == 0x0009 or # HT
            $self->{next_input_character} == 0x000A or # LF
            $self->{next_input_character} == 0x000B or # VT
            $self->{next_input_character} == 0x000C or # FF
            $self->{next_input_character} == 0x0020) { # SP
-         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
          $self->{state} = 'after DOCTYPE name';
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
-         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
          $self->{state} = 'data';
          !!!next-input-character;
          !!!emit ($self->{current_token}); # DOCTYPE
-         undef $self->{current_token};
          redo A;
-       } elsif (0x0061 <= $self->{next_input_character} and
-                $self->{next_input_character} <= 0x007A) { # a..z
-         $self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE
-         #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
-         ## Stay in the state
-         !!!next-input-character;
-         redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
          $self->{state} = 'data';
          ## reconsume
-         !!!emit ($self->{current_token});
+         delete $self->{current_token}->{correct};
-         undef $self->{current_token};
+         !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
          $self->{current_token}->{name}
            .= chr ($self->{next_input_character}); # DOCTYPE
-         #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
          ## Stay in the state
          !!!next-input-character;
          redo A;
-Line 1464 
 sub _get_next_token ($) {
+Line 1257 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # DOCTYPE
-         undef $self->{current_token};
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 1472 
 sub _get_next_token ($) {
+Line 1264 
 sub _get_next_token ($) {
          $self->{state} = 'data';
          ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0050 or # P
+                $self->{next_input_character} == 0x0070) { # p
+         !!!next-input-character;
+         if ($self->{next_input_character} == 0x0055 or # U
+             $self->{next_input_character} == 0x0075) { # u
+           !!!next-input-character;
+           if ($self->{next_input_character} == 0x0042 or # B
+               $self->{next_input_character} == 0x0062) { # b
+             !!!next-input-character;
+             if ($self->{next_input_character} == 0x004C or # L
+                 $self->{next_input_character} == 0x006C) { # l
+               !!!next-input-character;
+               if ($self->{next_input_character} == 0x0049 or # I
+                   $self->{next_input_character} == 0x0069) { # i
+                 !!!next-input-character;
+                 if ($self->{next_input_character} == 0x0043 or # C
+                     $self->{next_input_character} == 0x0063) { # c
+                   $self->{state} = 'before DOCTYPE public identifier';
+                   !!!next-input-character;
+                   redo A;
+                 }
+               }
+             }
+           }
+         }
+         #
+       } elsif ($self->{next_input_character} == 0x0053 or # S
+                $self->{next_input_character} == 0x0073) { # s
+         !!!next-input-character;
+         if ($self->{next_input_character} == 0x0059 or # Y
+             $self->{next_input_character} == 0x0079) { # y
+           !!!next-input-character;
+           if ($self->{next_input_character} == 0x0053 or # S
+               $self->{next_input_character} == 0x0073) { # s
+             !!!next-input-character;
+             if ($self->{next_input_character} == 0x0054 or # T
+                 $self->{next_input_character} == 0x0074) { # t
+               !!!next-input-character;
+               if ($self->{next_input_character} == 0x0045 or # E
+                   $self->{next_input_character} == 0x0065) { # e
+                 !!!next-input-character;
+                 if ($self->{next_input_character} == 0x004D or # M
+                     $self->{next_input_character} == 0x006D) { # m
+                   $self->{state} = 'before DOCTYPE system identifier';
+                   !!!next-input-character;
+                   redo A;
+                 }
+               }
+             }
+           }
+         }
+         #
+       } else {
+         !!!next-input-character;
+         #
+       }
+       !!!parse-error (type => 'string after DOCTYPE name');
+       $self->{state} = 'bogus DOCTYPE';
+       # next-input-character is already done
+       redo A;
+     } elsif ($self->{state} eq 'before DOCTYPE public identifier') {
+       if ({
+             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
+             #0x000D => 1, # HT, LF, VT, FF, SP, CR
+           }->{$self->{next_input_character}}) {
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} eq 0x0022) { # "
+         $self->{current_token}->{public_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE public identifier (double-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} eq 0x0027) { # '
+         $self->{current_token}->{public_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE public identifier (single-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} eq 0x003E) { # >
+         !!!parse-error (type => 'no PUBLIC literal');
+         $self->{state} = 'data';
+         !!!next-input-character;
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed DOCTYPE');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         !!!parse-error (type => 'string after PUBLIC');
+         $self->{state} = 'bogus DOCTYPE';
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'DOCTYPE public identifier (double-quoted)') {
+       if ($self->{next_input_character} == 0x0022) { # "
+         $self->{state} = 'after DOCTYPE public identifier';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed PUBLIC literal');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         $self->{current_token}->{public_identifier} # DOCTYPE
+             .= chr $self->{next_input_character};
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'DOCTYPE public identifier (single-quoted)') {
+       if ($self->{next_input_character} == 0x0027) { # '
+         $self->{state} = 'after DOCTYPE public identifier';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed PUBLIC literal');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         $self->{current_token}->{public_identifier} # DOCTYPE
+             .= chr $self->{next_input_character};
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'after DOCTYPE public identifier') {
+       if ({
+             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
+             #0x000D => 1, # HT, LF, VT, FF, SP, CR
+           }->{$self->{next_input_character}}) {
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0022) { # "
+         $self->{current_token}->{system_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE system identifier (double-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0027) { # '
+         $self->{current_token}->{system_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE system identifier (single-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         $self->{state} = 'data';
+         !!!next-input-character;
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed DOCTYPE');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         !!!parse-error (type => 'string after PUBLIC literal');
+         $self->{state} = 'bogus DOCTYPE';
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'before DOCTYPE system identifier') {
+       if ({
+             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
+             #0x000D => 1, # HT, LF, VT, FF, SP, CR
+           }->{$self->{next_input_character}}) {
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0022) { # "
+         $self->{current_token}->{system_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE system identifier (double-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0027) { # '
+         $self->{current_token}->{system_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE system identifier (single-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         !!!parse-error (type => 'no SYSTEM literal');
+         $self->{state} = 'data';
+         !!!next-input-character;
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed DOCTYPE');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         !!!parse-error (type => 'string after SYSTEM');
+         $self->{state} = 'bogus DOCTYPE';
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'DOCTYPE system identifier (double-quoted)') {
+       if ($self->{next_input_character} == 0x0022) { # "
+         $self->{state} = 'after DOCTYPE system identifier';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed SYSTEM literal');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         $self->{current_token}->{system_identifier} # DOCTYPE
+             .= chr $self->{next_input_character};
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'DOCTYPE system identifier (single-quoted)') {
+       if ($self->{next_input_character} == 0x0027) { # '
+         $self->{state} = 'after DOCTYPE system identifier';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed SYSTEM literal');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         $self->{current_token}->{system_identifier} # DOCTYPE
+             .= chr $self->{next_input_character};
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'after DOCTYPE system identifier') {
+       if ({
+             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
+             #0x000D => 1, # HT, LF, VT, FF, SP, CR
+           }->{$self->{next_input_character}}) {
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         $self->{state} = 'data';
+         !!!next-input-character;
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed DOCTYPE');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
-         undef $self->{current_token};
          redo A;
        } else {
-         !!!parse-error (type => 'string after DOCTYPE name');
+         !!!parse-error (type => 'string after SYSTEM literal');
-         $self->{current_token}->{error} = 1; # DOCTYPE
          $self->{state} = 'bogus DOCTYPE';
          !!!next-input-character;
          redo A;
-Line 1488 
 sub _get_next_token ($) {
+Line 1583 
 sub _get_next_token ($) {
          $self->{state} = 'data';
          !!!next-input-character;
+         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
-         undef $self->{current_token};
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 1497 
 sub _get_next_token ($) {
+Line 1592 
 sub _get_next_token ($) {
          $self->{state} = 'data';
          ## reconsume
+         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1514 
 sub _get_next_token ($) {
+Line 1609 
 sub _get_next_token ($) {
    die "$0: _get_next_token: unexpected case";
  } # _get_next_token
- sub _tokenize_attempt_to_consume_an_entity ($) {
+ sub _tokenize_attempt_to_consume_an_entity ($$) {
-   my $self = shift;
+   my ($self, $in_attr) = @_;
-   if ($self->{next_input_character} == 0x0023) { # #
+   if ({
+        0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
+        0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
+       }->{$self->{next_input_character}}) {
+     ## Don't consume
+     ## No error
+     return undef;
+   } elsif ($self->{next_input_character} == 0x0023) { # #
      !!!next-input-character;
      if ($self->{next_input_character} == 0x0078 or # x
          $self->{next_input_character} == 0x0058) { # X
-       my $num;
+       my $code;
        X: {
          my $x_char = $self->{next_input_character};
          !!!next-input-character;
          if (0x0030 <= $self->{next_input_character} and
              $self->{next_input_character} <= 0x0039) { # 0..9
-           $num ||= 0;
+           $code ||= 0;
-           $num *= 0x10;
+           $code *= 0x10;
-           $num += $self->{next_input_character} - 0x0030;
+           $code += $self->{next_input_character} - 0x0030;
            redo X;
          } elsif (0x0061 <= $self->{next_input_character} and
                   $self->{next_input_character} <= 0x0066) { # a..f
-           ## ISSUE: the spec says U+0078, which is apparently incorrect
+           $code ||= 0;
-           $num ||= 0;
+           $code *= 0x10;
-           $num *= 0x10;
+           $code += $self->{next_input_character} - 0x0060 + 9;
-           $num += $self->{next_input_character} - 0x0060 + 9;
            redo X;
          } elsif (0x0041 <= $self->{next_input_character} and
                   $self->{next_input_character} <= 0x0046) { # A..F
-           ## ISSUE: the spec says U+0058, which is apparently incorrect
+           $code ||= 0;
-           $num ||= 0;
+           $code *= 0x10;
-           $num *= 0x10;
+           $code += $self->{next_input_character} - 0x0040 + 9;
-           $num += $self->{next_input_character} - 0x0040 + 9;
            redo X;
-         } elsif (not defined $num) { # no hexadecimal digit
+         } elsif (not defined $code) { # no hexadecimal digit
            !!!parse-error (type => 'bare hcro');
+           !!!back-next-input-character ($x_char, $self->{next_input_character});
            $self->{next_input_character} = 0x0023; # #
-           !!!back-next-input-character ($x_char);
            return undef;
          } elsif ($self->{next_input_character} == 0x003B) { # ;
            !!!next-input-character;
-Line 1556 
 sub _tokenize_attempt_to_consume_an_enti
+Line 1656 
 sub _tokenize_attempt_to_consume_an_enti
            !!!parse-error (type => 'no refc');
          }
-         ## TODO: check the definition for |a valid Unicode character|.
+         if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
-         ## <http://lists.whatwg.org/pipermail/whatwg-whatwg.org/2006-December/thread.html#8189>
+           !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
-         if ($num > 1114111 or $num == 0) {
+           $code = 0xFFFD;
-           $num = 0xFFFD; # REPLACEMENT CHARACTER
+         } elsif ($code > 0x10FFFF) {
-           ## ISSUE: Why this is not an error?
+           !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
-         } elsif (0x80 <= $num and $num <= 0x9F) {
+           $code = 0xFFFD;
-           !!!parse-error (type => sprintf 'c1 entity:U+%04X', $num);
+         } elsif ($code == 0x000D) {
-           $num = $c1_entity_char->{$num};
+           !!!parse-error (type => 'CR character reference');
+           $code = 0x000A;
+         } elsif (0x80 <= $code and $code <= 0x9F) {
+           !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
+           $code = $c1_entity_char->{$code};
          }
-         return {type => 'character', data => chr $num};
+         return {type => 'character', data => chr $code};
        } # X
      } elsif (0x0030 <= $self->{next_input_character} and
               $self->{next_input_character} <= 0x0039) { # 0..9
-Line 1587 
 sub _tokenize_attempt_to_consume_an_enti
+Line 1691 
 sub _tokenize_attempt_to_consume_an_enti
          !!!parse-error (type => 'no refc');
        }
-       ## TODO: check the definition for |a valid Unicode character|.
+       if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
-       if ($code > 1114111 or $code == 0) {
+         !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);
-         $code = 0xFFFD; # REPLACEMENT CHARACTER
+         $code = 0xFFFD;
-         ## ISSUE: Why this is not an error?
+       } elsif ($code > 0x10FFFF) {
+         !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);
+         $code = 0xFFFD;
+       } elsif ($code == 0x000D) {
+         !!!parse-error (type => 'CR character reference');
+         $code = 0x000A;
        } elsif (0x80 <= $code and $code <= 0x9F) {
-         !!!parse-error (type => sprintf 'c1 entity:U+%04X', $code);
+         !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);
          $code = $c1_entity_char->{$code};
        }
-Line 1611 
 sub _tokenize_attempt_to_consume_an_enti
+Line 1720 
 sub _tokenize_attempt_to_consume_an_enti
      !!!next-input-character;
      my $value = $entity_name;
-     my $match;
+     my $match = 0;
+     require Whatpm::_NamedEntityList;
+     our $EntityChar;
      while (length $entity_name < 10 and
             ## NOTE: Some number greater than the maximum length of entity name
-            ((0x0041 <= $self->{next_input_character} and
+            ((0x0041 <= $self->{next_input_character} and # A
-              $self->{next_input_character} <= 0x005A) or
+              $self->{next_input_character} <= 0x005A) or # Z
-             (0x0061 <= $self->{next_input_character} and
+             (0x0061 <= $self->{next_input_character} and # a
-              $self->{next_input_character} <= 0x007A) or
+              $self->{next_input_character} <= 0x007A) or # z
-             (0x0030 <= $self->{next_input_character} and
+             (0x0030 <= $self->{next_input_character} and # 0
-              $self->{next_input_character} <= 0x0039))) {
+              $self->{next_input_character} <= 0x0039) or # 9
+             $self->{next_input_character} == 0x003B)) { # ;
        $entity_name .= chr $self->{next_input_character};
-       if (defined $entity_char->{$entity_name}) {
+       if (defined $EntityChar->{$entity_name}) {
-         $value = $entity_char->{$entity_name};
+         if ($self->{next_input_character} == 0x003B) { # ;
-         $match = 1;
+           $value = $EntityChar->{$entity_name};
+           $match = 1;
+           !!!next-input-character;
+           last;
+         } else {
+           $value = $EntityChar->{$entity_name};
+           $match = -1;
+           !!!next-input-character;
+         }
        } else {
          $value .= chr $self->{next_input_character};
+         $match *= 2;
+         !!!next-input-character;
        }
-       !!!next-input-character;
      }
-     if ($match) {
+     if ($match > 0) {
-       if ($self->{next_input_character} == 0x003B) { # ;
+       return {type => 'character', data => $value};
-         !!!next-input-character;
+     } elsif ($match < 0) {
+       !!!parse-error (type => 'no refc');
+       if ($in_attr and $match < -1) {
+         return {type => 'character', data => '&'.$entity_name};
        } else {
-         !!!parse-error (type => 'refc');
+         return {type => 'character', data => $value};
        }
-       return {type => 'character', data => $value};
      } else {
        !!!parse-error (type => 'bare ero');
        ## NOTE: No characters are consumed in the spec.
-       !!!back-token ({type => 'character', data => $value});
+       return {type => 'character', data => '&'.$value};
-       return undef;
      }
    } else {
      ## no characters are consumed
-Line 1658 
 sub _initialize_tree_constructor ($) {
+Line 1779 
 sub _initialize_tree_constructor ($) {
    $self->{document}->strict_error_checking (0);
    ## TODO: Turn mutation events off # MUST
    ## TODO: Turn loose Document option (manakai extension) on
-   ## TODO: Mark the Document as an HTML document # MUST
+   $self->{document}->manakai_is_html (1); # MUST
  } # _initialize_tree_constructor
  sub _terminate_tree_constructor ($) {
-Line 1698 
 sub _construct_tree ($) {
+Line 1819 
 sub _construct_tree ($) {
  sub _tree_construction_initial ($) {
    my $self = shift;
-   B: {
+   INITIAL: {
-       if ($token->{type} eq 'DOCTYPE') {
+     if ($token->{type} eq 'DOCTYPE') {
-         if ($token->{error}) {
+       ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
-           ## ISSUE: Spec currently left this case undefined.
+       ## error, switch to a conformance checking mode for another
-           !!!parse-error (type => 'bogus DOCTYPE');
+       ## language.
-         }
+       my $doctype_name = $token->{name};
-         my $doctype = $self->{document}->create_document_type_definition
+       $doctype_name = '' unless defined $doctype_name;
-           ($token->{name});
+       $doctype_name =~ tr/a-z/A-Z/;
-         $self->{document}->append_child ($doctype);
+       if (not defined $token->{name} or # <!DOCTYPE>
-         #$phase = 'root element';
+           defined $token->{public_identifier} or
-         !!!next-token;
+           defined $token->{system_identifier}) {
-         #redo B;
+         !!!parse-error (type => 'not HTML5');
-         return;
+       } elsif ($doctype_name ne 'HTML') {
-       } elsif ({
+         ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
-                 comment => 1,
+         !!!parse-error (type => 'not HTML5');
-                 'start tag' => 1,
+       }
-                 'end tag' => 1,
-                 'end-of-file' => 1,
+       my $doctype = $self->{document}->create_document_type_definition
-                }->{$token->{type}}) {
+         ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
-         ## ISSUE: Spec currently left this case undefined.
+       $doctype->public_id ($token->{public_identifier})
-         !!!parse-error (type => 'missing DOCTYPE');
+           if defined $token->{public_identifier};
-         #$phase = 'root element';
+       $doctype->system_id ($token->{system_identifier})
-         ## reprocess
+           if defined $token->{system_identifier};
-         #redo B;
+       ## NOTE: Other DocumentType attributes are null or empty lists.
-         return;
+       ## ISSUE: internalSubset = null??
-       } elsif ($token->{type} eq 'character') {
+       $self->{document}->append_child ($doctype);
-         if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
-           $self->{document}->manakai_append_text ($1);
+       if (not $token->{correct} or $doctype_name ne 'HTML') {
-           ## ISSUE: DOM3 Core does not allow Document > Text
+         $self->{document}->manakai_compat_mode ('quirks');
-           unless (length $token->{data}) {
+       } elsif (defined $token->{public_identifier}) {
-             ## Stay in the phase
+         my $pubid = $token->{public_identifier};
-             !!!next-token;
+         $pubid =~ tr/a-z/A-z/;
-             redo B;
+         if ({
+           "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
+           "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
+           "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
+           "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
+           "-//IETF//DTD HTML 2.0//EN" => 1,
+           "-//IETF//DTD HTML 2.1E//EN" => 1,
+           "-//IETF//DTD HTML 3.0//EN" => 1,
+           "-//IETF//DTD HTML 3.0//EN//" => 1,
+           "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
+           "-//IETF//DTD HTML 3.2//EN" => 1,
+           "-//IETF//DTD HTML 3//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 0//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
+           "-//IETF//DTD HTML LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
+           "-//IETF//DTD HTML LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
+           "-//IETF//DTD HTML LEVEL 3//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
+           "-//IETF//DTD HTML STRICT//EN" => 1,
+           "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
+           "-//IETF//DTD HTML//EN" => 1,
+           "-//IETF//DTD HTML//EN//2.0" => 1,
+           "-//IETF//DTD HTML//EN//3.0" => 1,
+           "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
+           "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
+           "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
+           "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
+           "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
+           "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
+           "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
+           "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
+           "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
+           "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
+           "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
+           "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
+           "-//W3C//DTD HTML 3.2//EN" => 1,
+           "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
+           "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
+           "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
+           "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
+           "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
+           "-//W3C//DTD W3 HTML//EN" => 1,
+           "-//W3O//DTD W3 HTML 3.0//EN" => 1,
+           "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
+           "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
+           "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
+           "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
+           "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
+           "HTML" => 1,
+         }->{$pubid}) {
+           $self->{document}->manakai_compat_mode ('quirks');
+         } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
+                  $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
+           if (defined $token->{system_identifier}) {
+             $self->{document}->manakai_compat_mode ('quirks');
+           } else {
+             $self->{document}->manakai_compat_mode ('limited quirks');
            }
+         } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 Frameset//EN" or
+                  $pubid eq "-//W3C//DTD XHTML 1.0 Transitional//EN") {
+           $self->{document}->manakai_compat_mode ('limited quirks');
+         }
+       }
+       if (defined $token->{system_identifier}) {
+         my $sysid = $token->{system_identifier};
+         $sysid =~ tr/A-Z/a-z/;
+         if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
+           $self->{document}->manakai_compat_mode ('quirks');
          }
-         ## ISSUE: Spec currently left this case undefined.
-         !!!parse-error (type => 'missing DOCTYPE');
-         #$phase = 'root element';
-         ## reprocess
-         #redo B;
-         return;
-       } else {
-         die "$0: $token->{type}: Unknown token";
        }
-     } # B
+       ## Go to the root element phase.
+       !!!next-token;
+       return;
+     } elsif ({
+               'start tag' => 1,
+               'end tag' => 1,
+               'end-of-file' => 1,
+              }->{$token->{type}}) {
+       !!!parse-error (type => 'no DOCTYPE');
+       $self->{document}->manakai_compat_mode ('quirks');
+       ## Go to the root element phase
+       ## reprocess
+       return;
+     } elsif ($token->{type} eq 'character') {
+       if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
+         ## Ignore the token
+         unless (length $token->{data}) {
+           ## Stay in the phase
+           !!!next-token;
+           redo INITIAL;
+         }
+       }
+       !!!parse-error (type => 'no DOCTYPE');
+       $self->{document}->manakai_compat_mode ('quirks');
+       ## Go to the root element phase
+       ## reprocess
+       return;
+     } elsif ($token->{type} eq 'comment') {
+       my $comment = $self->{document}->create_comment ($token->{data});
+       $self->{document}->append_child ($comment);
+       ## Stay in the phase.
+       !!!next-token;
+       redo INITIAL;
+     } else {
+       die "$0: $token->{type}: Unknown token";
+     }
+   } # INITIAL
  } # _tree_construction_initial
  sub _tree_construction_root_element ($) {
-Line 1762 
 sub _tree_construction_root_element ($)
+Line 2002 
 sub _tree_construction_root_element ($)
          !!!next-token;
          redo B;
        } elsif ($token->{type} eq 'character') {
-         if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
+         if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
-           $self->{document}->manakai_append_text ($1);
+           ## Ignore the token.
-           ## ISSUE: DOM3 Core does not allow Document > Text
            unless (length $token->{data}) {
              ## Stay in the phase
              !!!next-token;
-Line 1785 
 sub _tree_construction_root_element ($)
+Line 2025 
 sub _tree_construction_root_element ($)
        my $root_element; !!!create-element ($root_element, 'html');
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}}, [$root_element, 'html'];
-       #$phase = 'main';
        ## reprocess
        #redo B;
-       return;
+       return; ## Go to the main phase.
    } # B
  } # _tree_construction_root_element
-Line 1804 
 sub _reset_insertion_mode ($) {
+Line 2043 
 sub _reset_insertion_mode ($) {
      ## Step 3
      S3: {
-       $last = 1 if $self->{open_elements}->[0]->[0] eq $node->[0];
+       ## ISSUE: Oops! "If node is the first node in the stack of open
-       if (defined $self->{inner_html_node}) {
+       ## elements, then set last to true. If the context element of the
-         if ($self->{inner_html_node}->[1] eq 'td' or
+       ## HTML fragment parsing algorithm is neither a td element nor a
-             $self->{inner_html_node}->[1] eq 'th') {
+       ## th element, then set node to the context element. (fragment case)":
-           #
+       ## The second "if" is in the scope of the first "if"!?
-         } else {
+       if ($self->{open_elements}->[0]->[0] eq $node->[0]) {
-           $node = $self->{inner_html_node};
+         $last = 1;
+         if (defined $self->{inner_html_node}) {
+           if ($self->{inner_html_node}->[1] eq 'td' or
+               $self->{inner_html_node}->[1] eq 'th') {
+             #
+           } else {
+             $node = $self->{inner_html_node};
+           }
          }
        }
-Line 1857 
 sub _reset_insertion_mode ($) {
+Line 2103 
 sub _reset_insertion_mode ($) {
  sub _tree_construction_main ($) {
    my $self = shift;
-   my $phase = 'main';
+   my $previous_insertion_mode;
    my $active_formatting_elements = [];
-Line 1941 
 sub _tree_construction_main ($) {
+Line 2187 
 sub _tree_construction_main ($) {
      }
    }; # $clear_up_to_marker
-   my $style_start_tag = sub {
+   my $parse_rcdata = sub ($$) { ## Generic CDATA/RCDATA element handler, called as ->($content_model_flag, $insert)
-     my $style_el; !!!create-element ($style_el, 'style', $token->{attributes});
+     my ($content_model_flag, $insert) = @_; ## $insert: code reference that receives the newly created element
-     ## $self->{insertion_mode} eq 'in head' and ... (always true)
-     (($self->{insertion_mode} eq 'in head' and defined $self->{head_element})
+     ## Step 1
-      ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
+     my $start_tag_name = $token->{tag_name}; ## Saved now; $token is replaced by later tokens before Step 7 compares against it
-       ->append_child ($style_el);
+     my $el;
-     $self->{content_model_flag} = 'CDATA';
+     !!!create-element ($el, $start_tag_name, $token->{attributes});
+     ## Step 2
+     $insert->($el); # /context node/->append_child ($el)
+     ## Step 3
+     $self->{content_model} = $content_model_flag; # CDATA or RCDATA
      delete $self->{escape}; # MUST
+     ## Step 4
      my $text = '';
      !!!next-token;
-     while ($token->{type} eq 'character') {
+     while ($token->{type} eq 'character') { # or until stop tokenizing
        $text .= $token->{data};
        !!!next-token;
-     } # stop if non-character token or tokenizer stops tokenising
+     }
+     ## Step 5
      if (length $text) {
-       $style_el->manakai_append_text ($text);
+       my $text = $self->{document}->create_text_node ($text); ## The argument is still the outer string $text; the new lexical $text (the Text node) only becomes visible on the next statement
+       $el->append_child ($text);
      }
-     $self->{content_model_flag} = 'PCDATA';
+     ## Step 6
+     $self->{content_model} = PCDATA_CONTENT_MODEL;
-     if ($token->{type} eq 'end tag' and $token->{tag_name} eq 'style') {
+     ## Step 7
+     if ($token->{type} eq 'end tag' and $token->{tag_name} eq $start_tag_name) {
        ## Ignore the token
-     } else {
+     } elsif ($content_model_flag == CDATA_CONTENT_MODEL) {
        !!!parse-error (type => 'in CDATA:#'.$token->{type});
-       ## ISSUE: And ignore?
+     } elsif ($content_model_flag == RCDATA_CONTENT_MODEL) {
+       !!!parse-error (type => 'in RCDATA:#'.$token->{type});
+     } else {
+       die "$0: $content_model_flag in parse_rcdata"; ## Reached only when the flag passed in is neither CDATA nor RCDATA
      }
      !!!next-token;
-   }; # $style_start_tag
+   }; # $parse_rcdata
-   my $script_start_tag = sub {
+   my $script_start_tag = sub ($) {
+     my $insert = $_[0];
      my $script_el;
      !!!create-element ($script_el, 'script', $token->{attributes});
      ## TODO: mark as "parser-inserted"
-     $self->{content_model_flag} = 'CDATA';
+     $self->{content_model} = CDATA_CONTENT_MODEL;
      delete $self->{escape}; # MUST
      my $text = '';
-Line 1989 
 sub _tree_construction_main ($) {
+Line 2251 
 sub _tree_construction_main ($) {
        $script_el->manakai_append_text ($text);
      }
-     $self->{content_model_flag} = 'PCDATA';
+     $self->{content_model} = PCDATA_CONTENT_MODEL;
      if ($token->{type} eq 'end tag' and
          $token->{tag_name} eq 'script') {
-Line 2005 
 sub _tree_construction_main ($) {
+Line 2267 
 sub _tree_construction_main ($) {
      } else {
        ## TODO: $old_insertion_point = current insertion point
        ## TODO: insertion point = just before the next input character
-       (($self->{insertion_mode} eq 'in head' and defined $self->{head_element})
+       $insert->($script_el);
-        ? $self->{head_element} : $self->{open_elements}->[-1]->[0])->append_child ($script_el);
        ## TODO: insertion point = $old_insertion_point (might be "undefined")
-Line 2201 
 sub _tree_construction_main ($) {
+Line 2462 
 sub _tree_construction_main ($) {
    }; # $formatting_end_tag
    my $insert_to_current = sub { ## Appends the node given as the first argument to the element on top of the open-elements stack
-     $self->{open_elements}->[-1]->[0]->append_child (shift);
+     $self->{open_elements}->[-1]->[0]->append_child ($_[0]);
    }; # $insert_to_current
    my $insert_to_foster = sub {
-Line 2239 
 sub _tree_construction_main ($) {
+Line 2500 
 sub _tree_construction_main ($) {
      my $insert = shift;
      if ($token->{type} eq 'start tag') {
        if ($token->{tag_name} eq 'script') {
-         $script_start_tag->();
+         ## NOTE: This is an "as if in head" code clone
+         $script_start_tag->($insert);
          return;
        } elsif ($token->{tag_name} eq 'style') {
-         $style_start_tag->();
+         ## NOTE: This is an "as if in head" code clone
+         $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
          return;
        } elsif ({
-                 base => 1, link => 1, meta => 1,
+                 base => 1, link => 1,
                 }->{$token->{tag_name}}) {
-         !!!parse-error (type => 'in body:'.$token->{tag_name});
+         ## NOTE: This is an "as if in head" code clone, only "-t" differs
-         ## NOTE: This is an "as if in head" code clone
+         !!!insert-element-t ($token->{tag_name}, $token->{attributes});
-         my $el;
+         pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
-         !!!create-element ($el, $token->{tag_name}, $token->{attributes});
+         !!!next-token;
-         if (defined $self->{head_element}) {
+         return;
-           $self->{head_element}->append_child ($el);
+       } elsif ($token->{tag_name} eq 'meta') {
-         } else {
+         ## NOTE: This is an "as if in head" code clone, only "-t" differs
-           $insert->($el);
+         !!!insert-element-t ($token->{tag_name}, $token->{attributes});
+         pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
+         unless ($self->{confident}) {
+           my $charset;
+           if ($token->{attributes}->{charset}) { ## TODO: And if supported
+             $charset = $token->{attributes}->{charset}->{value};
+           }
+           if ($token->{attributes}->{'http-equiv'}) {
+             ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
+             if ($token->{attributes}->{'http-equiv'}->{value}
+                 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
+                     [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
+                     ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
+               $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
+             } ## TODO: And if supported
+           }
+           ## TODO: Change the encoding
          }
          !!!next-token;
          return;
        } elsif ($token->{tag_name} eq 'title') {
          !!!parse-error (type => 'in body:title');
-         ## NOTE: There is an "as if in head" code clone
+         ## NOTE: This is an "as if in head" code clone
-         my $title_el;
+         $parse_rcdata->(RCDATA_CONTENT_MODEL, sub {
-         !!!create-element ($title_el, 'title', $token->{attributes});
+           if (defined $self->{head_element}) {
-         (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
+             $self->{head_element}->append_child ($_[0]);
-           ->append_child ($title_el);
+           } else {
-         $self->{content_model_flag} = 'RCDATA';
+             $insert->($_[0]);
-         delete $self->{escape}; # MUST
+           }
+         });
-         my $text = '';
-         !!!next-token;
-         while ($token->{type} eq 'character') {
-           $text .= $token->{data};
-           !!!next-token;
-         }
-         if (length $text) {
-           $title_el->manakai_append_text ($text);
-         }
-         $self->{content_model_flag} = 'PCDATA';
-         if ($token->{type} eq 'end tag' and
-             $token->{tag_name} eq 'title') {
-           ## Ignore the token
-         } else {
-           !!!parse-error (type => 'in RCDATA:#'.$token->{type});
-           ## ISSUE: And ignore?
-         }
-         !!!next-token;
          return;
        } elsif ($token->{tag_name} eq 'body') {
          !!!parse-error (type => 'in body:body');
-Line 2391 
 sub _tree_construction_main ($) {
+Line 2651 
 sub _tree_construction_main ($) {
              if ($i != -1) {
                !!!parse-error (type => 'end tag missing:'.
                                $self->{open_elements}->[-1]->[1]);
-               ## TODO: test
              }
              splice @{$self->{open_elements}}, $i;
              last LI;
-Line 2439 
 sub _tree_construction_main ($) {
+Line 2698 
 sub _tree_construction_main ($) {
              if ($i != -1) {
                !!!parse-error (type => 'end tag missing:'.
                                $self->{open_elements}->[-1]->[1]);
-               ## TODO: test
              }
              splice @{$self->{open_elements}}, $i;
              last LI;
-Line 2480 
 sub _tree_construction_main ($) {
+Line 2738 
 sub _tree_construction_main ($) {
          !!!insert-element-t ($token->{tag_name}, $token->{attributes});
-         $self->{content_model_flag} = 'PLAINTEXT';
+         $self->{content_model} = PLAINTEXT_CONTENT_MODEL;
          !!!next-token;
          return;
-Line 2502 
 sub _tree_construction_main ($) {
+Line 2760 
 sub _tree_construction_main ($) {
            }
          } # INSCOPE
+         ## NOTE: See <http://html5.org/tools/web-apps-tracker?from=925&to=926>
          ## has an element in scope
-         my $i;
+         #my $i;
-         INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
+         #INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
-           my $node = $self->{open_elements}->[$_];
+         #  my $node = $self->{open_elements}->[$_];
-           if ({
+         #  if ({
-                h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
+         #       h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
-               }->{$node->[1]}) {
+         #      }->{$node->[1]}) {
-             $i = $_;
+         #    $i = $_;
-             last INSCOPE;
+         #    last INSCOPE;
-           } elsif ({
+         #  } elsif ({
-                     table => 1, caption => 1, td => 1, th => 1,
+         #            table => 1, caption => 1, td => 1, th => 1,
-                     button => 1, marquee => 1, object => 1, html => 1,
+         #            button => 1, marquee => 1, object => 1, html => 1,
-                    }->{$node->[1]}) {
+         #           }->{$node->[1]}) {
-             last INSCOPE;
+         #    last INSCOPE;
-           }
+         #  }
-         } # INSCOPE
+         #} # INSCOPE
+         #
-         if (defined $i) {
+         #if (defined $i) {
-           !!!parse-error (type => 'in hn:hn');
+         #  !!! parse-error (type => 'in hn:hn');
-           splice @{$self->{open_elements}}, $i;
+         #  splice @{$self->{open_elements}}, $i;
-         }
+         #}
          !!!insert-element-t ($token->{tag_name}, $token->{attributes});
-Line 2565 
 sub _tree_construction_main ($) {
+Line 2824 
 sub _tree_construction_main ($) {
          return;
        } elsif ({
                  b => 1, big => 1, em => 1, font => 1, i => 1,
-                 nobr => 1, s => 1, small => 1, strile => 1,
+                 s => 1, small => 1, strile => 1,
                  strong => 1, tt => 1, u => 1,
                 }->{$token->{tag_name}}) {
          $reconstruct_active_formatting_elements->($insert_to_current);
-Line 2575 
 sub _tree_construction_main ($) {
+Line 2834 
 sub _tree_construction_main ($) {
          !!!next-token;
          return;
+       } elsif ($token->{tag_name} eq 'nobr') {
+         $reconstruct_active_formatting_elements->($insert_to_current);
+         ## has a |nobr| element in scope
+         INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
+           my $node = $self->{open_elements}->[$_];
+           if ($node->[1] eq 'nobr') {
+             !!!parse-error (type => 'not closed:nobr');
+             !!!back-token;
+             $token = {type => 'end tag', tag_name => 'nobr'};
+             return;
+           } elsif ({
+                     table => 1, caption => 1, td => 1, th => 1,
+                     button => 1, marquee => 1, object => 1, html => 1,
+                    }->{$node->[1]}) {
+             last INSCOPE;
+           }
+         } # INSCOPE
+         !!!insert-element-t ($token->{tag_name}, $token->{attributes});
+         push @$active_formatting_elements, $self->{open_elements}->[-1];
+         !!!next-token;
+         return;
        } elsif ($token->{tag_name} eq 'button') {
          ## has a button element in scope
          INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
-Line 2610 
 sub _tree_construction_main ($) {
+Line 2893 
 sub _tree_construction_main ($) {
          return;
        } elsif ($token->{tag_name} eq 'xmp') {
          $reconstruct_active_formatting_elements->($insert_to_current);
+         $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
-         !!!insert-element-t ($token->{tag_name}, $token->{attributes});
-         $self->{content_model_flag} = 'CDATA';
-         delete $self->{escape}; # MUST
-         !!!next-token;
          return;
        } elsif ($token->{tag_name} eq 'table') {
          ## has a p element in scope
-Line 2648 
 sub _tree_construction_main ($) {
+Line 2925 
 sub _tree_construction_main ($) {
            !!!parse-error (type => 'image');
            $token->{tag_name} = 'img';
          }
+         ## NOTE: There is an "as if <br>" code clone.
          $reconstruct_active_formatting_elements->($insert_to_current);
          !!!insert-element-t ($token->{tag_name}, $token->{attributes});
-Line 2694 
 sub _tree_construction_main ($) {
+Line 2972 
 sub _tree_construction_main ($) {
            return;
          } else {
            my $at = $token->{attributes};
+           my $form_attrs;
+           $form_attrs->{action} = $at->{action} if $at->{action};
+           my $prompt_attr = $at->{prompt};
            $at->{name} = {name => 'name', value => 'isindex'};
+           delete $at->{action};
+           delete $at->{prompt};
            my @tokens = (
-                         {type => 'start tag', tag_name => 'form'},
+                         {type => 'start tag', tag_name => 'form',
+                          attributes => $form_attrs},
                          {type => 'start tag', tag_name => 'hr'},
                          {type => 'start tag', tag_name => 'p'},
                          {type => 'start tag', tag_name => 'label'},
-                         {type => 'character',
+                        );
-                          data => 'This is a searchable index. Insert your search keywords here: '}, # SHOULD
+           if ($prompt_attr) {
-                         ## TODO: make this configurable
+             push @tokens, {type => 'character', data => $prompt_attr->{value}};
+           } else {
+             push @tokens, {type => 'character',
+                            data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
+             ## TODO: make this configurable
+           }
+           push @tokens,
                          {type => 'start tag', tag_name => 'input', attributes => $at},
                          #{type => 'character', data => ''}, # SHOULD
                          {type => 'end tag', tag_name => 'label'},
                          {type => 'end tag', tag_name => 'p'},
                          {type => 'start tag', tag_name => 'hr'},
-                         {type => 'end tag', tag_name => 'form'},
+                         {type => 'end tag', tag_name => 'form'};
-                        );
            $token = shift @tokens;
            !!!back-token (@tokens);
            return;
          }
-       } elsif ({
+       } elsif ($token->{tag_name} eq 'textarea') {
-                 textarea => 1,
-                 iframe => 1,
-                 noembed => 1,
-                 noframes => 1,
-                 noscript => 0, ## TODO: 1 if scripting is enabled
-                }->{$token->{tag_name}}) {
          my $tag_name = $token->{tag_name};
          my $el;
          !!!create-element ($el, $token->{tag_name}, $token->{attributes});
-         if ($token->{tag_name} eq 'textarea') {
+         ## TODO: $self->{form_element} if defined
-           ## TODO: $self->{form_element} if defined
+         $self->{content_model} = RCDATA_CONTENT_MODEL;
-           $self->{content_model_flag} = 'RCDATA';
-         } else {
-           $self->{content_model_flag} = 'CDATA';
-         }
          delete $self->{escape}; # MUST
          $insert->($el);
          my $text = '';
-         if ($token->{tag_name} eq 'textarea') {
+         !!!next-token;
-           !!!next-token;
+         if ($token->{type} eq 'character') {
-           if ($token->{type} eq 'character') {
+           $token->{data} =~ s/^\x0A//;
-             $token->{data} =~ s/^\x0A//;
+           unless (length $token->{data}) {
-             unless (length $token->{data}) {
+             !!!next-token;
-               !!!next-token;
-             }
            }
-         } else {
-           !!!next-token;
          }
          while ($token->{type} eq 'character') {
            $text .= $token->{data};
-Line 2755 
 sub _tree_construction_main ($) {
+Line 3030 
 sub _tree_construction_main ($) {
            $el->manakai_append_text ($text);
          }
-         $self->{content_model_flag} = 'PCDATA';
+         $self->{content_model} = PCDATA_CONTENT_MODEL;
          if ($token->{type} eq 'end tag' and
              $token->{tag_name} eq $tag_name) {
            ## Ignore the token
          } else {
-           if ($token->{tag_name} eq 'textarea') {
+           !!!parse-error (type => 'in RCDATA:#'.$token->{type});
-             !!!parse-error (type => 'in RCDATA:#'.$token->{type});
-           } else {
-             !!!parse-error (type => 'in CDATA:#'.$token->{type});
-           }
-           ## ISSUE: And ignore?
          }
          !!!next-token;
          return;
+       } elsif ({
+                 iframe => 1,
+                 noembed => 1,
+                 noframes => 1,
+                 noscript => 0, ## TODO: 1 if scripting is enabled
+                }->{$token->{tag_name}}) {
+         $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
+         return;
        } elsif ($token->{tag_name} eq 'select') {
          $reconstruct_active_formatting_elements->($insert_to_current);
-Line 2800 
 sub _tree_construction_main ($) {
+Line 3078 
 sub _tree_construction_main ($) {
        }
      } elsif ($token->{type} eq 'end tag') {
        if ($token->{tag_name} eq 'body') {
-         if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
+         if (@{$self->{open_elements}} > 1 and
-           ## ISSUE: There is an issue in the spec.
+             $self->{open_elements}->[1]->[1] eq 'body') {
-           if ($self->{open_elements}->[-1]->[1] ne 'body') {
+           for (@{$self->{open_elements}}) {
-             !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
+             unless ({
+                        dd => 1, dt => 1, li => 1, p => 1, td => 1,
+                        th => 1, tr => 1, body => 1, html => 1,
+                      tbody => 1, tfoot => 1, thead => 1,
+                     }->{$_->[1]}) {
+               !!!parse-error (type => 'not closed:'.$_->[1]);
+             }
            }
            $self->{insertion_mode} = 'after body';
            !!!next-token;
            return;
-Line 2849 
 sub _tree_construction_main ($) {
+Line 3134 
 sub _tree_construction_main ($) {
                   li => ($token->{tag_name} ne 'li'),
                   p => ($token->{tag_name} ne 'p'),
                   td => 1, th => 1, tr => 1,
+                  tbody => 1, tfoot=> 1, thead => 1,
                  }->{$self->{open_elements}->[-1]->[1]}) {
                !!!back-token;
                $token = {type => 'end tag',
-Line 2866 
 sub _tree_construction_main ($) {
+Line 3152 
 sub _tree_construction_main ($) {
          } # INSCOPE
          if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
-           !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
+           if (defined $i) {
+             !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
+           } else {
+             !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
+           }
          }
-         splice @{$self->{open_elements}}, $i if defined $i;
+         if (defined $i) {
+           splice @{$self->{open_elements}}, $i;
+         } elsif ($token->{tag_name} eq 'p') {
+           ## As if <p>, then reprocess the current token
+           my $el;
+           !!!create-element ($el, 'p');
+           $insert->($el);
+         }
          $clear_up_to_marker->()
            if {
              button => 1, marquee => 1, object => 1,
-Line 2885 
 sub _tree_construction_main ($) {
+Line 3182 
 sub _tree_construction_main ($) {
              if ({
                   dd => 1, dt => 1, li => 1, p => 1,
                   td => 1, th => 1, tr => 1,
+                  tbody => 1, tfoot=> 1, thead => 1,
                  }->{$self->{open_elements}->[-1]->[1]}) {
                !!!back-token;
                $token = {type => 'end tag',
-Line 2923 
 sub _tree_construction_main ($) {
+Line 3221 
 sub _tree_construction_main ($) {
              if ({
                   dd => 1, dt => 1, li => 1, p => 1,
                   td => 1, th => 1, tr => 1,
+                  tbody => 1, tfoot=> 1, thead => 1,
                  }->{$self->{open_elements}->[-1]->[1]}) {
                !!!back-token;
                $token = {type => 'end tag',
-Line 2953 
 sub _tree_construction_main ($) {
+Line 3252 
 sub _tree_construction_main ($) {
                  strong => 1, tt => 1, u => 1,
                 }->{$token->{tag_name}}) {
          $formatting_end_tag->($token->{tag_name});
- ## TODO: <http://html5.org/tools/web-apps-tracker?from=883&to=884>
+         return;
+       } elsif ($token->{tag_name} eq 'br') {
+         !!!parse-error (type => 'unmatched end tag:br');
+         ## As if <br>
+         $reconstruct_active_formatting_elements->($insert_to_current);
+         my $el;
+         !!!create-element ($el, 'br');
+         $insert->($el);
+         ## Ignore the token.
+         !!!next-token;
          return;
        } elsif ({
                  caption => 1, col => 1, colgroup => 1, frame => 1,
                  frameset => 1, head => 1, option => 1, optgroup => 1,
                  tbody => 1, td => 1, tfoot => 1, th => 1,
                  thead => 1, tr => 1,
-                 area => 1, basefont => 1, bgsound => 1, br => 1,
+                 area => 1, basefont => 1, bgsound => 1,
                  embed => 1, hr => 1, iframe => 1, image => 1,
                  img => 1, input => 1, isindex => 1, noembed => 1,
                  noframes => 1, param => 1, select => 1, spacer => 1,
-Line 2987 
 sub _tree_construction_main ($) {
+Line 3298 
 sub _tree_construction_main ($) {
              if ({
                   dd => 1, dt => 1, li => 1, p => 1,
                   td => 1, th => 1, tr => 1,
+                  tbody => 1, tfoot=> 1, thead => 1,
                  }->{$self->{open_elements}->[-1]->[1]}) {
                !!!back-token;
                $token = {type => 'end tag',
-Line 3010 
 sub _tree_construction_main ($) {
+Line 3322 
 sub _tree_construction_main ($) {
                  #not $phrasing_category->{$node->[1]} and
                  ($special_category->{$node->[1]} or
                   $scoping_category->{$node->[1]})) {
-               !!!parse-error (type => 'not closed:'.$node->[1]);
+               !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
                ## Ignore the token
                !!!next-token;
                last S2;
-Line 3030 
 sub _tree_construction_main ($) {
+Line 3342 
 sub _tree_construction_main ($) {
    }; # $in_body
    B: {
-     if ($phase eq 'main') {
+     if ($token->{type} eq 'DOCTYPE') {
-       if ($token->{type} eq 'DOCTYPE') {
+       !!!parse-error (type => 'DOCTYPE in the middle');
-         !!!parse-error (type => 'in html:#DOCTYPE');
+       ## Ignore the token
-         ## Ignore the token
+       ## Stay in the phase
-         ## Stay in the phase
+       !!!next-token;
-         !!!next-token;
+       redo B;
-         redo B;
+     } elsif ($token->{type} eq 'end-of-file') {
-       } elsif ($token->{type} eq 'start tag' and
+       if ($token->{insertion_mode} ne 'trailing end') {
-                $token->{tag_name} eq 'html') {
-         ## TODO: unless it is the first start tag token, parse-error
-         my $top_el = $self->{open_elements}->[0]->[0];
-         for my $attr_name (keys %{$token->{attributes}}) {
-           unless ($top_el->has_attribute_ns (undef, $attr_name)) {
-             $top_el->set_attribute_ns
-               (undef, [undef, $attr_name],
-                $token->{attributes}->{$attr_name}->{value});
-           }
-         }
-         !!!next-token;
-         redo B;
-       } elsif ($token->{type} eq 'end-of-file') {
          ## Generate implied end tags
          if ({
               dd => 1, dt => 1, li => 1, p => 1, td => 1, th => 1, tr => 1,
+              tbody => 1, tfoot=> 1, thead => 1,
              }->{$self->{open_elements}->[-1]->[1]}) {
            !!!back-token;
            $token = {type => 'end tag', tag_name => $self->{open_elements}->[-1]->[1]};
-Line 3069 
 sub _tree_construction_main ($) {
+Line 3369 
 sub _tree_construction_main ($) {
            !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
          }
-         ## Stop parsing
-         last B;
          ## ISSUE: There is an issue in the spec.
+       }
+       ## Stop parsing
+       last B;
+     } elsif ($token->{type} eq 'start tag' and
+              $token->{tag_name} eq 'html') {
+       if ($self->{insertion_mode} eq 'trailing end') {
+         ## Turn into the main phase
+         !!!parse-error (type => 'after html:html');
+         $self->{insertion_mode} = $previous_insertion_mode;
+       }
+ ## ISSUE: "aa<html>" is not a parse error.
+ ## ISSUE: "<html>" in fragment is not a parse error.
+       unless ($token->{first_start_tag}) {
+         !!!parse-error (type => 'not first start tag');
+       }
+       my $top_el = $self->{open_elements}->[0]->[0];
+       for my $attr_name (keys %{$token->{attributes}}) {
+         unless ($top_el->has_attribute_ns (undef, $attr_name)) {
+           $top_el->set_attribute_ns
+             (undef, [undef, $attr_name],
+              $token->{attributes}->{$attr_name}->{value});
+         }
+       }
+       !!!next-token;
+       redo B;
+     } elsif ($token->{type} eq 'comment') {
+       my $comment = $self->{document}->create_comment ($token->{data});
+       if ($self->{insertion_mode} eq 'trailing end') {
+         $self->{document}->append_child ($comment);
+       } elsif ($self->{insertion_mode} eq 'after body') {
+         $self->{open_elements}->[0]->[0]->append_child ($comment);
        } else {
-         if ($self->{insertion_mode} eq 'before head') {
+         $self->{open_elements}->[-1]->[0]->append_child ($comment);
+       }
+       !!!next-token;
+       redo B;
+     } elsif ($self->{insertion_mode} eq 'before head') {
            if ($token->{type} eq 'character') {
              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
                $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
-Line 3090 
 sub _tree_construction_main ($) {
+Line 3424 
 sub _tree_construction_main ($) {
              $self->{insertion_mode} = 'in head';
              ## reprocess
              redo B;
-           } elsif ($token->{type} eq 'comment') {
-             my $comment = $self->{document}->create_comment ($token->{data});
-             $self->{open_elements}->[-1]->[0]->append_child ($comment);
-             !!!next-token;
-             redo B;
            } elsif ($token->{type} eq 'start tag') {
              my $attr = $token->{tag_name} eq 'head' ? $token->{attributes} : {};
              !!!create-element ($self->{head_element}, 'head', $attr);
-Line 3113 
 sub _tree_construction_main ($) {
+Line 3442 
 sub _tree_construction_main ($) {
              }
              redo B;
            } elsif ($token->{type} eq 'end tag') {
-             if ($token->{tag_name} eq 'html') {
+             if ({
+                  head => 1, body => 1, html => 1,
+                  p => 1, br => 1,
+                 }->{$token->{tag_name}}) {
                ## As if <head>
                !!!create-element ($self->{head_element}, 'head');
                $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
-Line 3123 
 sub _tree_construction_main ($) {
+Line 3455 
 sub _tree_construction_main ($) {
                redo B;
              } else {
                !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
-               ## Ignore the token
+               ## Ignore the token ## ISSUE: An issue in the spec.
                !!!next-token;
                redo B;
              }
            } else {
              die "$0: $token->{type}: Unknown type";
            }
-         } elsif ($self->{insertion_mode} eq 'in head') {
+         } elsif ($self->{insertion_mode} eq 'in head' or
+                  $self->{insertion_mode} eq 'in head noscript' or
+                  $self->{insertion_mode} eq 'after head') {
            if ($token->{type} eq 'character') {
              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
                $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
-Line 3141 
 sub _tree_construction_main ($) {
+Line 3475 
 sub _tree_construction_main ($) {
              }
              #
-           } elsif ($token->{type} eq 'comment') {
-             my $comment = $self->{document}->create_comment ($token->{data});
-             $self->{open_elements}->[-1]->[0]->append_child ($comment);
-             !!!next-token;
-             redo B;
            } elsif ($token->{type} eq 'start tag') {
-             if ($token->{tag_name} eq 'title') {
+             if ({base => ($self->{insertion_mode} eq 'in head' or
-               ## NOTE: There is an "as if in head" code clone
+                           $self->{insertion_mode} eq 'after head'),
-               my $title_el;
+                  link => 1}->{$token->{tag_name}}) {
-               !!!create-element ($title_el, 'title', $token->{attributes});
+               ## NOTE: There is an "as if in head" code clone.
-               (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
+               if ($self->{insertion_mode} eq 'after head') {
-                 ->append_child ($title_el);
+                 !!!parse-error (type => 'after head:'.$token->{tag_name});
-               $self->{content_model_flag} = 'RCDATA';
+                 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
-               delete $self->{escape}; # MUST
+               }
+               !!!insert-element ($token->{tag_name}, $token->{attributes});
-               my $text = '';
+               pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
+               pop @{$self->{open_elements}}
+                   if $self->{insertion_mode} eq 'after head';
                !!!next-token;
-               while ($token->{type} eq 'character') {
+               redo B;
-                 $text .= $token->{data};
+             } elsif ($token->{tag_name} eq 'meta') {
-                 !!!next-token;
+               ## NOTE: There is an "as if in head" code clone.
+               if ($self->{insertion_mode} eq 'after head') {
+                 !!!parse-error (type => 'after head:'.$token->{tag_name});
+                 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
                }
-               if (length $text) {
+               !!!insert-element ($token->{tag_name}, $token->{attributes});
-                 $title_el->manakai_append_text ($text);
+               pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
+               unless ($self->{confident}) {
+                 my $charset;
+                 if ($token->{attributes}->{charset}) { ## TODO: And if supported
+                   $charset = $token->{attributes}->{charset}->{value};
+                 }
+                 if ($token->{attributes}->{'http-equiv'}) {
+                   ## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to the definition.
+                   if ($token->{attributes}->{'http-equiv'}->{value}
+                       =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
+                           [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
+                           ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
+                     $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
+                   } ## TODO: And if supported
+                 }
+                 ## TODO: Change the encoding
                }
-               $self->{content_model_flag} = 'PCDATA';
+               ## TODO: Extracting |charset| from |meta|.
+               pop @{$self->{open_elements}}
-               if ($token->{type} eq 'end tag' and
+                   if $self->{insertion_mode} eq 'after head';
-                   $token->{tag_name} eq 'title') {
+               !!!next-token;
+               redo B;
+             } elsif ($token->{tag_name} eq 'title' and
+                      $self->{insertion_mode} eq 'in head') {
+               ## NOTE: There is an "as if in head" code clone.
+               if ($self->{insertion_mode} eq 'after head') {
+                 !!!parse-error (type => 'after head:'.$token->{tag_name});
+                 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
+               }
+               my $parent = defined $self->{head_element} ? $self->{head_element}
+                   : $self->{open_elements}->[-1]->[0];
+               $parse_rcdata->(RCDATA_CONTENT_MODEL,
+                               sub { $parent->append_child ($_[0]) });
+               pop @{$self->{open_elements}}
+                   if $self->{insertion_mode} eq 'after head';
+               redo B;
+             } elsif ($token->{tag_name} eq 'style') {
+               ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
+               ## insertion mode 'in head')
+               ## NOTE: There is an "as if in head" code clone.
+               if ($self->{insertion_mode} eq 'after head') {
+                 !!!parse-error (type => 'after head:'.$token->{tag_name});
+                 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
+               }
+               $parse_rcdata->(CDATA_CONTENT_MODEL, $insert_to_current);
+               pop @{$self->{open_elements}}
+                   if $self->{insertion_mode} eq 'after head';
+               redo B;
+             } elsif ($token->{tag_name} eq 'noscript') {
+               if ($self->{insertion_mode} eq 'in head') {
+                 ## NOTE: and scripting is disabled
+                 !!!insert-element ($token->{tag_name}, $token->{attributes});
+                 $self->{insertion_mode} = 'in head noscript';
+                 !!!next-token;
+                 redo B;
+               } elsif ($self->{insertion_mode} eq 'in head noscript') {
+                 !!!parse-error (type => 'in noscript:noscript');
                  ## Ignore the token
+                 !!!next-token;
+                 redo B;
                } else {
-                 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
+                 #
-                 ## ISSUE: And ignore?
                }
+             } elsif ($token->{tag_name} eq 'head' and
+                      $self->{insertion_mode} ne 'after head') {
+               !!!parse-error (type => 'in head:head'); # or in head noscript
+               ## Ignore the token
                !!!next-token;
                redo B;
-             } elsif ($token->{tag_name} eq 'style') {
+             } elsif ($self->{insertion_mode} ne 'in head noscript' and
-               $style_start_tag->();
+                      $token->{tag_name} eq 'script') {
-               redo B;
+               if ($self->{insertion_mode} eq 'after head') {
-             } elsif ($token->{tag_name} eq 'script') {
+                 !!!parse-error (type => 'after head:'.$token->{tag_name});
-               $script_start_tag->();
+                 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
+               }
+               ## NOTE: There is an "as if in head" code clone.
+               $script_start_tag->($insert_to_current);
+               pop @{$self->{open_elements}}
+                   if $self->{insertion_mode} eq 'after head';
                redo B;
-             } elsif ({base => 1, link => 1, meta => 1}->{$token->{tag_name}}) {
+             } elsif ($self->{insertion_mode} eq 'after head' and
-               ## NOTE: There are "as if in head" code clones
+                      $token->{tag_name} eq 'body') {
-               my $el;
+               !!!insert-element ('body', $token->{attributes});
-               !!!create-element ($el, $token->{tag_name}, $token->{attributes});
+               $self->{insertion_mode} = 'in body';
-               (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
-                 ->append_child ($el);
                !!!next-token;
                redo B;
-             } elsif ($token->{tag_name} eq 'head') {
+             } elsif ($self->{insertion_mode} eq 'after head' and
-               !!!parse-error (type => 'in head:head');
+                      $token->{tag_name} eq 'frameset') {
-               ## Ignore the token
+               !!!insert-element ('frameset', $token->{attributes});
+               $self->{insertion_mode} = 'in frameset';
                !!!next-token;
                redo B;
              } else {
                #
              }
            } elsif ($token->{type} eq 'end tag') {
-             if ($token->{tag_name} eq 'head') {
+             if ($self->{insertion_mode} eq 'in head' and
-               if ($self->{open_elements}->[-1]->[1] eq 'head') {
+                 $token->{tag_name} eq 'head') {
-                 pop @{$self->{open_elements}};
+               pop @{$self->{open_elements}};
-               } else {
-                 !!!parse-error (type => 'unmatched end tag:head');
-               }
                $self->{insertion_mode} = 'after head';
                !!!next-token;
                redo B;
-             } elsif ($token->{tag_name} eq 'html') {
+             } elsif ($self->{insertion_mode} eq 'in head noscript' and
+                 $token->{tag_name} eq 'noscript') {
+               pop @{$self->{open_elements}};
+               $self->{insertion_mode} = 'in head';
+               !!!next-token;
+               redo B;
+             } elsif ($self->{insertion_mode} eq 'in head' and
+                      {
+                       body => 1, html => 1,
+                       p => 1, br => 1,
+                      }->{$token->{tag_name}}) {
                #
-             } else {
+             } elsif ($self->{insertion_mode} eq 'in head noscript' and
+                      {
+                       p => 1, br => 1,
+                      }->{$token->{tag_name}}) {
+               #
+             } elsif ($self->{insertion_mode} ne 'after head') {
                !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
                ## Ignore the token
                !!!next-token;
                redo B;
+             } else {
+               #
              }
            } else {
              #
            }
-           if ($self->{open_elements}->[-1]->[1] eq 'head') {
+           ## As if </head> or </noscript> or <body>
-             ## As if </head>
+           if ($self->{insertion_mode} eq 'in head') {
+             pop @{$self->{open_elements}};
+             $self->{insertion_mode} = 'after head';
+           } elsif ($self->{insertion_mode} eq 'in head noscript') {
              pop @{$self->{open_elements}};
+             !!!parse-error (type => 'in noscript:'.(defined $token->{tag_name} ? ($token->{type} eq 'end tag' ? '/' : '') . $token->{tag_name} : '#' . $token->{type}));
+             $self->{insertion_mode} = 'in head';
+           } else { # 'after head'
+             !!!insert-element ('body');
+             $self->{insertion_mode} = 'in body';
            }
-           $self->{insertion_mode} = 'after head';
            ## reprocess
            redo B;
            ## ISSUE: An issue in the spec.
-         } elsif ($self->{insertion_mode} eq 'after head') {
+         } elsif ($self->{insertion_mode} eq 'in body' or
+                  $self->{insertion_mode} eq 'in caption') {
            if ($token->{type} eq 'character') {
-             if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
+             ## NOTE: There is a code clone of "character in body".
-               $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
+             $reconstruct_active_formatting_elements->($insert_to_current);
-               unless (length $token->{data}) {
-                 !!!next-token;
-                 redo B;
-               }
-             }
-             #
+             $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
-           } elsif ($token->{type} eq 'comment') {
-             my $comment = $self->{document}->create_comment ($token->{data});
-             $self->{open_elements}->[-1]->[0]->append_child ($comment);
              !!!next-token;
              redo B;
            } elsif ($token->{type} eq 'start tag') {
-             if ($token->{tag_name} eq 'body') {
+             if ({
-               !!!insert-element ('body', $token->{attributes});
+                  caption => 1, col => 1, colgroup => 1, tbody => 1,
-               $self->{insertion_mode} = 'in body';
+                  td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
-               !!!next-token;
+                 }->{$token->{tag_name}} and
+                 $self->{insertion_mode} eq 'in caption') {
+               !!!parse-error (type => 'not closed:caption');
+               ## As if </caption>
+               ## have a table element in table scope
+               my $i;
+               INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
+                 my $node = $self->{open_elements}->[$_];
+                 if ($node->[1] eq 'caption') {
+                   $i = $_;
+                   last INSCOPE;
+                 } elsif ({
+                           table => 1, html => 1,
+                          }->{$node->[1]}) {
+                   last INSCOPE;
+                 }
+               } # INSCOPE
+               unless (defined $i) {
+                 !!!parse-error (type => 'unmatched end tag:caption');
+                 ## Ignore the token
+                 !!!next-token;
+                 redo B;
+               }
+               ## generate implied end tags
+               if ({
+                    dd => 1, dt => 1, li => 1, p => 1,
+                    td => 1, th => 1, tr => 1,
+                    tbody => 1, tfoot=> 1, thead => 1,
+                   }->{$self->{open_elements}->[-1]->[1]}) {
+                 !!!back-token; # <?>
+                 $token = {type => 'end tag', tag_name => 'caption'};
+                 !!!back-token;
+                 $token = {type => 'end tag',
+                           tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
+                 redo B;
+               }
+               if ($self->{open_elements}->[-1]->[1] ne 'caption') {
+                 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
+               }
+               splice @{$self->{open_elements}}, $i;
+               $clear_up_to_marker->();
+               $self->{insertion_mode} = 'in table';
+               ## reprocess
                redo B;
-             } elsif ($token->{tag_name} eq 'frameset') {
+             } else {
-               !!!insert-element ('frameset', $token->{attributes});
+               #
-               $self->{insertion_mode} = 'in frameset';
+             }
+           } elsif ($token->{type} eq 'end tag') {
+             if ($token->{tag_name} eq 'caption' and
+                 $self->{insertion_mode} eq 'in caption') {
+               ## have a table element in table scope
+               my $i;
+               INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
+                 my $node = $self->{open_elements}->[$_];
+                 if ($node->[1] eq $token->{tag_name}) {
+                   $i = $_;
+                   last INSCOPE;
+                 } elsif ({
+                           table => 1, html => 1,
+                          }->{$node->[1]}) {
+                   last INSCOPE;
+                 }
+               } # INSCOPE
+               unless (defined $i) {
+                 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
+                 ## Ignore the token
+                 !!!next-token;
+                 redo B;
+               }
+               ## generate implied end tags
+               if ({
+                    dd => 1, dt => 1, li => 1, p => 1,
+                    td => 1, th => 1, tr => 1,
+                    tbody => 1, tfoot=> 1, thead => 1,
+                   }->{$self->{open_elements}->[-1]->[1]}) {
+                 !!!back-token;
+                 $token = {type => 'end tag',
+                           tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
+                 redo B;
+               }
+               if ($self->{open_elements}->[-1]->[1] ne 'caption') {
+                 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
+               }
+               splice @{$self->{open_elements}}, $i;
+               $clear_up_to_marker->();
+               $self->{insertion_mode} = 'in table';
                !!!next-token;
                redo B;
-             } elsif ({
+             } elsif ($token->{tag_name} eq 'table' and
-                       base => 1, link => 1, meta => 1,
+                      $self->{insertion_mode} eq 'in caption') {
-                       script => 1, style => 1, title => 1,
+               !!!parse-error (type => 'not closed:caption');
-                      }->{$token->{tag_name}}) {
-               !!!parse-error (type => 'after head:'.$token->{tag_name});
+               ## As if </caption>
-               $self->{insertion_mode} = 'in head';
+               ## have a table element in table scope
+               my $i;
+               INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
+                 my $node = $self->{open_elements}->[$_];
+                 if ($node->[1] eq 'caption') {
+                   $i = $_;
+                   last INSCOPE;
+                 } elsif ({
+                           table => 1, html => 1,
+                          }->{$node->[1]}) {
+                   last INSCOPE;
+                 }
+               } # INSCOPE
+               unless (defined $i) {
+                 !!!parse-error (type => 'unmatched end tag:caption');
+                 ## Ignore the token
+                 !!!next-token;
+                 redo B;
+               }
+               ## generate implied end tags
+               if ({
+                    dd => 1, dt => 1, li => 1, p => 1,
+                    td => 1, th => 1, tr => 1,
+                    tbody => 1, tfoot=> 1, thead => 1,
+                   }->{$self->{open_elements}->[-1]->[1]}) {
+                 !!!back-token; # </table>
+                 $token = {type => 'end tag', tag_name => 'caption'};
+                 !!!back-token;
+                 $token = {type => 'end tag',
+                           tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
+                 redo B;
+               }
+               if ($self->{open_elements}->[-1]->[1] ne 'caption') {
+                 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
+               }
+               splice @{$self->{open_elements}}, $i;
+               $clear_up_to_marker->();
+               $self->{insertion_mode} = 'in table';
                ## reprocess
                redo B;
+             } elsif ({
+                       body => 1, col => 1, colgroup => 1,
+                       html => 1, tbody => 1, td => 1, tfoot => 1,
+                       th => 1, thead => 1, tr => 1,
+                      }->{$token->{tag_name}} and
+                      $self->{insertion_mode} eq 'in caption') {
+               !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
+               ## Ignore the token
+               !!!next-token;
+               redo B;
              } else {
                #
              }
            } else {
              #
            }
-           ## As if <body>
-           !!!insert-element ('body');
-           $self->{insertion_mode} = 'in body';
-           ## reprocess
-           redo B;
-         } elsif ($self->{insertion_mode} eq 'in body') {
-           if ($token->{type} eq 'character') {
-             ## NOTE: There is a code clone of "character in body".
-             $reconstruct_active_formatting_elements->($insert_to_current);
-             $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
-             !!!next-token;
+           $in_body->($insert_to_current);
-             redo B;
+           redo B;
-           } elsif ($token->{type} eq 'comment') {
-             ## NOTE: There is a code clone of "comment in body".
-             my $comment = $self->{document}->create_comment ($token->{data});
-             $self->{open_elements}->[-1]->[0]->append_child ($comment);
-             !!!next-token;
-             redo B;
-           } else {
-             $in_body->($insert_to_current);
-             redo B;
-           }
          } elsif ($self->{insertion_mode} eq 'in table') {
            if ($token->{type} eq 'character') {
              ## NOTE: There are "character in table" code clones.
-Line 3356 
 sub _tree_construction_main ($) {
+Line 3890 
 sub _tree_construction_main ($) {
              !!!next-token;
              redo B;
-           } elsif ($token->{type} eq 'comment') {
-             my $comment = $self->{document}->create_comment ($token->{data});
-             $self->{open_elements}->[-1]->[0]->append_child ($comment);
-             !!!next-token;
-             redo B;
            } elsif ($token->{type} eq 'start tag') {
              if ({
                   caption => 1,
-Line 3432 
 sub _tree_construction_main ($) {
+Line 3961 
 sub _tree_construction_main ($) {
                if ({
                     dd => 1, dt => 1, li => 1, p => 1,
                     td => 1, th => 1, tr => 1,
+                    tbody => 1, tfoot=> 1, thead => 1,
                    }->{$self->{open_elements}->[-1]->[1]}) {
                  !!!back-token; # <table>
                  $token = {type => 'end tag', tag_name => 'table'};
-Line 3480 
 sub _tree_construction_main ($) {
+Line 4010 
 sub _tree_construction_main ($) {
                if ({
                     dd => 1, dt => 1, li => 1, p => 1,
                     td => 1, th => 1, tr => 1,
+                    tbody => 1, tfoot=> 1, thead => 1,
                    }->{$self->{open_elements}->[-1]->[1]}) {
                  !!!back-token;
                  $token = {type => 'end tag',
-Line 3516 
 sub _tree_construction_main ($) {
+Line 4047 
 sub _tree_construction_main ($) {
            !!!parse-error (type => 'in table:'.$token->{tag_name});
            $in_body->($insert_to_foster);
            redo B;
-         } elsif ($self->{insertion_mode} eq 'in caption') {
-           if ($token->{type} eq 'character') {
-             ## NOTE: This is a code clone of "character in body".
-             $reconstruct_active_formatting_elements->($insert_to_current);
-             $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
-             !!!next-token;
-             redo B;
-           } elsif ($token->{type} eq 'comment') {
-             ## NOTE: This is a code clone of "comment in body".
-             my $comment = $self->{document}->create_comment ($token->{data});
-             $self->{open_elements}->[-1]->[0]->append_child ($comment);
-             !!!next-token;
-             redo B;
-           } elsif ($token->{type} eq 'start tag') {
-             if ({
-                  caption => 1, col => 1, colgroup => 1, tbody => 1,
-                  td => 1, tfoot => 1, th => 1, thead => 1, tr => 1,
-                 }->{$token->{tag_name}}) {
-               !!!parse-error (type => 'not closed:caption');
-               ## As if </caption>
-               ## have a table element in table scope
-               my $i;
-               INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
-                 my $node = $self->{open_elements}->[$_];
-                 if ($node->[1] eq 'caption') {
-                   $i = $_;
-                   last INSCOPE;
-                 } elsif ({
-                           table => 1, html => 1,
-                          }->{$node->[1]}) {
-                   last INSCOPE;
-                 }
-               } # INSCOPE
-               unless (defined $i) {
-                 !!!parse-error (type => 'unmatched end tag:caption');
-                 ## Ignore the token
-                 !!!next-token;
-                 redo B;
-               }
-               ## generate implied end tags
-               if ({
-                    dd => 1, dt => 1, li => 1, p => 1,
-                    td => 1, th => 1, tr => 1,
-                   }->{$self->{open_elements}->[-1]->[1]}) {
-                 !!!back-token; # <?>
-                 $token = {type => 'end tag', tag_name => 'caption'};
-                 !!!back-token;
-                 $token = {type => 'end tag',
-                           tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
-                 redo B;
-               }
-               if ($self->{open_elements}->[-1]->[1] ne 'caption') {
-                 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
-               }
-               splice @{$self->{open_elements}}, $i;
-               $clear_up_to_marker->();
-               $self->{insertion_mode} = 'in table';
-               ## reprocess
-               redo B;
-             } else {
-               #
-             }
-           } elsif ($token->{type} eq 'end tag') {
-             if ($token->{tag_name} eq 'caption') {
-               ## have a table element in table scope
-               my $i;
-               INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
-                 my $node = $self->{open_elements}->[$_];
-                 if ($node->[1] eq $token->{tag_name}) {
-                   $i = $_;
-                   last INSCOPE;
-                 } elsif ({
-                           table => 1, html => 1,
-                          }->{$node->[1]}) {
-                   last INSCOPE;
-                 }
-               } # INSCOPE
-               unless (defined $i) {
-                 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
-                 ## Ignore the token
-                 !!!next-token;
-                 redo B;
-               }
-               ## generate implied end tags
-               if ({
-                    dd => 1, dt => 1, li => 1, p => 1,
-                    td => 1, th => 1, tr => 1,
-                   }->{$self->{open_elements}->[-1]->[1]}) {
-                 !!!back-token;
-                 $token = {type => 'end tag',
-                           tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
-                 redo B;
-               }
-               if ($self->{open_elements}->[-1]->[1] ne 'caption') {
-                 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
-               }
-               splice @{$self->{open_elements}}, $i;
-               $clear_up_to_marker->();
-               $self->{insertion_mode} = 'in table';
-               !!!next-token;
-               redo B;
-             } elsif ($token->{tag_name} eq 'table') {
-               !!!parse-error (type => 'not closed:caption');
-               ## As if </caption>
-               ## have a table element in table scope
-               my $i;
-               INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
-                 my $node = $self->{open_elements}->[$_];
-                 if ($node->[1] eq 'caption') {
-                   $i = $_;
-                   last INSCOPE;
-                 } elsif ({
-                           table => 1, html => 1,
-                          }->{$node->[1]}) {
-                   last INSCOPE;
-                 }
-               } # INSCOPE
-               unless (defined $i) {
-                 !!!parse-error (type => 'unmatched end tag:caption');
-                 ## Ignore the token
-                 !!!next-token;
-                 redo B;
-               }
-               ## generate implied end tags
-               if ({
-                    dd => 1, dt => 1, li => 1, p => 1,
-                    td => 1, th => 1, tr => 1,
-                   }->{$self->{open_elements}->[-1]->[1]}) {
-                 !!!back-token; # </table>
-                 $token = {type => 'end tag', tag_name => 'caption'};
-                 !!!back-token;
-                 $token = {type => 'end tag',
-                           tag_name => $self->{open_elements}->[-1]->[1]}; # MUST
-                 redo B;
-               }
-               if ($self->{open_elements}->[-1]->[1] ne 'caption') {
-                 !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
-               }
-               splice @{$self->{open_elements}}, $i;
-               $clear_up_to_marker->();
-               $self->{insertion_mode} = 'in table';
-               ## reprocess
-               redo B;
-             } elsif ({
-                       body => 1, col => 1, colgroup => 1,
-                       html => 1, tbody => 1, td => 1, tfoot => 1,
-                       th => 1, thead => 1, tr => 1,
-                      }->{$token->{tag_name}}) {
-               !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
-               ## Ignore the token
-               redo B;
-             } else {
-               #
-             }
-           } else {
-             #
-           }
-           $in_body->($insert_to_current);
-           redo B;
          } elsif ($self->{insertion_mode} eq 'in column group') {
            if ($token->{type} eq 'character') {
              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
-Line 3709 
 sub _tree_construction_main ($) {
+Line 4058 
 sub _tree_construction_main ($) {
              }
              #
-           } elsif ($token->{type} eq 'comment') {
-             my $comment = $self->{document}->create_comment ($token->{data});
-             $self->{open_elements}->[-1]->[0]->append_child ($comment);
-             !!!next-token;
-             redo B;
            } elsif ($token->{type} eq 'start tag') {
              if ($token->{tag_name} eq 'col') {
                !!!insert-element ($token->{tag_name}, $token->{attributes});
-Line 3819 
 sub _tree_construction_main ($) {
+Line 4163 
 sub _tree_construction_main ($) {
              !!!next-token;
              redo B;
-           } elsif ($token->{type} eq 'comment') {
-             ## Copied from 'in table'
-             my $comment = $self->{document}->create_comment ($token->{data});
-             $self->{open_elements}->[-1]->[0]->append_child ($comment);
-             !!!next-token;
-             redo B;
            } elsif ($token->{type} eq 'start tag') {
              if ({
                   tr => 1,
-Line 3925 
 sub _tree_construction_main ($) {
+Line 4263 
 sub _tree_construction_main ($) {
                if ({
                     dd => 1, dt => 1, li => 1, p => 1,
                     td => 1, th => 1, tr => 1,
+                    tbody => 1, tfoot=> 1, thead => 1,
                    }->{$self->{open_elements}->[-1]->[1]}) {
                  !!!back-token; # <table>
                  $token = {type => 'end tag', tag_name => 'table'};
-Line 4103 
 sub _tree_construction_main ($) {
+Line 4442 
 sub _tree_construction_main ($) {
              !!!next-token;
              redo B;
-           } elsif ($token->{type} eq 'comment') {
-             ## Copied from 'in table'
-             my $comment = $self->{document}->create_comment ($token->{data});
-             $self->{open_elements}->[-1]->[0]->append_child ($comment);
-             !!!next-token;
-             redo B;
            } elsif ($token->{type} eq 'start tag') {
              if ($token->{tag_name} eq 'th' or
                  $token->{tag_name} eq 'td') {
-Line 4193 
 sub _tree_construction_main ($) {
+Line 4526 
 sub _tree_construction_main ($) {
                if ({
                     dd => 1, dt => 1, li => 1, p => 1,
                     td => 1, th => 1, tr => 1,
+                    tbody => 1, tfoot=> 1, thead => 1,
                    }->{$self->{open_elements}->[-1]->[1]}) {
                  !!!back-token; # <table>
                  $token = {type => 'end tag', tag_name => 'table'};
-Line 4367 
 sub _tree_construction_main ($) {
+Line 4701 
 sub _tree_construction_main ($) {
              !!!next-token;
              redo B;
-           } elsif ($token->{type} eq 'comment') {
-             ## NOTE: This is a code clone of "comment in body".
-             my $comment = $self->{document}->create_comment ($token->{data});
-             $self->{open_elements}->[-1]->[0]->append_child ($comment);
-             !!!next-token;
-             redo B;
            } elsif ($token->{type} eq 'start tag') {
              if ({
                   caption => 1, col => 1, colgroup => 1,
-Line 4434 
 sub _tree_construction_main ($) {
+Line 4762 
 sub _tree_construction_main ($) {
                     td => ($token->{tag_name} eq 'th'),
                     th => ($token->{tag_name} eq 'td'),
                     tr => 1,
+                    tbody => 1, tfoot=> 1, thead => 1,
                    }->{$self->{open_elements}->[-1]->[1]}) {
                  !!!back-token;
                  $token = {type => 'end tag',
-Line 4508 
 sub _tree_construction_main ($) {
+Line 4837 
 sub _tree_construction_main ($) {
              $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
              !!!next-token;
              redo B;
-           } elsif ($token->{type} eq 'comment') {
-             my $comment = $self->{document}->create_comment ($token->{data});
-             $self->{open_elements}->[-1]->[0]->append_child ($comment);
-             !!!next-token;
-             redo B;
            } elsif ($token->{type} eq 'start tag') {
              if ($token->{tag_name} eq 'option') {
                if ($self->{open_elements}->[-1]->[1] eq 'option') {
-Line 4685 
 sub _tree_construction_main ($) {
+Line 5009 
 sub _tree_construction_main ($) {
          } elsif ($self->{insertion_mode} eq 'after body') {
            if ($token->{type} eq 'character') {
              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
+               my $data = $1;
                ## As if in body
                $reconstruct_active_formatting_elements->($insert_to_current);
-               $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
+               $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
                unless (length $token->{data}) {
                  !!!next-token;
-Line 4697 
 sub _tree_construction_main ($) {
+Line 5022 
 sub _tree_construction_main ($) {
              }
              #
-             !!!parse-error (type => 'after body:#'.$token->{type});
+             !!!parse-error (type => 'after body:#character');
-           } elsif ($token->{type} eq 'comment') {
-             my $comment = $self->{document}->create_comment ($token->{data});
-             $self->{open_elements}->[0]->[0]->append_child ($comment);
-             !!!next-token;
-             redo B;
            } elsif ($token->{type} eq 'start tag') {
              !!!parse-error (type => 'after body:'.$token->{tag_name});
              #
-Line 4714 
 sub _tree_construction_main ($) {
+Line 5034 
 sub _tree_construction_main ($) {
                  !!!next-token;
                  redo B;
                } else {
-                 $phase = 'trailing end';
+                 $previous_insertion_mode = $self->{insertion_mode};
+                 $self->{insertion_mode} = 'trailing end';
                  !!!next-token;
                  redo B;
                }
-Line 4722 
 sub _tree_construction_main ($) {
+Line 5043 
 sub _tree_construction_main ($) {
                !!!parse-error (type => 'after body:/'.$token->{tag_name});
              }
            } else {
-             !!!parse-error (type => 'after body:#'.$token->{type});
+             die "$0: $token->{type}: Unknown token type";
            }
            $self->{insertion_mode} = 'in body';
            ## reprocess
            redo B;
-         } elsif ($self->{insertion_mode} eq 'in frameset') {
+     } elsif ($self->{insertion_mode} eq 'in frameset') {
-           if ($token->{type} eq 'character') {
+       if ($token->{type} eq 'character') {
-             if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
+         if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
-               $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
+           $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
-               unless (length $token->{data}) {
-                 !!!next-token;
-                 redo B;
-               }
-             }
-             #
+           unless (length $token->{data}) {
-           } elsif ($token->{type} eq 'comment') {
-             my $comment = $self->{document}->create_comment ($token->{data});
-             $self->{open_elements}->[-1]->[0]->append_child ($comment);
              !!!next-token;
              redo B;
-           } elsif ($token->{type} eq 'start tag') {
-             if ($token->{tag_name} eq 'frameset') {
-               !!!insert-element ($token->{tag_name}, $token->{attributes});
-               !!!next-token;
-               redo B;
-             } elsif ($token->{tag_name} eq 'frame') {
-               !!!insert-element ($token->{tag_name}, $token->{attributes});
-               pop @{$self->{open_elements}};
-               !!!next-token;
-               redo B;
-             } elsif ($token->{tag_name} eq 'noframes') {
-               $in_body->($insert_to_current);
-               redo B;
-             } else {
-               #
-             }
-           } elsif ($token->{type} eq 'end tag') {
-             if ($token->{tag_name} eq 'frameset') {
-               if ($self->{open_elements}->[-1]->[1] eq 'html' and
-                   @{$self->{open_elements}} == 1) {
-                 !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
-                 ## Ignore the token
-                 !!!next-token;
-               } else {
-                 pop @{$self->{open_elements}};
-                 !!!next-token;
-               }
-               ## if not inner_html and
-               if ($self->{open_elements}->[-1]->[1] ne 'frameset') {
-                 $self->{insertion_mode} = 'after frameset';
-               }
-               redo B;
-             } else {
-               #
-             }
-           } else {
-             #
            }
+         }
-           if (defined $token->{tag_name}) {
-             !!!parse-error (type => 'in frameset:'.$token->{tag_name});
+         !!!parse-error (type => 'in frameset:#character');
+         ## Ignore the token
+         !!!next-token;
+         redo B;
+       } elsif ($token->{type} eq 'start tag') {
+         if ($token->{tag_name} eq 'frameset') {
+           !!!insert-element ($token->{tag_name}, $token->{attributes});
+           !!!next-token;
+           redo B;
+         } elsif ($token->{tag_name} eq 'frame') {
+           !!!insert-element ($token->{tag_name}, $token->{attributes});
+           pop @{$self->{open_elements}};
+           !!!next-token;
+           redo B;
+         } elsif ($token->{tag_name} eq 'noframes') {
+           $in_body->($insert_to_current);
+           redo B;
+         } else {
+           !!!parse-error (type => 'in frameset:'.$token->{tag_name});
+           ## Ignore the token
+           !!!next-token;
+           redo B;
+         }
+       } elsif ($token->{type} eq 'end tag') {
+         if ($token->{tag_name} eq 'frameset') {
+           if ($self->{open_elements}->[-1]->[1] eq 'html' and
+               @{$self->{open_elements}} == 1) {
+             !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
+             ## Ignore the token
+             !!!next-token;
            } else {
-             !!!parse-error (type => 'in frameset:#'.$token->{type});
+             pop @{$self->{open_elements}};
+             !!!next-token;
+           }
+           if (not defined $self->{inner_html_node} and
+               $self->{open_elements}->[-1]->[1] ne 'frameset') {
+             $self->{insertion_mode} = 'after frameset';
            }
+           redo B;
+         } else {
+           !!!parse-error (type => 'in frameset:/'.$token->{tag_name});
            ## Ignore the token
            !!!next-token;
            redo B;
-         } elsif ($self->{insertion_mode} eq 'after frameset') {
+         }
-           if ($token->{type} eq 'character') {
+       } else {
+         die "$0: $token->{type}: Unknown token type";
+       }
+     } elsif ($self->{insertion_mode} eq 'after frameset') {
+       if ($token->{type} eq 'character') {
              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
-               $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
+               $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
                unless (length $token->{data}) {
                  !!!next-token;
-Line 4804 
 sub _tree_construction_main ($) {
+Line 5120 
 sub _tree_construction_main ($) {
                }
              }
-             #
+             if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {
-           } elsif ($token->{type} eq 'comment') {
+               !!!parse-error (type => 'after frameset:#character');
-             my $comment = $self->{document}->create_comment ($token->{data});
-             $self->{open_elements}->[-1]->[0]->append_child ($comment);
+               ## Ignore the token.
-             !!!next-token;
+               if (length $token->{data}) {
-             redo B;
+                 ## reprocess the rest of characters
-           } elsif ($token->{type} eq 'start tag') {
+               } else {
-             if ($token->{tag_name} eq 'noframes') {
+                 !!!next-token;
-               $in_body->($insert_to_current);
+               }
-               redo B;
-             } else {
-               #
-             }
-           } elsif ($token->{type} eq 'end tag') {
-             if ($token->{tag_name} eq 'html') {
-               $phase = 'trailing end';
-               !!!next-token;
                redo B;
-             } else {
-               #
              }
-           } else {
-             #
+         die qq[$0: Character "$token->{data}"];
-           }
+       } elsif ($token->{type} eq 'start tag') {
+         if ($token->{tag_name} eq 'noframes') {
-           if (defined $token->{tag_name}) {
+           $in_body->($insert_to_current);
-             !!!parse-error (type => 'after frameset:'.$token->{tag_name});
+           redo B;
-           } else {
+         } else {
-             !!!parse-error (type => 'after frameset:#'.$token->{type});
+           !!!parse-error (type => 'after frameset:'.$token->{tag_name});
-           }
            ## Ignore the token
            !!!next-token;
            redo B;
+         }
-           ## ISSUE: An issue in spec there
+       } elsif ($token->{type} eq 'end tag') {
+         if ($token->{tag_name} eq 'html') {
+           $previous_insertion_mode = $self->{insertion_mode};
+           $self->{insertion_mode} = 'trailing end';
+           !!!next-token;
+           redo B;
          } else {
-           die "$0: $self->{insertion_mode}: Unknown insertion mode";
+           !!!parse-error (type => 'after frameset:/'.$token->{tag_name});
+           ## Ignore the token
+           !!!next-token;
+           redo B;
          }
+       } else {
+         die "$0: $token->{type}: Unknown token type";
        }
-     } elsif ($phase eq 'trailing end') {
+       ## ISSUE: An issue in spec here
+     } elsif ($self->{insertion_mode} eq 'trailing end') {
        ## states in the main stage is preserved yet # MUST
-       if ($token->{type} eq 'DOCTYPE') {
+       if ($token->{type} eq 'character') {
-         !!!parse-error (type => 'after html:#DOCTYPE');
-         ## Ignore the token
-         !!!next-token;
-         redo B;
-       } elsif ($token->{type} eq 'comment') {
-         my $comment = $self->{document}->create_comment ($token->{data});
-         $self->{document}->append_child ($comment);
-         !!!next-token;
-         redo B;
-       } elsif ($token->{type} eq 'character') {
          if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
            my $data = $1;
            ## As if in the main phase.
            ## NOTE: The insertion mode in the main phase
            ## just before the phase has been changed to the trailing
            ## end phase is either "after body" or "after frameset".
-           $reconstruct_active_formatting_elements->($insert_to_current)
+           $reconstruct_active_formatting_elements->($insert_to_current);
-             if $phase eq 'main';
            $self->{open_elements}->[-1]->[0]->manakai_append_text ($data);
-Line 4875 
 sub _tree_construction_main ($) {
+Line 5181 
 sub _tree_construction_main ($) {
          }
          !!!parse-error (type => 'after html:#character');
-         $phase = 'main';
+         $self->{insertion_mode} = $previous_insertion_mode;
          ## reprocess
          redo B;
-       } elsif ($token->{type} eq 'start tag' or
+       } elsif ($token->{type} eq 'start tag') {
-                $token->{type} eq 'end tag') {
          !!!parse-error (type => 'after html:'.$token->{tag_name});
-         $phase = 'main';
+         $self->{insertion_mode} = $previous_insertion_mode;
+         ## reprocess
+         redo B;
+       } elsif ($token->{type} eq 'end tag') {
+         !!!parse-error (type => 'after html:/'.$token->{tag_name});
+         $self->{insertion_mode} = $previous_insertion_mode;
          ## reprocess
          redo B;
-       } elsif ($token->{type} eq 'end-of-file') {
-         ## Stop parsing
-         last B;
        } else {
          die "$0: $token->{type}: Unknown token";
        }
+     } else {
+       die "$0: $self->{insertion_mode}: Unknown insertion mode";
      }
    } # B
-Line 4928 
 sub set_inner_html ($$$) {
+Line 5237 
 sub set_inner_html ($$$) {
      ## Step 1 # MUST
      my $this_doc = $node->owner_document;
      my $doc = $this_doc->implementation->create_document;
-     ## TODO: Mark as HTML document
+     $doc->manakai_is_html (1);
      my $p = $class->new;
      $p->{document} = $doc;
-Line 4977 
 sub set_inner_html ($$$) {
+Line 5286 
 sub set_inner_html ($$$) {
      ## Step 2
      my $node_ln = $node->local_name;
-     $p->{content_model_flag} = {
+     $p->{content_model} = {
-       title => 'RCDATA',
+       title => RCDATA_CONTENT_MODEL,
-       textarea => 'RCDATA',
+       textarea => RCDATA_CONTENT_MODEL,
-       style => 'CDATA',
+       style => CDATA_CONTENT_MODEL,
-       script => 'CDATA',
+       script => CDATA_CONTENT_MODEL,
-       xmp => 'CDATA',
+       xmp => CDATA_CONTENT_MODEL,
-       iframe => 'CDATA',
+       iframe => CDATA_CONTENT_MODEL,
-       noembed => 'CDATA',
+       noembed => CDATA_CONTENT_MODEL,
-       noframes => 'CDATA',
+       noframes => CDATA_CONTENT_MODEL,
-       noscript => 'CDATA',
+       noscript => CDATA_CONTENT_MODEL,
-       plaintext => 'PLAINTEXT',
+       plaintext => PLAINTEXT_CONTENT_MODEL,
-     }->{$node_ln} || 'PCDATA';
+     }->{$node_ln};
-        ## ISSUE: What is "the name of the element"? local name?
+     $p->{content_model} = PCDATA_CONTENT_MODEL
+         unless defined $p->{content_model};
+         ## ISSUE: What is "the name of the element"? local name?
      $p->{inner_html_node} = [$node, $node_ln];
-Line 5089 
 sub get_inner_html ($$$) {
+Line 5400 
 sub get_inner_html ($$$) {
      my $nt = $child->node_type;
      if ($nt == 1) { # Element
-       my $tag_name = lc $child->tag_name; ## ISSUE: Definition of "lowercase"
+       my $tag_name = $child->tag_name; ## TODO: manakai_tag_name
        $s .= '<' . $tag_name;
+       ## NOTE: Non-HTML case:
-       ## ISSUE: Non-html elements
+       ## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191>
        my @attrs = @{$child->attributes}; # sort order MUST be stable
        for my $attr (@attrs) { # order is implementation dependent
-         my $attr_name = lc $attr->name; ## ISSUE: Definition of "lowercase"
+         my $attr_name = $attr->name; ## TODO: manakai_name
          $s .= ' ' . $attr_name . '="';
          my $attr_value = $attr->value;
          ## escape
-Line 5115 
 sub get_inner_html ($$$) {
+Line 5426 
 sub get_inner_html ($$$) {
          spacer => 1, wbr => 1,
        }->{$tag_name};
+       $s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea';
        if (not $in_cdata and {
          style => 1, script => 1, xmp => 1, iframe => 1,
          noembed => 1, noframes => 1, noscript => 1,
+         plaintext => 1,
        }->{$tag_name}) {
          unshift @node, 'cdata-out';
          $in_cdata = 1;

 Legend:
-Removed from v.1.15
 changed lines
+Added in v.1.42

admin@suikawiki.org	ViewVC Help
Powered by ViewVC 1.1.24