/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch Patch

-revision 1.13 by wakaba,
Sat Jun 23 05:29:48 2007 UTC
+revision 1.25 by wakaba,
Sun Jun 24 05:12:11 2007 UTC
 Line 2 
 package Whatpm::HTML;
  use strict;
  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
- ## This is an early version of an HTML parser.
+ ## ISSUE:
+ ## var doc = implementation.createDocument (null, null, null);
+ ## doc.write ('');
+ ## alert (doc.compatMode);
  my $permitted_slash_tag_name = {
    base => 1,
-Line 18 
 my $permitted_slash_tag_name = {
+Line 21 
 my $permitted_slash_tag_name = {
    input => 1,
  };
- my $entity_char = {
-   AElig => "\x{00C6}",
-   Aacute => "\x{00C1}",
-   Acirc => "\x{00C2}",
-   Agrave => "\x{00C0}",
-   Alpha => "\x{0391}",
-   Aring => "\x{00C5}",
-   Atilde => "\x{00C3}",
-   Auml => "\x{00C4}",
-   Beta => "\x{0392}",
-   Ccedil => "\x{00C7}",
-   Chi => "\x{03A7}",
-   Dagger => "\x{2021}",
-   Delta => "\x{0394}",
-   ETH => "\x{00D0}",
-   Eacute => "\x{00C9}",
-   Ecirc => "\x{00CA}",
-   Egrave => "\x{00C8}",
-   Epsilon => "\x{0395}",
-   Eta => "\x{0397}",
-   Euml => "\x{00CB}",
-   Gamma => "\x{0393}",
-   Iacute => "\x{00CD}",
-   Icirc => "\x{00CE}",
-   Igrave => "\x{00CC}",
-   Iota => "\x{0399}",
-   Iuml => "\x{00CF}",
-   Kappa => "\x{039A}",
-   Lambda => "\x{039B}",
-   Mu => "\x{039C}",
-   Ntilde => "\x{00D1}",
-   Nu => "\x{039D}",
-   OElig => "\x{0152}",
-   Oacute => "\x{00D3}",
-   Ocirc => "\x{00D4}",
-   Ograve => "\x{00D2}",
-   Omega => "\x{03A9}",
-   Omicron => "\x{039F}",
-   Oslash => "\x{00D8}",
-   Otilde => "\x{00D5}",
-   Ouml => "\x{00D6}",
-   Phi => "\x{03A6}",
-   Pi => "\x{03A0}",
-   Prime => "\x{2033}",
-   Psi => "\x{03A8}",
-   Rho => "\x{03A1}",
-   Scaron => "\x{0160}",
-   Sigma => "\x{03A3}",
-   THORN => "\x{00DE}",
-   Tau => "\x{03A4}",
-   Theta => "\x{0398}",
-   Uacute => "\x{00DA}",
-   Ucirc => "\x{00DB}",
-   Ugrave => "\x{00D9}",
-   Upsilon => "\x{03A5}",
-   Uuml => "\x{00DC}",
-   Xi => "\x{039E}",
-   Yacute => "\x{00DD}",
-   Yuml => "\x{0178}",
-   Zeta => "\x{0396}",
-   aacute => "\x{00E1}",
-   acirc => "\x{00E2}",
-   acute => "\x{00B4}",
-   aelig => "\x{00E6}",
-   agrave => "\x{00E0}",
-   alefsym => "\x{2135}",
-   alpha => "\x{03B1}",
-   amp => "\x{0026}",
-   AMP => "\x{0026}",
-   and => "\x{2227}",
-   ang => "\x{2220}",
-   apos => "\x{0027}",
-   aring => "\x{00E5}",
-   asymp => "\x{2248}",
-   atilde => "\x{00E3}",
-   auml => "\x{00E4}",
-   bdquo => "\x{201E}",
-   beta => "\x{03B2}",
-   brvbar => "\x{00A6}",
-   bull => "\x{2022}",
-   cap => "\x{2229}",
-   ccedil => "\x{00E7}",
-   cedil => "\x{00B8}",
-   cent => "\x{00A2}",
-   chi => "\x{03C7}",
-   circ => "\x{02C6}",
-   clubs => "\x{2663}",
-   cong => "\x{2245}",
-   copy => "\x{00A9}",
-   COPY => "\x{00A9}",
-   crarr => "\x{21B5}",
-   cup => "\x{222A}",
-   curren => "\x{00A4}",
-   dArr => "\x{21D3}",
-   dagger => "\x{2020}",
-   darr => "\x{2193}",
-   deg => "\x{00B0}",
-   delta => "\x{03B4}",
-   diams => "\x{2666}",
-   divide => "\x{00F7}",
-   eacute => "\x{00E9}",
-   ecirc => "\x{00EA}",
-   egrave => "\x{00E8}",
-   empty => "\x{2205}",
-   emsp => "\x{2003}",
-   ensp => "\x{2002}",
-   epsilon => "\x{03B5}",
-   equiv => "\x{2261}",
-   eta => "\x{03B7}",
-   eth => "\x{00F0}",
-   euml => "\x{00EB}",
-   euro => "\x{20AC}",
-   exist => "\x{2203}",
-   fnof => "\x{0192}",
-   forall => "\x{2200}",
-   frac12 => "\x{00BD}",
-   frac14 => "\x{00BC}",
-   frac34 => "\x{00BE}",
-   frasl => "\x{2044}",
-   gamma => "\x{03B3}",
-   ge => "\x{2265}",
-   gt => "\x{003E}",
-   GT => "\x{003E}",
-   hArr => "\x{21D4}",
-   harr => "\x{2194}",
-   hearts => "\x{2665}",
-   hellip => "\x{2026}",
-   iacute => "\x{00ED}",
-   icirc => "\x{00EE}",
-   iexcl => "\x{00A1}",
-   igrave => "\x{00EC}",
-   image => "\x{2111}",
-   infin => "\x{221E}",
-   int => "\x{222B}",
-   iota => "\x{03B9}",
-   iquest => "\x{00BF}",
-   isin => "\x{2208}",
-   iuml => "\x{00EF}",
-   kappa => "\x{03BA}",
-   lArr => "\x{21D0}",
-   lambda => "\x{03BB}",
-   lang => "\x{2329}",
-   laquo => "\x{00AB}",
-   larr => "\x{2190}",
-   lceil => "\x{2308}",
-   ldquo => "\x{201C}",
-   le => "\x{2264}",
-   lfloor => "\x{230A}",
-   lowast => "\x{2217}",
-   loz => "\x{25CA}",
-   lrm => "\x{200E}",
-   lsaquo => "\x{2039}",
-   lsquo => "\x{2018}",
-   lt => "\x{003C}",
-   LT => "\x{003C}",
-   macr => "\x{00AF}",
-   mdash => "\x{2014}",
-   micro => "\x{00B5}",
-   middot => "\x{00B7}",
-   minus => "\x{2212}",
-   mu => "\x{03BC}",
-   nabla => "\x{2207}",
-   nbsp => "\x{00A0}",
-   ndash => "\x{2013}",
-   ne => "\x{2260}",
-   ni => "\x{220B}",
-   not => "\x{00AC}",
-   notin => "\x{2209}",
-   nsub => "\x{2284}",
-   ntilde => "\x{00F1}",
-   nu => "\x{03BD}",
-   oacute => "\x{00F3}",
-   ocirc => "\x{00F4}",
-   oelig => "\x{0153}",
-   ograve => "\x{00F2}",
-   oline => "\x{203E}",
-   omega => "\x{03C9}",
-   omicron => "\x{03BF}",
-   oplus => "\x{2295}",
-   or => "\x{2228}",
-   ordf => "\x{00AA}",
-   ordm => "\x{00BA}",
-   oslash => "\x{00F8}",
-   otilde => "\x{00F5}",
-   otimes => "\x{2297}",
-   ouml => "\x{00F6}",
-   para => "\x{00B6}",
-   part => "\x{2202}",
-   permil => "\x{2030}",
-   perp => "\x{22A5}",
-   phi => "\x{03C6}",
-   pi => "\x{03C0}",
-   piv => "\x{03D6}",
-   plusmn => "\x{00B1}",
-   pound => "\x{00A3}",
-   prime => "\x{2032}",
-   prod => "\x{220F}",
-   prop => "\x{221D}",
-   psi => "\x{03C8}",
-   quot => "\x{0022}",
-   QUOT => "\x{0022}",
-   rArr => "\x{21D2}",
-   radic => "\x{221A}",
-   rang => "\x{232A}",
-   raquo => "\x{00BB}",
-   rarr => "\x{2192}",
-   rceil => "\x{2309}",
-   rdquo => "\x{201D}",
-   real => "\x{211C}",
-   reg => "\x{00AE}",
-   REG => "\x{00AE}",
-   rfloor => "\x{230B}",
-   rho => "\x{03C1}",
-   rlm => "\x{200F}",
-   rsaquo => "\x{203A}",
-   rsquo => "\x{2019}",
-   sbquo => "\x{201A}",
-   scaron => "\x{0161}",
-   sdot => "\x{22C5}",
-   sect => "\x{00A7}",
-   shy => "\x{00AD}",
-   sigma => "\x{03C3}",
-   sigmaf => "\x{03C2}",
-   sim => "\x{223C}",
-   spades => "\x{2660}",
-   sub => "\x{2282}",
-   sube => "\x{2286}",
-   sum => "\x{2211}",
-   sup => "\x{2283}",
-   sup1 => "\x{00B9}",
-   sup2 => "\x{00B2}",
-   sup3 => "\x{00B3}",
-   supe => "\x{2287}",
-   szlig => "\x{00DF}",
-   tau => "\x{03C4}",
-   there4 => "\x{2234}",
-   theta => "\x{03B8}",
-   thetasym => "\x{03D1}",
-   thinsp => "\x{2009}",
-   thorn => "\x{00FE}",
-   tilde => "\x{02DC}",
-   times => "\x{00D7}",
-   trade => "\x{2122}",
-   uArr => "\x{21D1}",
-   uacute => "\x{00FA}",
-   uarr => "\x{2191}",
-   ucirc => "\x{00FB}",
-   ugrave => "\x{00F9}",
-   uml => "\x{00A8}",
-   upsih => "\x{03D2}",
-   upsilon => "\x{03C5}",
-   uuml => "\x{00FC}",
-   weierp => "\x{2118}",
-   xi => "\x{03BE}",
-   yacute => "\x{00FD}",
-   yen => "\x{00A5}",
-   yuml => "\x{00FF}",
-   zeta => "\x{03B6}",
-   zwj => "\x{200D}",
-   zwnj => "\x{200C}",
- }; # $entity_char
  my $c1_entity_char = {
    0x80 => 0x20AC,
    0x81 => 0xFFFD,
-Line 361 
 sub parse_string ($$$;$) {
+Line 102 
 sub parse_string ($$$;$) {
        $line++;
        $column = 0;
      } elsif ($self->{next_input_character} == 0x000D) { # CR
-       if ($i >= length $$s) {
+       $i++ if substr ($$s, $i, 1) eq "\x0A";
-         #
-       } else {
-         my $next_char = ord substr $$s, $i++, 1;
-         if ($next_char == 0x000A) { # LF
-           #
-         } else {
-           push @{$self->{char}}, $next_char;
-         }
-       }
        $self->{next_input_character} = 0x000A; # LF # MUST
        $line++;
        $column = 0;
-Line 426 
 sub _initialize_tokenizer ($) {
+Line 158 
 sub _initialize_tokenizer ($) {
    # $self->{next_input_character}
    !!!next-input-character;
    $self->{token} = [];
+   # $self->{escape}
  } # _initialize_tokenizer
  ## A token has:
  ##   ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
  ##       'character', or 'end-of-file'
- ##   ->{name} (DOCTYPE, start tag (tagname), end tag (tagname))
+ ##   ->{name} (DOCTYPE, start tag (tag name), end tag (tag name))
-     ## ISSUE: the spec need s/tagname/tag name/
+ ##   ->{public_identifier} (DOCTYPE)
- ##   ->{error} == 1 or 0 (DOCTYPE)
+ ##   ->{system_identifier} (DOCTYPE)
+ ##   ->{correct} == 1 or 0 (DOCTYPE)
  ##   ->{attributes} isa HASH (start tag, end tag)
  ##   ->{data} (comment, character)
- ## Macros
- ##   Macros MUST be preceded by three EXCLAMATION MARKs.
- ##   emit ($token)
- ##     Emits the specified token.
  ## Emitted token MUST immediately be handled by the tree construction state.
  ## Before each step, UA MAY check to see if either one of the scripts in
-Line 597 
 sub _get_next_token ($) {
+Line 326 
 sub _get_next_token ($) {
      } elsif ($self->{state} eq 'close tag open') {
        if ($self->{content_model_flag} eq 'RCDATA' or
            $self->{content_model_flag} eq 'CDATA') {
-         my @next_char;
+         if (defined $self->{last_emitted_start_tag_name}) {
-         TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
+           my @next_char;
+           TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
+             push @next_char, $self->{next_input_character};
+             my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
+             my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
+             if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
+               !!!next-input-character;
+               next TAGNAME;
+             } else {
+               $self->{next_input_character} = shift @next_char; # reconsume
+               !!!back-next-input-character (@next_char);
+               $self->{state} = 'data';
+               !!!emit ({type => 'character', data => '</'});
+               redo A;
+             }
+           }
            push @next_char, $self->{next_input_character};
-           my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
-           my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
+           unless ($self->{next_input_character} == 0x0009 or # HT
-           if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
+                   $self->{next_input_character} == 0x000A or # LF
-             !!!next-input-character;
+                   $self->{next_input_character} == 0x000B or # VT
-             next TAGNAME;
+                   $self->{next_input_character} == 0x000C or # FF
-           } else {
+                   $self->{next_input_character} == 0x0020 or # SP
-             !!!parse-error (type => 'unmatched end tag');
+                   $self->{next_input_character} == 0x003E or # >
+                   $self->{next_input_character} == 0x002F or # /
+                   $self->{next_input_character} == -1) {
              $self->{next_input_character} = shift @next_char; # reconsume
              !!!back-next-input-character (@next_char);
              $self->{state} = 'data';
              !!!emit ({type => 'character', data => '</'});
              redo A;
+           } else {
+             $self->{next_input_character} = shift @next_char;
+             !!!back-next-input-character (@next_char);
+             # and consume...
            }
-         }
+         } else {
-         push @next_char, $self->{next_input_character};
+           ## No start tag token has ever been emitted
+           # next-input-character is already done
-         unless ($self->{next_input_character} == 0x0009 or # HT
-                 $self->{next_input_character} == 0x000A or # LF
-                 $self->{next_input_character} == 0x000B or # VT
-                 $self->{next_input_character} == 0x000C or # FF
-                 $self->{next_input_character} == 0x0020 or # SP
-                 $self->{next_input_character} == 0x003E or # >
-                 $self->{next_input_character} == 0x002F or # /
-                 $self->{next_input_character} == 0x003C or # <
-                 $self->{next_input_character} == -1) {
-           !!!parse-error (type => 'unmatched end tag');
-           $self->{next_input_character} = shift @next_char; # reconsume
-           !!!back-next-input-character (@next_char);
            $self->{state} = 'data';
            !!!emit ({type => 'character', data => '</'});
            redo A;
-         } else {
-           $self->{next_input_character} = shift @next_char;
-           !!!back-next-input-character (@next_char);
-           # and consume...
          }
        }
-Line 699 
 sub _get_next_token ($) {
+Line 431 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } elsif (0x0041 <= $self->{next_input_character} and
-Line 709 
 sub _get_next_token ($) {
+Line 440 
 sub _get_next_token ($) {
          ## Stay in this state
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
-Line 726 
 sub _get_next_token ($) {
+Line 456 
 sub _get_next_token ($) {
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } elsif ($self->{next_input_character} == 0x002F) { # /
-Line 773 
 sub _get_next_token ($) {
+Line 502 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } elsif (0x0041 <= $self->{next_input_character} and
-Line 796 
 sub _get_next_token ($) {
+Line 524 
 sub _get_next_token ($) {
          ## Stay in the state
          # next-input-character is already done
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
-Line 813 
 sub _get_next_token ($) {
+Line 540 
 sub _get_next_token ($) {
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 865 
 sub _get_next_token ($) {
+Line 591 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } elsif (0x0041 <= $self->{next_input_character} and
-Line 888 
 sub _get_next_token ($) {
+Line 613 
 sub _get_next_token ($) {
          $self->{state} = 'before attribute name';
          # next-input-character is already done
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          $before_leave->();
          if ($self->{current_token}->{type} eq 'start tag') {
-Line 906 
 sub _get_next_token ($) {
+Line 630 
 sub _get_next_token ($) {
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 943 
 sub _get_next_token ($) {
+Line 666 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } elsif (0x0041 <= $self->{next_input_character} and
-Line 966 
 sub _get_next_token ($) {
+Line 688 
 sub _get_next_token ($) {
          $self->{state} = 'before attribute name';
          # next-input-character is already done
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
-Line 983 
 sub _get_next_token ($) {
+Line 704 
 sub _get_next_token ($) {
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1029 
 sub _get_next_token ($) {
+Line 749 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
-Line 1049 
 sub _get_next_token ($) {
+Line 767 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1084 
 sub _get_next_token ($) {
+Line 801 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1119 
 sub _get_next_token ($) {
+Line 835 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1157 
 sub _get_next_token ($) {
+Line 872 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
-Line 1177 
 sub _get_next_token ($) {
+Line 890 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1236 
 sub _get_next_token ($) {
+Line 948 
 sub _get_next_token ($) {
          push @next_char, $self->{next_input_character};
          if ($self->{next_input_character} == 0x002D) { # -
            $self->{current_token} = {type => 'comment', data => ''};
-           $self->{state} = 'comment';
+           $self->{state} = 'comment start';
            !!!next-input-character;
            redo A;
          }
-Line 1286 
 sub _get_next_token ($) {
+Line 998 
 sub _get_next_token ($) {
        ## ISSUE: typos in spec: chacacters, is is a parse error
        ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
+     } elsif ($self->{state} eq 'comment start') {
+       if ($self->{next_input_character} == 0x002D) { # -
+         $self->{state} = 'comment start dash';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         !!!parse-error (type => 'bogus comment');
+         $self->{state} = 'data';
+         !!!next-input-character;
+         !!!emit ($self->{current_token}); # comment
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed comment');
+         $self->{state} = 'data';
+         ## reconsume
+         !!!emit ($self->{current_token}); # comment
+         redo A;
+       } else {
+         $self->{current_token}->{data} # comment
+             .= chr ($self->{next_input_character});
+         $self->{state} = 'comment';
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'comment start dash') {
+       if ($self->{next_input_character} == 0x002D) { # -
+         $self->{state} = 'comment end';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         !!!parse-error (type => 'bogus comment');
+         $self->{state} = 'data';
+         !!!next-input-character;
+         !!!emit ($self->{current_token}); # comment
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed comment');
+         $self->{state} = 'data';
+         ## reconsume
+         !!!emit ($self->{current_token}); # comment
+         redo A;
+       } else {
+         $self->{current_token}->{data} # comment
+             .= chr ($self->{next_input_character});
+         $self->{state} = 'comment';
+         !!!next-input-character;
+         redo A;
+       }
      } elsif ($self->{state} eq 'comment') {
        if ($self->{next_input_character} == 0x002D) { # -
-         $self->{state} = 'comment dash';
+         $self->{state} = 'comment end dash';
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 1297 
 sub _get_next_token ($) {
+Line 1065 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # comment
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1306 
 sub _get_next_token ($) {
+Line 1073 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'comment dash') {
+     } elsif ($self->{state} eq 'comment end dash') {
        if ($self->{next_input_character} == 0x002D) { # -
          $self->{state} = 'comment end';
          !!!next-input-character;
-Line 1317 
 sub _get_next_token ($) {
+Line 1084 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # comment
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1332 
 sub _get_next_token ($) {
+Line 1098 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # comment
-         undef $self->{current_token};
          redo A;
        } elsif ($self->{next_input_character} == 0x002D) { # -
-Line 1347 
 sub _get_next_token ($) {
+Line 1112 
 sub _get_next_token ($) {
          ## reconsume
          !!!emit ($self->{current_token}); # comment
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1381 
 sub _get_next_token ($) {
+Line 1145 
 sub _get_next_token ($) {
          ## Stay in the state
          !!!next-input-character;
          redo A;
-       } elsif (0x0061 <= $self->{next_input_character} and
-                $self->{next_input_character} <= 0x007A) { # a..z
- ## ISSUE: "Set the token's name name to the" in the spec
-         $self->{current_token} = {type => 'DOCTYPE',
-                           name => chr ($self->{next_input_character} - 0x0020),
-                           error => 1};
-         $self->{state} = 'DOCTYPE name';
-         !!!next-input-character;
-         redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          !!!parse-error (type => 'no DOCTYPE name');
          $self->{state} = 'data';
          !!!next-input-character;
-         !!!emit ({type => 'DOCTYPE', name => '', error => 1});
+         !!!emit ({type => 'DOCTYPE'}); # incorrect
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 1403 
 sub _get_next_token ($) {
+Line 1158 
 sub _get_next_token ($) {
          $self->{state} = 'data';
          ## reconsume
-         !!!emit ({type => 'DOCTYPE', name => '', error => 1});
+         !!!emit ({type => 'DOCTYPE'}); # incorrect
          redo A;
        } else {
-         $self->{current_token} = {type => 'DOCTYPE',
+         $self->{current_token}
-                           name => chr ($self->{next_input_character}),
+             = {type => 'DOCTYPE',
-                           error => 1};
+                name => chr ($self->{next_input_character}),
+                correct => 1};
  ## ISSUE: "Set the token's name name to the" in the spec
          $self->{state} = 'DOCTYPE name';
          !!!next-input-character;
          redo A;
        }
      } elsif ($self->{state} eq 'DOCTYPE name') {
+ ## ISSUE: Redundant "First," in the spec.
        if ($self->{next_input_character} == 0x0009 or # HT
            $self->{next_input_character} == 0x000A or # LF
            $self->{next_input_character} == 0x000B or # VT
            $self->{next_input_character} == 0x000C or # FF
            $self->{next_input_character} == 0x0020) { # SP
-         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
          $self->{state} = 'after DOCTYPE name';
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
-         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
          $self->{state} = 'data';
          !!!next-input-character;
          !!!emit ($self->{current_token}); # DOCTYPE
-         undef $self->{current_token};
          redo A;
-       } elsif (0x0061 <= $self->{next_input_character} and
-                $self->{next_input_character} <= 0x007A) { # a..z
-         $self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE
-         #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
-         ## Stay in the state
-         !!!next-input-character;
-         redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
          $self->{state} = 'data';
          ## reconsume
-         !!!emit ($self->{current_token});
+         delete $self->{current_token}->{correct};
-         undef $self->{current_token};
+         !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
          $self->{current_token}->{name}
            .= chr ($self->{next_input_character}); # DOCTYPE
-         #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
          ## Stay in the state
          !!!next-input-character;
          redo A;
-Line 1473 
 sub _get_next_token ($) {
+Line 1218 
 sub _get_next_token ($) {
          !!!next-input-character;
          !!!emit ($self->{current_token}); # DOCTYPE
-         undef $self->{current_token};
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 1481 
 sub _get_next_token ($) {
+Line 1225 
 sub _get_next_token ($) {
          $self->{state} = 'data';
          ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0050 or # P
+                $self->{next_input_character} == 0x0070) { # p
+         !!!next-input-character;
+         if ($self->{next_input_character} == 0x0055 or # U
+             $self->{next_input_character} == 0x0075) { # u
+           !!!next-input-character;
+           if ($self->{next_input_character} == 0x0042 or # B
+               $self->{next_input_character} == 0x0062) { # b
+             !!!next-input-character;
+             if ($self->{next_input_character} == 0x004C or # L
+                 $self->{next_input_character} == 0x006C) { # l
+               !!!next-input-character;
+               if ($self->{next_input_character} == 0x0049 or # I
+                   $self->{next_input_character} == 0x0069) { # i
+                 !!!next-input-character;
+                 if ($self->{next_input_character} == 0x0043 or # C
+                     $self->{next_input_character} == 0x0063) { # c
+                   $self->{state} = 'before DOCTYPE public identifier';
+                   !!!next-input-character;
+                   redo A;
+                 }
+               }
+             }
+           }
+         }
+         #
+       } elsif ($self->{next_input_character} == 0x0053 or # S
+                $self->{next_input_character} == 0x0073) { # s
+         !!!next-input-character;
+         if ($self->{next_input_character} == 0x0059 or # Y
+             $self->{next_input_character} == 0x0079) { # y
+           !!!next-input-character;
+           if ($self->{next_input_character} == 0x0053 or # S
+               $self->{next_input_character} == 0x0073) { # s
+             !!!next-input-character;
+             if ($self->{next_input_character} == 0x0054 or # T
+                 $self->{next_input_character} == 0x0074) { # t
+               !!!next-input-character;
+               if ($self->{next_input_character} == 0x0045 or # E
+                   $self->{next_input_character} == 0x0065) { # e
+                 !!!next-input-character;
+                 if ($self->{next_input_character} == 0x004D or # M
+                     $self->{next_input_character} == 0x006D) { # m
+                   $self->{state} = 'before DOCTYPE system identifier';
+                   !!!next-input-character;
+                   redo A;
+                 }
+               }
+             }
+           }
+         }
+         #
+       } else {
+         !!!next-input-character;
+         #
+       }
+       !!!parse-error (type => 'string after DOCTYPE name');
+       $self->{state} = 'bogus DOCTYPE';
+       # next-input-character is already done
+       redo A;
+     } elsif ($self->{state} eq 'before DOCTYPE public identifier') {
+       if ({
+             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
+             #0x000D => 1, # HT, LF, VT, FF, SP, CR
+           }->{$self->{next_input_character}}) {
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} eq 0x0022) { # "
+         $self->{current_token}->{public_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE public identifier (double-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} eq 0x0027) { # '
+         $self->{current_token}->{public_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE public identifier (single-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} eq 0x003E) { # >
+         !!!parse-error (type => 'no PUBLIC literal');
+         $self->{state} = 'data';
+         !!!next-input-character;
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed DOCTYPE');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         !!!parse-error (type => 'string after PUBLIC');
+         $self->{state} = 'bogus DOCTYPE';
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'DOCTYPE public identifier (double-quoted)') {
+       if ($self->{next_input_character} == 0x0022) { # "
+         $self->{state} = 'after DOCTYPE public identifier';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed PUBLIC literal');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         $self->{current_token}->{public_identifier} # DOCTYPE
+             .= chr $self->{next_input_character};
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'DOCTYPE public identifier (single-quoted)') {
+       if ($self->{next_input_character} == 0x0027) { # '
+         $self->{state} = 'after DOCTYPE public identifier';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed PUBLIC literal');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         $self->{current_token}->{public_identifier} # DOCTYPE
+             .= chr $self->{next_input_character};
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'after DOCTYPE public identifier') {
+       if ({
+             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
+             #0x000D => 1, # HT, LF, VT, FF, SP, CR
+           }->{$self->{next_input_character}}) {
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0022) { # "
+         $self->{current_token}->{system_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE system identifier (double-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0027) { # '
+         $self->{current_token}->{system_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE system identifier (single-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         $self->{state} = 'data';
+         !!!next-input-character;
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed DOCTYPE');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         !!!parse-error (type => 'string after PUBLIC literal');
+         $self->{state} = 'bogus DOCTYPE';
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'before DOCTYPE system identifier') {
+       if ({
+             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
+             #0x000D => 1, # HT, LF, VT, FF, SP, CR
+           }->{$self->{next_input_character}}) {
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0022) { # "
+         $self->{current_token}->{system_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE system identifier (double-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0027) { # '
+         $self->{current_token}->{system_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE system identifier (single-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         !!!parse-error (type => 'no SYSTEM literal');
+         $self->{state} = 'data';
+         !!!next-input-character;
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed DOCTYPE');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         !!!parse-error (type => 'string after PUBLIC literal');
+         $self->{state} = 'bogus DOCTYPE';
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'DOCTYPE system identifier (double-quoted)') {
+       if ($self->{next_input_character} == 0x0022) { # "
+         $self->{state} = 'after DOCTYPE system identifier';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed SYSTEM literal');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         $self->{current_token}->{system_identifier} # DOCTYPE
+             .= chr $self->{next_input_character};
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'DOCTYPE system identifier (single-quoted)') {
+       if ($self->{next_input_character} == 0x0027) { # '
+         $self->{state} = 'after DOCTYPE system identifier';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed SYSTEM literal');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } else {
+         $self->{current_token}->{system_identifier} # DOCTYPE
+             .= chr $self->{next_input_character};
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'after DOCTYPE system identifier') {
+       if ({
+             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
+             #0x000D => 1, # HT, LF, VT, FF, SP, CR
+           }->{$self->{next_input_character}}) {
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         $self->{state} = 'data';
+         !!!next-input-character;
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed DOCTYPE');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
-         undef $self->{current_token};
          redo A;
        } else {
-         !!!parse-error (type => 'string after DOCTYPE name');
+         !!!parse-error (type => 'string after SYSTEM literal');
-         $self->{current_token}->{error} = 1; # DOCTYPE
          $self->{state} = 'bogus DOCTYPE';
          !!!next-input-character;
          redo A;
-Line 1497 
 sub _get_next_token ($) {
+Line 1544 
 sub _get_next_token ($) {
          $self->{state} = 'data';
          !!!next-input-character;
+         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
-         undef $self->{current_token};
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 1506 
 sub _get_next_token ($) {
+Line 1553 
 sub _get_next_token ($) {
          $self->{state} = 'data';
          ## reconsume
+         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
-         undef $self->{current_token};
          redo A;
        } else {
-Line 1525 
 sub _get_next_token ($) {
+Line 1572 
 sub _get_next_token ($) {
  sub _tokenize_attempt_to_consume_an_entity ($) {
    my $self = shift;
-   if ($self->{next_input_character} == 0x0023) { # #
+   if ({
+         0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
+         0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
+       }->{$self->{next_input_character}}) {
+     ## Don't consume
+     ## No error
+     return undef;
+   } elsif ($self->{next_input_character} == 0x0023) { # #
      !!!next-input-character;
      if ($self->{next_input_character} == 0x0078 or # x
          $self->{next_input_character} == 0x0058) { # X
-Line 1621 
 sub _tokenize_attempt_to_consume_an_enti
+Line 1675 
 sub _tokenize_attempt_to_consume_an_enti
      my $value = $entity_name;
      my $match;
+     require Whatpm::_NamedEntityList;
+     our $EntityChar;
      while (length $entity_name < 10 and
             ## NOTE: Some number greater than the maximum length of entity name
-            ((0x0041 <= $self->{next_input_character} and
+            ((0x0041 <= $self->{next_input_character} and # A
-              $self->{next_input_character} <= 0x005A) or
+              $self->{next_input_character} <= 0x005A) or # Z
-             (0x0061 <= $self->{next_input_character} and
+             (0x0061 <= $self->{next_input_character} and # a
-              $self->{next_input_character} <= 0x007A) or
+              $self->{next_input_character} <= 0x007A) or # z
-             (0x0030 <= $self->{next_input_character} and
+             (0x0030 <= $self->{next_input_character} and # 0
-              $self->{next_input_character} <= 0x0039))) {
+              $self->{next_input_character} <= 0x0039) or # 9
+             $self->{next_input_character} == 0x003B)) { # ;
        $entity_name .= chr $self->{next_input_character};
-       if (defined $entity_char->{$entity_name}) {
+       if (defined $EntityChar->{$entity_name}) {
-         $value = $entity_char->{$entity_name};
+         $value = $EntityChar->{$entity_name};
-         $match = 1;
+         if ($self->{next_input_character} == 0x003B) { # ;
+           $match = 1;
+           !!!next-input-character;
+           last;
+         } else {
+           $match = -1;
+         }
        } else {
          $value .= chr $self->{next_input_character};
        }
        !!!next-input-character;
      }
-     if ($match) {
+     if ($match > 0) {
-       if ($self->{next_input_character} == 0x003B) { # ;
+       return {type => 'character', data => $value};
-         !!!next-input-character;
+     } elsif ($match < 0) {
-       } else {
+       !!!parse-error (type => 'refc');
-         !!!parse-error (type => 'refc');
-       }
        return {type => 'character', data => $value};
      } else {
        !!!parse-error (type => 'bare ero');
-Line 1667 
 sub _initialize_tree_constructor ($) {
+Line 1727 
 sub _initialize_tree_constructor ($) {
    $self->{document}->strict_error_checking (0);
    ## TODO: Turn mutation events off # MUST
    ## TODO: Turn loose Document option (manakai extension) on
-   ## TODO: Mark the Document as an HTML document # MUST
+   $self->{document}->manakai_is_html (1); # MUST
  } # _initialize_tree_constructor
  sub _terminate_tree_constructor ($) {
-Line 1707 
 sub _construct_tree ($) {
+Line 1767 
 sub _construct_tree ($) {
  sub _tree_construction_initial ($) {
    my $self = shift;
-   B: {
+   INITIAL: {
-       if ($token->{type} eq 'DOCTYPE') {
+     if ($token->{type} eq 'DOCTYPE') {
-         if ($token->{error}) {
+       ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
-           ## ISSUE: Spec currently left this case undefined.
+       ## error, switch to a conformance checking mode for another
-           !!!parse-error (type => 'bogus DOCTYPE');
+       ## language.
-         }
+       my $doctype_name = $token->{name};
-         my $doctype = $self->{document}->create_document_type_definition
+       $doctype_name = '' unless defined $doctype_name;
-           ($token->{name});
+       $doctype_name =~ tr/a-z/A-Z/;
-         $self->{document}->append_child ($doctype);
+       if (not defined $token->{name} or # <!DOCTYPE>
-         #$phase = 'root element';
+           defined $token->{public_identifier} or
-         !!!next-token;
+           defined $token->{system_identifier}) {
-         #redo B;
+         !!!parse-error (type => 'not HTML5');
-         return;
+       } elsif ($doctype_name ne 'HTML') {
-       } elsif ({
+         ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
-                 comment => 1,
+         !!!parse-error (type => 'not HTML5');
-                 'start tag' => 1,
+       }
-                 'end tag' => 1,
-                 'end-of-file' => 1,
+       my $doctype = $self->{document}->create_document_type_definition
-                }->{$token->{type}}) {
+         ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
-         ## ISSUE: Spec currently left this case undefined.
+       $doctype->public_id ($token->{public_identifier})
-         !!!parse-error (type => 'missing DOCTYPE');
+           if defined $token->{public_identifier};
-         #$phase = 'root element';
+       $doctype->system_id ($token->{system_identifier})
-         ## reprocess
+           if defined $token->{system_identifier};
-         #redo B;
+       ## NOTE: Other DocumentType attributes are null or empty lists.
-         return;
+       ## ISSUE: internalSubset = null??
-       } elsif ($token->{type} eq 'character') {
+       $self->{document}->append_child ($doctype);
-         if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
-           $self->{document}->manakai_append_text ($1);
+       if (not $token->{correct} or $doctype_name ne 'HTML') {
-           ## ISSUE: DOM3 Core does not allow Document > Text
+         $self->{document}->manakai_compat_mode ('quirks');
-           unless (length $token->{data}) {
+       } elsif (defined $token->{public_identifier}) {
-             ## Stay in the phase
+         my $pubid = $token->{public_identifier};
-             !!!next-token;
+         $pubid =~ tr/a-z/A-z/;
-             redo B;
+         if ({
+           "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
+           "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
+           "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
+           "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
+           "-//IETF//DTD HTML 2.0//EN" => 1,
+           "-//IETF//DTD HTML 2.1E//EN" => 1,
+           "-//IETF//DTD HTML 3.0//EN" => 1,
+           "-//IETF//DTD HTML 3.0//EN//" => 1,
+           "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
+           "-//IETF//DTD HTML 3.2//EN" => 1,
+           "-//IETF//DTD HTML 3//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 0//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
+           "-//IETF//DTD HTML LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
+           "-//IETF//DTD HTML LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
+           "-//IETF//DTD HTML LEVEL 3//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
+           "-//IETF//DTD HTML STRICT//EN" => 1,
+           "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
+           "-//IETF//DTD HTML//EN" => 1,
+           "-//IETF//DTD HTML//EN//2.0" => 1,
+           "-//IETF//DTD HTML//EN//3.0" => 1,
+           "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
+           "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
+           "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
+           "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
+           "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
+           "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
+           "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
+           "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
+           "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
+           "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
+           "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
+           "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
+           "-//W3C//DTD HTML 3.2//EN" => 1,
+           "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
+           "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
+           "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
+           "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
+           "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
+           "-//W3C//DTD W3 HTML//EN" => 1,
+           "-//W3O//DTD W3 HTML 3.0//EN" => 1,
+           "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
+           "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
+           "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
+           "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
+           "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
+           "HTML" => 1,
+         }->{$pubid}) {
+           $self->{document}->manakai_compat_mode ('quirks');
+         } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
+                  $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
+           if (defined $token->{system_identifier}) {
+             $self->{document}->manakai_compat_mode ('quirks');
+           } else {
+             $self->{document}->manakai_compat_mode ('limited quirks');
            }
+         } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 Frameset//EN" or
+                  $pubid eq "-//W3C//DTD XHTML 1.0 Transitional//EN") {
+           $self->{document}->manakai_compat_mode ('limited quirks');
+         }
+       }
+       if (defined $token->{system_identifier}) {
+         my $sysid = $token->{system_identifier};
+         $sysid =~ tr/A-Z/a-z/;
+         if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
+           $self->{document}->manakai_compat_mode ('quirks');
+         }
+       }
+       ## Go to the root element phase.
+       !!!next-token;
+       return;
+     } elsif ({
+               'start tag' => 1,
+               'end tag' => 1,
+               'end-of-file' => 1,
+              }->{$token->{type}}) {
+       !!!parse-error (type => 'no DOCTYPE');
+       $self->{document}->manakai_compat_mode ('quirks');
+       ## Go to the root element phase
+       ## reprocess
+       return;
+     } elsif ($token->{type} eq 'character') {
+       if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
+         ## Ignore the token
+         unless (length $token->{data}) {
+           ## Stay in the phase
+           !!!next-token;
+           redo INITIAL;
          }
-         ## ISSUE: Spec currently left this case undefined.
-         !!!parse-error (type => 'missing DOCTYPE');
-         #$phase = 'root element';
-         ## reprocess
-         #redo B;
-         return;
-       } else {
-         die "$0: $token->{type}: Unknown token";
        }
-     } # B
+       !!!parse-error (type => 'no DOCTYPE');
+       $self->{document}->manakai_compat_mode ('quirks');
+       ## Go to the root element phase
+       ## reprocess
+       return;
+     } elsif ($token->{type} eq 'comment') {
+       my $comment = $self->{document}->create_comment ($token->{data});
+       $self->{document}->append_child ($comment);
+       ## Stay in the phase.
+       !!!next-token;
+       redo INITIAL;
+     } else {
+       die "$0: $token->{type}: Unknown token";
+     }
+   } # INITIAL
  } # _tree_construction_initial
  sub _tree_construction_root_element ($) {
-Line 1950 
 sub _tree_construction_main ($) {
+Line 2128 
 sub _tree_construction_main ($) {
      }
    }; # $clear_up_to_marker
-   my $style_start_tag = sub {
+   my $parse_rcdata = sub ($$) {
-     my $style_el; !!!create-element ($style_el, 'style', $token->{attributes});
+     my ($content_model_flag, $insert) = @_;
-     ## $self->{insertion_mode} eq 'in head' and ... (always true)
-     (($self->{insertion_mode} eq 'in head' and defined $self->{head_element})
+     ## Step 1
-      ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
+     my $start_tag_name = $token->{tag_name};
-       ->append_child ($style_el);
+     my $el;
-     $self->{content_model_flag} = 'CDATA';
+     !!!create-element ($el, $start_tag_name, $token->{attributes});
+     ## Step 2
+     $insert->($el); # /context node/->append_child ($el)
+     ## Step 3
+     $self->{content_model_flag} = $content_model_flag; # CDATA or RCDATA
      delete $self->{escape}; # MUST
+     ## Step 4
      my $text = '';
      !!!next-token;
-     while ($token->{type} eq 'character') {
+     while ($token->{type} eq 'character') { # or until stop tokenizing
        $text .= $token->{data};
        !!!next-token;
-     } # stop if non-character token or tokenizer stops tokenising
+     }
+     ## Step 5
      if (length $text) {
-       $style_el->manakai_append_text ($text);
+       my $text = $self->{document}->create_text_node ($text);
+       $el->append_child ($text);
      }
+     ## Step 6
      $self->{content_model_flag} = 'PCDATA';
-     if ($token->{type} eq 'end tag' and $token->{tag_name} eq 'style') {
+     ## Step 7
+     if ($token->{type} eq 'end tag' and $token->{tag_name} eq $start_tag_name) {
        ## Ignore the token
      } else {
-       !!!parse-error (type => 'in CDATA:#'.$token->{type});
+       !!!parse-error (type => 'in '.$content_model_flag.':#'.$token->{type});
-       ## ISSUE: And ignore?
      }
      !!!next-token;
-   }; # $style_start_tag
+   }; # $parse_rcdata
-   my $script_start_tag = sub {
+   my $script_start_tag = sub ($) {
+     my $insert = $_[0];
      my $script_el;
      !!!create-element ($script_el, 'script', $token->{attributes});
      ## TODO: mark as "parser-inserted"
-Line 2014 
 sub _tree_construction_main ($) {
+Line 2204 
 sub _tree_construction_main ($) {
      } else {
        ## TODO: $old_insertion_point = current insertion point
        ## TODO: insertion point = just before the next input character
-       (($self->{insertion_mode} eq 'in head' and defined $self->{head_element})
+       $insert->($script_el);
-        ? $self->{head_element} : $self->{open_elements}->[-1]->[0])->append_child ($script_el);
        ## TODO: insertion point = $old_insertion_point (might be "undefined")
-Line 2210 
 sub _tree_construction_main ($) {
+Line 2399 
 sub _tree_construction_main ($) {
    }; # $formatting_end_tag
    my $insert_to_current = sub {
-     $self->{open_elements}->[-1]->[0]->append_child (shift);
+     $self->{open_elements}->[-1]->[0]->append_child ($_[0]);
    }; # $insert_to_current
    my $insert_to_foster = sub {
-Line 2248 
 sub _tree_construction_main ($) {
+Line 2437 
 sub _tree_construction_main ($) {
      my $insert = shift;
      if ($token->{type} eq 'start tag') {
        if ($token->{tag_name} eq 'script') {
-         $script_start_tag->();
+         ## NOTE: This is an "as if in head" code clone
+         $script_start_tag->($insert);
          return;
        } elsif ($token->{tag_name} eq 'style') {
-         $style_start_tag->();
+         ## NOTE: This is an "as if in head" code clone
+         $parse_rcdata->('CDATA', $insert);
          return;
        } elsif ({
                  base => 1, link => 1, meta => 1,
                 }->{$token->{tag_name}}) {
-         !!!parse-error (type => 'in body:'.$token->{tag_name});
+         ## NOTE: This is an "as if in head" code clone, only "-t" differs
-         ## NOTE: This is an "as if in head" code clone
+         !!!insert-element-t ($token->{tag_name}, $token->{attributes});
-         my $el;
+         pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
-         !!!create-element ($el, $token->{tag_name}, $token->{attributes});
-         if (defined $self->{head_element}) {
-           $self->{head_element}->append_child ($el);
-         } else {
-           $insert->($el);
-         }
          !!!next-token;
          return;
        } elsif ($token->{tag_name} eq 'title') {
          !!!parse-error (type => 'in body:title');
-         ## NOTE: There is an "as if in head" code clone
+         ## NOTE: This is an "as if in head" code clone
-         my $title_el;
+         $parse_rcdata->('RCDATA', $insert);
-         !!!create-element ($title_el, 'title', $token->{attributes});
-         (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
-           ->append_child ($title_el);
-         $self->{content_model_flag} = 'RCDATA';
-         delete $self->{escape}; # MUST
-         my $text = '';
-         !!!next-token;
-         while ($token->{type} eq 'character') {
-           $text .= $token->{data};
-           !!!next-token;
-         }
-         if (length $text) {
-           $title_el->manakai_append_text ($text);
-         }
-         $self->{content_model_flag} = 'PCDATA';
-         if ($token->{type} eq 'end tag' and
-             $token->{tag_name} eq 'title') {
-           ## Ignore the token
-         } else {
-           !!!parse-error (type => 'in RCDATA:#'.$token->{type});
-           ## ISSUE: And ignore?
-         }
-         !!!next-token;
          return;
        } elsif ($token->{tag_name} eq 'body') {
          !!!parse-error (type => 'in body:body');
-Line 2511 
 sub _tree_construction_main ($) {
+Line 2669 
 sub _tree_construction_main ($) {
            }
          } # INSCOPE
+         ## NOTE: See <http://html5.org/tools/web-apps-tracker?from=925&to=926>
          ## has an element in scope
-         my $i;
+         #my $i;
-         INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
+         #INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
-           my $node = $self->{open_elements}->[$_];
+         #  my $node = $self->{open_elements}->[$_];
-           if ({
+         #  if ({
-                h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
+         #       h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
-               }->{$node->[1]}) {
+         #      }->{$node->[1]}) {
-             $i = $_;
+         #    $i = $_;
-             last INSCOPE;
+         #    last INSCOPE;
-           } elsif ({
+         #  } elsif ({
-                     table => 1, caption => 1, td => 1, th => 1,
+         #            table => 1, caption => 1, td => 1, th => 1,
-                     button => 1, marquee => 1, object => 1, html => 1,
+         #            button => 1, marquee => 1, object => 1, html => 1,
-                    }->{$node->[1]}) {
+         #           }->{$node->[1]}) {
-             last INSCOPE;
+         #    last INSCOPE;
-           }
+         #  }
-         } # INSCOPE
+         #} # INSCOPE
+         #
-         if (defined $i) {
+         #if (defined $i) {
-           !!!parse-error (type => 'in hn:hn');
+         #  !!! parse-error (type => 'in hn:hn');
-           splice @{$self->{open_elements}}, $i;
+         #  splice @{$self->{open_elements}}, $i;
-         }
+         #}
          !!!insert-element-t ($token->{tag_name}, $token->{attributes});
-Line 2574 
 sub _tree_construction_main ($) {
+Line 2733 
 sub _tree_construction_main ($) {
          return;
        } elsif ({
                  b => 1, big => 1, em => 1, font => 1, i => 1,
-                 nobr => 1, s => 1, small => 1, strile => 1,
+                 s => 1, small => 1, strile => 1,
                  strong => 1, tt => 1, u => 1,
                 }->{$token->{tag_name}}) {
          $reconstruct_active_formatting_elements->($insert_to_current);
-Line 2584 
 sub _tree_construction_main ($) {
+Line 2743 
 sub _tree_construction_main ($) {
          !!!next-token;
          return;
+       } elsif ($token->{tag_name} eq 'nobr') {
+         $reconstruct_active_formatting_elements->($insert_to_current);
+         ## has a |nobr| element in scope
+         INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
+           my $node = $self->{open_elements}->[$_];
+           if ($node->[1] eq 'nobr') {
+             !!!back-token;
+             $token = {type => 'end tag', tag_name => 'nobr'};
+             return;
+           } elsif ({
+                     table => 1, caption => 1, td => 1, th => 1,
+                     button => 1, marquee => 1, object => 1, html => 1,
+                    }->{$node->[1]}) {
+             last INSCOPE;
+           }
+         } # INSCOPE
+         !!!insert-element-t ($token->{tag_name}, $token->{attributes});
+         push @$active_formatting_elements, $self->{open_elements}->[-1];
+         !!!next-token;
+         return;
        } elsif ($token->{tag_name} eq 'button') {
          ## has a button element in scope
          INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
-Line 2619 
 sub _tree_construction_main ($) {
+Line 2801 
 sub _tree_construction_main ($) {
          return;
        } elsif ($token->{tag_name} eq 'xmp') {
          $reconstruct_active_formatting_elements->($insert_to_current);
+         $parse_rcdata->('CDATA', $insert);
-         !!!insert-element-t ($token->{tag_name}, $token->{attributes});
-         $self->{content_model_flag} = 'CDATA';
-         delete $self->{escape}; # MUST
-         !!!next-token;
          return;
        } elsif ($token->{tag_name} eq 'table') {
          ## has a p element in scope
-Line 2703 
 sub _tree_construction_main ($) {
+Line 2879 
 sub _tree_construction_main ($) {
            return;
          } else {
            my $at = $token->{attributes};
+           my $form_attrs;
+           $form_attrs->{action} = $at->{action} if $at->{action};
+           my $prompt_attr = $at->{prompt};
            $at->{name} = {name => 'name', value => 'isindex'};
+           delete $at->{action};
+           delete $at->{prompt};
            my @tokens = (
-                         {type => 'start tag', tag_name => 'form'},
+                         {type => 'start tag', tag_name => 'form',
+                          attributes => $form_attrs},
                          {type => 'start tag', tag_name => 'hr'},
                          {type => 'start tag', tag_name => 'p'},
                          {type => 'start tag', tag_name => 'label'},
-                         {type => 'character',
+                        );
-                          data => 'This is a searchable index. Insert your search keywords here: '}, # SHOULD
+           if ($prompt_attr) {
-                         ## TODO: make this configurable
+             push @tokens, {type => 'character', data => $prompt_attr->{value}};
+           } else {
+             push @tokens, {type => 'character',
+                            data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
+             ## TODO: make this configurable
+           }
+           push @tokens,
                          {type => 'start tag', tag_name => 'input', attributes => $at},
                          #{type => 'character', data => ''}, # SHOULD
                          {type => 'end tag', tag_name => 'label'},
                          {type => 'end tag', tag_name => 'p'},
                          {type => 'start tag', tag_name => 'hr'},
-                         {type => 'end tag', tag_name => 'form'},
+                         {type => 'end tag', tag_name => 'form'};
-                        );
            $token = shift @tokens;
            !!!back-token (@tokens);
            return;
          }
-       } elsif ({
+       } elsif ($token->{tag_name} eq 'textarea') {
-                 textarea => 1,
-                 iframe => 1,
-                 noembed => 1,
-                 noframes => 1,
-                 noscript => 0, ## TODO: 1 if scripting is enabled
-                }->{$token->{tag_name}}) {
          my $tag_name = $token->{tag_name};
          my $el;
          !!!create-element ($el, $token->{tag_name}, $token->{attributes});
-         if ($token->{tag_name} eq 'textarea') {
+         ## TODO: $self->{form_element} if defined
-           ## TODO: $self->{form_element} if defined
+         $self->{content_model_flag} = 'RCDATA';
-           $self->{content_model_flag} = 'RCDATA';
-         } else {
-           $self->{content_model_flag} = 'CDATA';
-         }
          delete $self->{escape}; # MUST
          $insert->($el);
          my $text = '';
-         if ($token->{tag_name} eq 'textarea') {
+         !!!next-token;
-           !!!next-token;
+         if ($token->{type} eq 'character') {
-           if ($token->{type} eq 'character') {
+           $token->{data} =~ s/^\x0A//;
-             $token->{data} =~ s/^\x0A//;
+           unless (length $token->{data}) {
-             unless (length $token->{data}) {
+             !!!next-token;
-               !!!next-token;
-             }
            }
-         } else {
-           !!!next-token;
          }
          while ($token->{type} eq 'character') {
            $text .= $token->{data};
-Line 2770 
 sub _tree_construction_main ($) {
+Line 2943 
 sub _tree_construction_main ($) {
              $token->{tag_name} eq $tag_name) {
            ## Ignore the token
          } else {
-           if ($token->{tag_name} eq 'textarea') {
+           !!!parse-error (type => 'in RCDATA:#'.$token->{type});
-             !!!parse-error (type => 'in RCDATA:#'.$token->{type});
-           } else {
-             !!!parse-error (type => 'in CDATA:#'.$token->{type});
-           }
-           ## ISSUE: And ignore?
          }
          !!!next-token;
          return;
+       } elsif ({
+                 iframe => 1,
+                 noembed => 1,
+                 noframes => 1,
+                 noscript => 0, ## TODO: 1 if scripting is enabled
+                }->{$token->{tag_name}}) {
+         $parse_rcdata->('CDATA', $insert);
+         return;
        } elsif ($token->{tag_name} eq 'select') {
          $reconstruct_active_formatting_elements->($insert_to_current);
-Line 2809 
 sub _tree_construction_main ($) {
+Line 2985 
 sub _tree_construction_main ($) {
        }
      } elsif ($token->{type} eq 'end tag') {
        if ($token->{tag_name} eq 'body') {
-         if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
+         if (@{$self->{open_elements}} > 1 and
-           ## ISSUE: There is an issue in the spec.
+             $self->{open_elements}->[1]->[1] eq 'body') {
-           if ($self->{open_elements}->[-1]->[1] ne 'body') {
+           for (@{$self->{open_elements}}) {
-             !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
+             unless ({
+                        dd => 1, dt => 1, li => 1, p => 1, td => 1,
+                        th => 1, tr => 1, body => 1, html => 1,
+                     }->{$_->[1]}) {
+               !!!parse-error (type => 'not closed:'.$_->[1]);
+             }
            }
            $self->{insertion_mode} = 'after body';
            !!!next-token;
            return;
-Line 3019 
 sub _tree_construction_main ($) {
+Line 3201 
 sub _tree_construction_main ($) {
                  #not $phrasing_category->{$node->[1]} and
                  ($special_category->{$node->[1]} or
                   $scoping_category->{$node->[1]})) {
-               !!!parse-error (type => 'not closed:'.$node->[1]);
+               !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
                ## Ignore the token
                !!!next-token;
                last S2;
-Line 3122 
 sub _tree_construction_main ($) {
+Line 3304 
 sub _tree_construction_main ($) {
              }
              redo B;
            } elsif ($token->{type} eq 'end tag') {
-             if ($token->{tag_name} eq 'html') {
+             if ({head => 1, body => 1, html => 1}->{$token->{tag_name}}) {
                ## As if <head>
                !!!create-element ($self->{head_element}, 'head');
                $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
-Line 3132 
 sub _tree_construction_main ($) {
+Line 3314 
 sub _tree_construction_main ($) {
                redo B;
              } else {
                !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
-               ## Ignore the token
+               ## Ignore the token ## ISSUE: An issue in the spec.
                !!!next-token;
                redo B;
              }
            } else {
              die "$0: $token->{type}: Unknown type";
            }
-         } elsif ($self->{insertion_mode} eq 'in head') {
+         } elsif ($self->{insertion_mode} eq 'in head' or
+                  $self->{insertion_mode} eq 'in head noscript' or
+                  $self->{insertion_mode} eq 'after head') {
            if ($token->{type} eq 'character') {
              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
                $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
-Line 3156 
 sub _tree_construction_main ($) {
+Line 3340 
 sub _tree_construction_main ($) {
              !!!next-token;
              redo B;
            } elsif ($token->{type} eq 'start tag') {
-             if ($token->{tag_name} eq 'title') {
+             if ({base => ($self->{insertion_mode} eq 'in head' or
-               ## NOTE: There is an "as if in head" code clone
+                           $self->{insertion_mode} eq 'after head'),
-               my $title_el;
+                  link => 1, meta => 1}->{$token->{tag_name}}) {
-               !!!create-element ($title_el, 'title', $token->{attributes});
+               ## NOTE: There is an "as if in head" code clone.
-               (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
+               if ($self->{insertion_mode} eq 'after head') {
-                 ->append_child ($title_el);
+                 !!!parse-error (type => 'after head:'.$token->{tag_name});
-               $self->{content_model_flag} = 'RCDATA';
+                 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
-               delete $self->{escape}; # MUST
+               }
+               !!!insert-element ($token->{tag_name}, $token->{attributes});
-               my $text = '';
+               pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
+               pop @{$self->{open_elements}}
+                   if $self->{insertion_mode} eq 'after head';
                !!!next-token;
-               while ($token->{type} eq 'character') {
+               redo B;
-                 $text .= $token->{data};
+             } elsif ($token->{tag_name} eq 'title' and
+                      $self->{insertion_mode} eq 'in head') {
+               ## NOTE: There is an "as if in head" code clone.
+               if ($self->{insertion_mode} eq 'after head') {
+                 !!!parse-error (type => 'after head:'.$token->{tag_name});
+                 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
+               }
+               $parse_rcdata->('RCDATA', $insert_to_current);
+               pop @{$self->{open_elements}}
+                   if $self->{insertion_mode} eq 'after head';
+               redo B;
+             } elsif ($token->{tag_name} eq 'style') {
+               ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and
+               ## insertion mode 'in head')
+               ## NOTE: There is an "as if in head" code clone.
+               if ($self->{insertion_mode} eq 'after head') {
+                 !!!parse-error (type => 'after head:'.$token->{tag_name});
+                 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
+               }
+               $parse_rcdata->('CDATA', $insert_to_current);
+               pop @{$self->{open_elements}}
+                   if $self->{insertion_mode} eq 'after head';
+               redo B;
+             } elsif ($token->{tag_name} eq 'noscript') {
+               if ($self->{insertion_mode} eq 'in head') {
+                 ## NOTE: and scripting is disabled
+                 !!!insert-element ($token->{tag_name}, $token->{attributes});
+                 $self->{insertion_mode} = 'in head noscript';
                  !!!next-token;
-               }
+                 redo B;
-               if (length $text) {
+               } elsif ($self->{insertion_mode} eq 'in head noscript') {
-                 $title_el->manakai_append_text ($text);
+                 !!!parse-error (type => 'noscript in noscript');
-               }
-               $self->{content_model_flag} = 'PCDATA';
-               if ($token->{type} eq 'end tag' and
-                   $token->{tag_name} eq 'title') {
                  ## Ignore the token
+                 redo B;
                } else {
-                 !!!parse-error (type => 'in RCDATA:#'.$token->{type});
+                 #
-                 ## ISSUE: And ignore?
                }
+             } elsif ($token->{tag_name} eq 'head' and
+                      $self->{insertion_mode} ne 'after head') {
+               !!!parse-error (type => 'in head:head'); # or in head noscript
+               ## Ignore the token
                !!!next-token;
                redo B;
-             } elsif ($token->{tag_name} eq 'style') {
+             } elsif ($self->{insertion_mode} ne 'in head noscript' and
-               $style_start_tag->();
+                      $token->{tag_name} eq 'script') {
+               if ($self->{insertion_mode} eq 'after head') {
+                 !!!parse-error (type => 'after head:'.$token->{tag_name});
+                 push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
+               }
+               ## NOTE: There is an "as if in head" code clone.
+               $script_start_tag->($insert_to_current);
+               pop @{$self->{open_elements}}
+                   if $self->{insertion_mode} eq 'after head';
                redo B;
-             } elsif ($token->{tag_name} eq 'script') {
+             } elsif ($self->{insertion_mode} eq 'after head' and
-               $script_start_tag->();
+                      $token->{tag_name} eq 'body') {
-               redo B;
+               !!!insert-element ('body', $token->{attributes});
-             } elsif ({base => 1, link => 1, meta => 1}->{$token->{tag_name}}) {
+               $self->{insertion_mode} = 'in body';
-               ## NOTE: There are "as if in head" code clones
-               my $el;
-               !!!create-element ($el, $token->{tag_name}, $token->{attributes});
-               (defined $self->{head_element} ? $self->{head_element} : $self->{open_elements}->[-1]->[0])
-                 ->append_child ($el);
                !!!next-token;
                redo B;
-             } elsif ($token->{tag_name} eq 'head') {
+             } elsif ($self->{insertion_mode} eq 'after head' and
-               !!!parse-error (type => 'in head:head');
+                      $token->{tag_name} eq 'frameset') {
-               ## Ignore the token
+               !!!insert-element ('frameset', $token->{attributes});
+               $self->{insertion_mode} = 'in frameset';
                !!!next-token;
                redo B;
              } else {
                #
              }
            } elsif ($token->{type} eq 'end tag') {
-             if ($token->{tag_name} eq 'head') {
+             if ($self->{insertion_mode} eq 'in head' and
-               if ($self->{open_elements}->[-1]->[1] eq 'head') {
+                 $token->{tag_name} eq 'head') {
-                 pop @{$self->{open_elements}};
+               pop @{$self->{open_elements}};
-               } else {
-                 !!!parse-error (type => 'unmatched end tag:head');
-               }
                $self->{insertion_mode} = 'after head';
                !!!next-token;
                redo B;
-             } elsif ($token->{tag_name} eq 'html') {
+             } elsif ($self->{insertion_mode} eq 'in head noscript' and
+                 $token->{tag_name} eq 'noscript') {
+               pop @{$self->{open_elements}};
+               $self->{insertion_mode} = 'in head';
+               !!!next-token;
+               redo B;
+             } elsif ($self->{insertion_mode} eq 'in head' and
+                      ($token->{tag_name} eq 'body' or
+                       $token->{tag_name} eq 'html')) {
                #
-             } else {
+             } elsif ($self->{insertion_mode} ne 'after head') {
                !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
                ## Ignore the token
                !!!next-token;
                redo B;
+             } else {
+               #
              }
            } else {
              #
            }
-           if ($self->{open_elements}->[-1]->[1] eq 'head') {
+           ## As if </head> or </noscript> or <body>
-             ## As if </head>
+           if ($self->{insertion_mode} eq 'in head') {
+             pop @{$self->{open_elements}};
+             $self->{insertion_mode} = 'after head';
+           } elsif ($self->{insertion_mode} eq 'in head noscript') {
              pop @{$self->{open_elements}};
+             !!!parse-error (type => 'in noscript:'.(defined $token->{tag_name} ? ($token->{type} eq 'end tag' ? '/' : '') . $token->{tag_name} : '#' . $token->{type}));
+             $self->{insertion_mode} = 'in head';
+           } else { # 'after head'
+             !!!insert-element ('body');
+             $self->{insertion_mode} = 'in body';
            }
-           $self->{insertion_mode} = 'after head';
            ## reprocess
            redo B;
            ## ISSUE: An issue in the spec.
-         } elsif ($self->{insertion_mode} eq 'after head') {
-           if ($token->{type} eq 'character') {
-             if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
-               $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
-               unless (length $token->{data}) {
-                 !!!next-token;
-                 redo B;
-               }
-             }
-             #
-           } elsif ($token->{type} eq 'comment') {
-             my $comment = $self->{document}->create_comment ($token->{data});
-             $self->{open_elements}->[-1]->[0]->append_child ($comment);
-             !!!next-token;
-             redo B;
-           } elsif ($token->{type} eq 'start tag') {
-             if ($token->{tag_name} eq 'body') {
-               !!!insert-element ('body', $token->{attributes});
-               $self->{insertion_mode} = 'in body';
-               !!!next-token;
-               redo B;
-             } elsif ($token->{tag_name} eq 'frameset') {
-               !!!insert-element ('frameset', $token->{attributes});
-               $self->{insertion_mode} = 'in frameset';
-               !!!next-token;
-               redo B;
-             } elsif ({
-                       base => 1, link => 1, meta => 1,
-                       script => 1, style => 1, title => 1,
-                      }->{$token->{tag_name}}) {
-               !!!parse-error (type => 'after head:'.$token->{tag_name});
-               $self->{insertion_mode} = 'in head';
-               ## reprocess
-               redo B;
-             } else {
-               #
-             }
-           } else {
-             #
-           }
-           ## As if <body>
-           !!!insert-element ('body');
-           $self->{insertion_mode} = 'in body';
-           ## reprocess
-           redo B;
          } elsif ($self->{insertion_mode} eq 'in body') {
            if ($token->{type} eq 'character') {
              ## NOTE: There is a code clone of "character in body".
-Line 4935 
 sub set_inner_html ($$$) {
+Line 5116 
 sub set_inner_html ($$$) {
      ## NOTE: Most of this code is copied from |parse_string|
      ## Step 1 # MUST
-     my $doc = $node->owner_document->implementation->create_document;
+     my $this_doc = $node->owner_document;
-     ## TODO: Mark as HTML document
+     my $doc = $this_doc->implementation->create_document;
+     $doc->manakai_is_html (1);
      my $p = $class->new;
      $p->{document} = $doc;
-Line 4946 
 sub set_inner_html ($$$) {
+Line 5128 
 sub set_inner_html ($$$) {
      my $column = 0;
      $p->{set_next_input_character} = sub {
        my $self = shift;
+       pop @{$self->{prev_input_character}};
+       unshift @{$self->{prev_input_character}}, $self->{next_input_character};
        $self->{next_input_character} = -1 and return if $i >= length $$s;
        $self->{next_input_character} = ord substr $$s, $i++, 1;
        $column++;
-Line 4954 
 sub set_inner_html ($$$) {
+Line 5140 
 sub set_inner_html ($$$) {
          $line++;
          $column = 0;
        } elsif ($self->{next_input_character} == 0x000D) { # CR
-         if ($i >= length $$s) {
+         $i++ if substr ($$s, $i, 1) eq "\x0A";
-           #
-         } else {
-           my $next_char = ord substr $$s, $i++, 1;
-           if ($next_char == 0x000A) { # LF
-             #
-           } else {
-             push @{$self->{char}}, $next_char;
-           }
-         }
          $self->{next_input_character} = 0x000A; # LF # MUST
          $line++;
          $column = 0;
        } elsif ($self->{next_input_character} > 0x10FFFF) {
          $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        } elsif ($self->{next_input_character} == 0x0000) { # NULL
+         !!!parse-error (type => 'NULL');
          $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        }
      };
+     $p->{prev_input_character} = [-1, -1, -1];
+     $p->{next_input_character} = -1;
      my $ponerror = $onerror || sub {
        my (%opt) = @_;
-Line 5051 
 sub set_inner_html ($$$) {
+Line 5231 
 sub set_inner_html ($$$) {
      ## Step 12 # MUST
      @cn = @{$root->child_nodes};
      for (@cn) {
+       $this_doc->adopt_node ($_);
        $node->append_child ($_);
      }
-     ## ISSUE: adopt_node? mutation events?
+     ## ISSUE: mutation events?
      $p->_terminate_tree_constructor;
    } else {
-Line 5124 
 sub get_inner_html ($$$) {
+Line 5305 
 sub get_inner_html ($$$) {
          spacer => 1, wbr => 1,
        }->{$tag_name};
+       $s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea';
        if (not $in_cdata and {
          style => 1, script => 1, xmp => 1, iframe => 1,
          noembed => 1, noframes => 1, noscript => 1,

 Legend:



Removed from v.1.13
 


changed lines


 
Added in v.1.25
 Legend:



Removed from v.1.13
 


changed lines


 
Added in v.1.25
-Removed from v.1.13
+Added in v.1.25

admin@suikawiki.org	ViewVC Help
Powered by ViewVC 1.1.24