/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch Patch

-revision 1.13 by wakaba,
Sat Jun 23 05:29:48 2007 UTC
+revision 1.22 by wakaba,
Sat Jun 23 14:55:45 2007 UTC
 Line 2 
 package Whatpm::HTML;
  use strict;
  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
- ## This is an early version of an HTML parser.
+ ## ISSUE:
+ ## var doc = implementation.createDocument (null, null, null);
+ ## doc.write ('');
+ ## alert (doc.compatMode);
  my $permitted_slash_tag_name = {
    base => 1,
-Line 18 
 my $permitted_slash_tag_name = {
+Line 21 
 my $permitted_slash_tag_name = {
    input => 1,
  };
- my $entity_char = {
-   AElig => "\x{00C6}",
-   Aacute => "\x{00C1}",
-   Acirc => "\x{00C2}",
-   Agrave => "\x{00C0}",
-   Alpha => "\x{0391}",
-   Aring => "\x{00C5}",
-   Atilde => "\x{00C3}",
-   Auml => "\x{00C4}",
-   Beta => "\x{0392}",
-   Ccedil => "\x{00C7}",
-   Chi => "\x{03A7}",
-   Dagger => "\x{2021}",
-   Delta => "\x{0394}",
-   ETH => "\x{00D0}",
-   Eacute => "\x{00C9}",
-   Ecirc => "\x{00CA}",
-   Egrave => "\x{00C8}",
-   Epsilon => "\x{0395}",
-   Eta => "\x{0397}",
-   Euml => "\x{00CB}",
-   Gamma => "\x{0393}",
-   Iacute => "\x{00CD}",
-   Icirc => "\x{00CE}",
-   Igrave => "\x{00CC}",
-   Iota => "\x{0399}",
-   Iuml => "\x{00CF}",
-   Kappa => "\x{039A}",
-   Lambda => "\x{039B}",
-   Mu => "\x{039C}",
-   Ntilde => "\x{00D1}",
-   Nu => "\x{039D}",
-   OElig => "\x{0152}",
-   Oacute => "\x{00D3}",
-   Ocirc => "\x{00D4}",
-   Ograve => "\x{00D2}",
-   Omega => "\x{03A9}",
-   Omicron => "\x{039F}",
-   Oslash => "\x{00D8}",
-   Otilde => "\x{00D5}",
-   Ouml => "\x{00D6}",
-   Phi => "\x{03A6}",
-   Pi => "\x{03A0}",
-   Prime => "\x{2033}",
-   Psi => "\x{03A8}",
-   Rho => "\x{03A1}",
-   Scaron => "\x{0160}",
-   Sigma => "\x{03A3}",
-   THORN => "\x{00DE}",
-   Tau => "\x{03A4}",
-   Theta => "\x{0398}",
-   Uacute => "\x{00DA}",
-   Ucirc => "\x{00DB}",
-   Ugrave => "\x{00D9}",
-   Upsilon => "\x{03A5}",
-   Uuml => "\x{00DC}",
-   Xi => "\x{039E}",
-   Yacute => "\x{00DD}",
-   Yuml => "\x{0178}",
-   Zeta => "\x{0396}",
-   aacute => "\x{00E1}",
-   acirc => "\x{00E2}",
-   acute => "\x{00B4}",
-   aelig => "\x{00E6}",
-   agrave => "\x{00E0}",
-   alefsym => "\x{2135}",
-   alpha => "\x{03B1}",
-   amp => "\x{0026}",
-   AMP => "\x{0026}",
-   and => "\x{2227}",
-   ang => "\x{2220}",
-   apos => "\x{0027}",
-   aring => "\x{00E5}",
-   asymp => "\x{2248}",
-   atilde => "\x{00E3}",
-   auml => "\x{00E4}",
-   bdquo => "\x{201E}",
-   beta => "\x{03B2}",
-   brvbar => "\x{00A6}",
-   bull => "\x{2022}",
-   cap => "\x{2229}",
-   ccedil => "\x{00E7}",
-   cedil => "\x{00B8}",
-   cent => "\x{00A2}",
-   chi => "\x{03C7}",
-   circ => "\x{02C6}",
-   clubs => "\x{2663}",
-   cong => "\x{2245}",
-   copy => "\x{00A9}",
-   COPY => "\x{00A9}",
-   crarr => "\x{21B5}",
-   cup => "\x{222A}",
-   curren => "\x{00A4}",
-   dArr => "\x{21D3}",
-   dagger => "\x{2020}",
-   darr => "\x{2193}",
-   deg => "\x{00B0}",
-   delta => "\x{03B4}",
-   diams => "\x{2666}",
-   divide => "\x{00F7}",
-   eacute => "\x{00E9}",
-   ecirc => "\x{00EA}",
-   egrave => "\x{00E8}",
-   empty => "\x{2205}",
-   emsp => "\x{2003}",
-   ensp => "\x{2002}",
-   epsilon => "\x{03B5}",
-   equiv => "\x{2261}",
-   eta => "\x{03B7}",
-   eth => "\x{00F0}",
-   euml => "\x{00EB}",
-   euro => "\x{20AC}",
-   exist => "\x{2203}",
-   fnof => "\x{0192}",
-   forall => "\x{2200}",
-   frac12 => "\x{00BD}",
-   frac14 => "\x{00BC}",
-   frac34 => "\x{00BE}",
-   frasl => "\x{2044}",
-   gamma => "\x{03B3}",
-   ge => "\x{2265}",
-   gt => "\x{003E}",
-   GT => "\x{003E}",
-   hArr => "\x{21D4}",
-   harr => "\x{2194}",
-   hearts => "\x{2665}",
-   hellip => "\x{2026}",
-   iacute => "\x{00ED}",
-   icirc => "\x{00EE}",
-   iexcl => "\x{00A1}",
-   igrave => "\x{00EC}",
-   image => "\x{2111}",
-   infin => "\x{221E}",
-   int => "\x{222B}",
-   iota => "\x{03B9}",
-   iquest => "\x{00BF}",
-   isin => "\x{2208}",
-   iuml => "\x{00EF}",
-   kappa => "\x{03BA}",
-   lArr => "\x{21D0}",
-   lambda => "\x{03BB}",
-   lang => "\x{2329}",
-   laquo => "\x{00AB}",
-   larr => "\x{2190}",
-   lceil => "\x{2308}",
-   ldquo => "\x{201C}",
-   le => "\x{2264}",
-   lfloor => "\x{230A}",
-   lowast => "\x{2217}",
-   loz => "\x{25CA}",
-   lrm => "\x{200E}",
-   lsaquo => "\x{2039}",
-   lsquo => "\x{2018}",
-   lt => "\x{003C}",
-   LT => "\x{003C}",
-   macr => "\x{00AF}",
-   mdash => "\x{2014}",
-   micro => "\x{00B5}",
-   middot => "\x{00B7}",
-   minus => "\x{2212}",
-   mu => "\x{03BC}",
-   nabla => "\x{2207}",
-   nbsp => "\x{00A0}",
-   ndash => "\x{2013}",
-   ne => "\x{2260}",
-   ni => "\x{220B}",
-   not => "\x{00AC}",
-   notin => "\x{2209}",
-   nsub => "\x{2284}",
-   ntilde => "\x{00F1}",
-   nu => "\x{03BD}",
-   oacute => "\x{00F3}",
-   ocirc => "\x{00F4}",
-   oelig => "\x{0153}",
-   ograve => "\x{00F2}",
-   oline => "\x{203E}",
-   omega => "\x{03C9}",
-   omicron => "\x{03BF}",
-   oplus => "\x{2295}",
-   or => "\x{2228}",
-   ordf => "\x{00AA}",
-   ordm => "\x{00BA}",
-   oslash => "\x{00F8}",
-   otilde => "\x{00F5}",
-   otimes => "\x{2297}",
-   ouml => "\x{00F6}",
-   para => "\x{00B6}",
-   part => "\x{2202}",
-   permil => "\x{2030}",
-   perp => "\x{22A5}",
-   phi => "\x{03C6}",
-   pi => "\x{03C0}",
-   piv => "\x{03D6}",
-   plusmn => "\x{00B1}",
-   pound => "\x{00A3}",
-   prime => "\x{2032}",
-   prod => "\x{220F}",
-   prop => "\x{221D}",
-   psi => "\x{03C8}",
-   quot => "\x{0022}",
-   QUOT => "\x{0022}",
-   rArr => "\x{21D2}",
-   radic => "\x{221A}",
-   rang => "\x{232A}",
-   raquo => "\x{00BB}",
-   rarr => "\x{2192}",
-   rceil => "\x{2309}",
-   rdquo => "\x{201D}",
-   real => "\x{211C}",
-   reg => "\x{00AE}",
-   REG => "\x{00AE}",
-   rfloor => "\x{230B}",
-   rho => "\x{03C1}",
-   rlm => "\x{200F}",
-   rsaquo => "\x{203A}",
-   rsquo => "\x{2019}",
-   sbquo => "\x{201A}",
-   scaron => "\x{0161}",
-   sdot => "\x{22C5}",
-   sect => "\x{00A7}",
-   shy => "\x{00AD}",
-   sigma => "\x{03C3}",
-   sigmaf => "\x{03C2}",
-   sim => "\x{223C}",
-   spades => "\x{2660}",
-   sub => "\x{2282}",
-   sube => "\x{2286}",
-   sum => "\x{2211}",
-   sup => "\x{2283}",
-   sup1 => "\x{00B9}",
-   sup2 => "\x{00B2}",
-   sup3 => "\x{00B3}",
-   supe => "\x{2287}",
-   szlig => "\x{00DF}",
-   tau => "\x{03C4}",
-   there4 => "\x{2234}",
-   theta => "\x{03B8}",
-   thetasym => "\x{03D1}",
-   thinsp => "\x{2009}",
-   thorn => "\x{00FE}",
-   tilde => "\x{02DC}",
-   times => "\x{00D7}",
-   trade => "\x{2122}",
-   uArr => "\x{21D1}",
-   uacute => "\x{00FA}",
-   uarr => "\x{2191}",
-   ucirc => "\x{00FB}",
-   ugrave => "\x{00F9}",
-   uml => "\x{00A8}",
-   upsih => "\x{03D2}",
-   upsilon => "\x{03C5}",
-   uuml => "\x{00FC}",
-   weierp => "\x{2118}",
-   xi => "\x{03BE}",
-   yacute => "\x{00FD}",
-   yen => "\x{00A5}",
-   yuml => "\x{00FF}",
-   zeta => "\x{03B6}",
-   zwj => "\x{200D}",
-   zwnj => "\x{200C}",
- }; # $entity_char
  my $c1_entity_char = {
    0x80 => 0x20AC,
    0x81 => 0xFFFD,
-Line 361 
 sub parse_string ($$$;$) {
+Line 102 
 sub parse_string ($$$;$) {
        $line++;
        $column = 0;
      } elsif ($self->{next_input_character} == 0x000D) { # CR
-       if ($i >= length $$s) {
+       $i++ if substr ($$s, $i, 1) eq "\x0A";
-         #
-       } else {
-         my $next_char = ord substr $$s, $i++, 1;
-         if ($next_char == 0x000A) { # LF
-           #
-         } else {
-           push @{$self->{char}}, $next_char;
-         }
-       }
        $self->{next_input_character} = 0x000A; # LF # MUST
        $line++;
        $column = 0;
-Line 426 
 sub _initialize_tokenizer ($) {
+Line 158 
 sub _initialize_tokenizer ($) {
    # $self->{next_input_character}
    !!!next-input-character;
    $self->{token} = [];
+   # $self->{escape}
  } # _initialize_tokenizer
  ## A token has:
  ##   ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
  ##       'character', or 'end-of-file'
- ##   ->{name} (DOCTYPE, start tag (tagname), end tag (tagname))
+ ##   ->{name} (DOCTYPE, start tag (tag name), end tag (tag name))
-     ## ISSUE: the spec need s/tagname/tag name/
+ ##   ->{public_identifier} (DOCTYPE)
- ##   ->{error} == 1 or 0 (DOCTYPE)
+ ##   ->{system_identifier} (DOCTYPE)
+ ##   ->{correct} == 1 or 0 (DOCTYPE)
  ##   ->{attributes} isa HASH (start tag, end tag)
  ##   ->{data} (comment, character)
- ## Macros
- ##   Macros MUST be preceded by three EXCLAMATION MARKs.
- ##   emit ($token)
- ##     Emits the specified token.
  ## Emitted token MUST immediately be handled by the tree construction state.
  ## Before each step, UA MAY check to see if either one of the scripts in
-Line 606 
 sub _get_next_token ($) {
+Line 335 
 sub _get_next_token ($) {
              !!!next-input-character;
              next TAGNAME;
            } else {
-             !!!parse-error (type => 'unmatched end tag');
              $self->{next_input_character} = shift @next_char; # reconsume
              !!!back-next-input-character (@next_char);
              $self->{state} = 'data';
-Line 625 
 sub _get_next_token ($) {
+Line 353 
 sub _get_next_token ($) {
                  $self->{next_input_character} == 0x0020 or # SP
                  $self->{next_input_character} == 0x003E or # >
                  $self->{next_input_character} == 0x002F or # /
-                 $self->{next_input_character} == 0x003C or # <
                  $self->{next_input_character} == -1) {
-           !!!parse-error (type => 'unmatched end tag');
            $self->{next_input_character} = shift @next_char; # reconsume
            !!!back-next-input-character (@next_char);
            $self->{state} = 'data';
-Line 709 
 sub _get_next_token ($) {
+Line 435 
 sub _get_next_token ($) {
          ## Stay in this state
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
-Line 796 
 sub _get_next_token ($) {
+Line 521 
 sub _get_next_token ($) {
          ## Stay in the state
          # next-input-character is already done
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
-Line 888 
 sub _get_next_token ($) {
+Line 612 
 sub _get_next_token ($) {
          $self->{state} = 'before attribute name';
          # next-input-character is already done
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          $before_leave->();
          if ($self->{current_token}->{type} eq 'start tag') {
-Line 966 
 sub _get_next_token ($) {
+Line 689 
 sub _get_next_token ($) {
          $self->{state} = 'before attribute name';
          # next-input-character is already done
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
-Line 1032 
 sub _get_next_token ($) {
+Line 754 
 sub _get_next_token ($) {
          undef $self->{current_token};
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
-Line 1160 
 sub _get_next_token ($) {
+Line 881 
 sub _get_next_token ($) {
          undef $self->{current_token};
          redo A;
-       } elsif ($self->{next_input_character} == 0x003C or # <
+       } elsif ($self->{next_input_character} == -1) {
-                $self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} eq 'start tag') {
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
-Line 1381 
 sub _get_next_token ($) {
+Line 1101 
 sub _get_next_token ($) {
          ## Stay in the state
          !!!next-input-character;
          redo A;
-       } elsif (0x0061 <= $self->{next_input_character} and
-                $self->{next_input_character} <= 0x007A) { # a..z
- ## ISSUE: "Set the token's name name to the" in the spec
-         $self->{current_token} = {type => 'DOCTYPE',
-                           name => chr ($self->{next_input_character} - 0x0020),
-                           error => 1};
-         $self->{state} = 'DOCTYPE name';
-         !!!next-input-character;
-         redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          !!!parse-error (type => 'no DOCTYPE name');
          $self->{state} = 'data';
          !!!next-input-character;
-         !!!emit ({type => 'DOCTYPE', name => '', error => 1});
+         !!!emit ({type => 'DOCTYPE'}); # incorrect
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 1403 
 sub _get_next_token ($) {
+Line 1114 
 sub _get_next_token ($) {
          $self->{state} = 'data';
          ## reconsume
-         !!!emit ({type => 'DOCTYPE', name => '', error => 1});
+         !!!emit ({type => 'DOCTYPE'}); # incorrect
          redo A;
        } else {
-         $self->{current_token} = {type => 'DOCTYPE',
+         $self->{current_token}
-                           name => chr ($self->{next_input_character}),
+             = {type => 'DOCTYPE',
-                           error => 1};
+                name => chr ($self->{next_input_character}),
+                correct => 1};
  ## ISSUE: "Set the token's name name to the" in the spec
          $self->{state} = 'DOCTYPE name';
          !!!next-input-character;
          redo A;
        }
      } elsif ($self->{state} eq 'DOCTYPE name') {
+ ## ISSUE: Redundant "First," in the spec.
        if ($self->{next_input_character} == 0x0009 or # HT
            $self->{next_input_character} == 0x000A or # LF
            $self->{next_input_character} == 0x000B or # VT
            $self->{next_input_character} == 0x000C or # FF
            $self->{next_input_character} == 0x0020) { # SP
-         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
          $self->{state} = 'after DOCTYPE name';
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
-         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
          $self->{state} = 'data';
          !!!next-input-character;
-Line 1434 
 sub _get_next_token ($) {
+Line 1145 
 sub _get_next_token ($) {
          undef $self->{current_token};
          redo A;
-       } elsif (0x0061 <= $self->{next_input_character} and
-                $self->{next_input_character} <= 0x007A) { # a..z
-         $self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE
-         #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
-         ## Stay in the state
-         !!!next-input-character;
-         redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE
          $self->{state} = 'data';
          ## reconsume
-         !!!emit ($self->{current_token});
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
          undef $self->{current_token};
          redo A;
        } else {
          $self->{current_token}->{name}
            .= chr ($self->{next_input_character}); # DOCTYPE
-         #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');
          ## Stay in the state
          !!!next-input-character;
          redo A;
-Line 1481 
 sub _get_next_token ($) {
+Line 1184 
 sub _get_next_token ($) {
          $self->{state} = 'data';
          ## reconsume
+         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
          undef $self->{current_token};
          redo A;
+       } elsif ($self->{next_input_character} == 0x0050 or # P
+                $self->{next_input_character} == 0x0070) { # p
+         !!!next-input-character;
+         if ($self->{next_input_character} == 0x0055 or # U
+             $self->{next_input_character} == 0x0075) { # u
+           !!!next-input-character;
+           if ($self->{next_input_character} == 0x0042 or # B
+               $self->{next_input_character} == 0x0062) { # b
+             !!!next-input-character;
+             if ($self->{next_input_character} == 0x004C or # L
+                 $self->{next_input_character} == 0x006C) { # l
+               !!!next-input-character;
+               if ($self->{next_input_character} == 0x0049 or # I
+                   $self->{next_input_character} == 0x0069) { # i
+                 !!!next-input-character;
+                 if ($self->{next_input_character} == 0x0043 or # C
+                     $self->{next_input_character} == 0x0063) { # c
+                   $self->{state} = 'before DOCTYPE public identifier';
+                   !!!next-input-character;
+                   redo A;
+                 }
+               }
+             }
+           }
+         }
+         #
+       } elsif ($self->{next_input_character} == 0x0053 or # S
+                $self->{next_input_character} == 0x0073) { # s
+         !!!next-input-character;
+         if ($self->{next_input_character} == 0x0059 or # Y
+             $self->{next_input_character} == 0x0079) { # y
+           !!!next-input-character;
+           if ($self->{next_input_character} == 0x0053 or # S
+               $self->{next_input_character} == 0x0073) { # s
+             !!!next-input-character;
+             if ($self->{next_input_character} == 0x0054 or # T
+                 $self->{next_input_character} == 0x0074) { # t
+               !!!next-input-character;
+               if ($self->{next_input_character} == 0x0045 or # E
+                   $self->{next_input_character} == 0x0065) { # e
+                 !!!next-input-character;
+                 if ($self->{next_input_character} == 0x004D or # M
+                     $self->{next_input_character} == 0x006D) { # m
+                   $self->{state} = 'before DOCTYPE system identifier';
+                   !!!next-input-character;
+                   redo A;
+                 }
+               }
+             }
+           }
+         }
+         #
        } else {
-         !!!parse-error (type => 'string after DOCTYPE name');
+         !!!next-input-character;
-         $self->{current_token}->{error} = 1; # DOCTYPE
+         #
+       }
+       !!!parse-error (type => 'string after DOCTYPE name');
+       $self->{state} = 'bogus DOCTYPE';
+       # next-input-character is already done
+       redo A;
+     } elsif ($self->{state} eq 'before DOCTYPE public identifier') {
+       if ({
+             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
+             #0x000D => 1, # HT, LF, VT, FF, SP, CR
+           }->{$self->{next_input_character}}) {
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} eq 0x0022) { # "
+         $self->{current_token}->{public_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE public identifier (double-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} eq 0x0027) { # '
+         $self->{current_token}->{public_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE public identifier (single-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} eq 0x003E) { # >
+         !!!parse-error (type => 'no PUBLIC literal');
+         $self->{state} = 'data';
+         !!!next-input-character;
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         undef $self->{current_token};
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed DOCTYPE');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         undef $self->{current_token};
+         redo A;
+       } else {
+         !!!parse-error (type => 'string after PUBLIC');
+         $self->{state} = 'bogus DOCTYPE';
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'DOCTYPE public identifier (double-quoted)') {
+       if ($self->{next_input_character} == 0x0022) { # "
+         $self->{state} = 'after DOCTYPE public identifier';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed PUBLIC literal');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         undef $self->{current_token};
+         redo A;
+       } else {
+         $self->{current_token}->{public_identifier} # DOCTYPE
+             .= chr $self->{next_input_character};
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'DOCTYPE public identifier (single-quoted)') {
+       if ($self->{next_input_character} == 0x0027) { # '
+         $self->{state} = 'after DOCTYPE public identifier';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed PUBLIC literal');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         undef $self->{current_token};
+         redo A;
+       } else {
+         $self->{current_token}->{public_identifier} # DOCTYPE
+             .= chr $self->{next_input_character};
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'after DOCTYPE public identifier') {
+       if ({
+             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
+             #0x000D => 1, # HT, LF, VT, FF, SP, CR
+           }->{$self->{next_input_character}}) {
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0022) { # "
+         $self->{current_token}->{system_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE system identifier (double-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0027) { # '
+         $self->{current_token}->{system_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE system identifier (single-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         $self->{state} = 'data';
+         !!!next-input-character;
+         !!!emit ($self->{current_token}); # DOCTYPE
+         undef $self->{current_token};
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed DOCTYPE');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         undef $self->{current_token};
+         redo A;
+       } else {
+         !!!parse-error (type => 'string after PUBLIC literal');
+         $self->{state} = 'bogus DOCTYPE';
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'before DOCTYPE system identifier') {
+       if ({
+             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
+             #0x000D => 1, # HT, LF, VT, FF, SP, CR
+           }->{$self->{next_input_character}}) {
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0022) { # "
+         $self->{current_token}->{system_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE system identifier (double-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x0027) { # '
+         $self->{current_token}->{system_identifier} = ''; # DOCTYPE
+         $self->{state} = 'DOCTYPE system identifier (single-quoted)';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         !!!parse-error (type => 'no SYSTEM literal');
+         $self->{state} = 'data';
+         !!!next-input-character;
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         undef $self->{current_token};
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed DOCTYPE');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         undef $self->{current_token};
+         redo A;
+       } else {
+         !!!parse-error (type => 'string after PUBLIC literal');
+         $self->{state} = 'bogus DOCTYPE';
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'DOCTYPE system identifier (double-quoted)') {
+       if ($self->{next_input_character} == 0x0022) { # "
+         $self->{state} = 'after DOCTYPE system identifier';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed SYSTEM literal');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         undef $self->{current_token};
+         redo A;
+       } else {
+         $self->{current_token}->{system_identifier} # DOCTYPE
+             .= chr $self->{next_input_character};
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'DOCTYPE system identifier (single-quoted)') {
+       if ($self->{next_input_character} == 0x0027) { # '
+         $self->{state} = 'after DOCTYPE system identifier';
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed SYSTEM literal');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         undef $self->{current_token};
+         redo A;
+       } else {
+         $self->{current_token}->{system_identifier} # DOCTYPE
+             .= chr $self->{next_input_character};
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       }
+     } elsif ($self->{state} eq 'after DOCTYPE system identifier') {
+       if ({
+             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
+             #0x000D => 1, # HT, LF, VT, FF, SP, CR
+           }->{$self->{next_input_character}}) {
+         ## Stay in the state
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         $self->{state} = 'data';
+         !!!next-input-character;
+         !!!emit ($self->{current_token}); # DOCTYPE
+         undef $self->{current_token};
+         redo A;
+       } elsif ($self->{next_input_character} == -1) {
+         !!!parse-error (type => 'unclosed DOCTYPE');
+         $self->{state} = 'data';
+         ## reconsume
+         delete $self->{current_token}->{correct};
+         !!!emit ($self->{current_token}); # DOCTYPE
+         undef $self->{current_token};
+         redo A;
+       } else {
+         !!!parse-error (type => 'string after SYSTEM literal');
          $self->{state} = 'bogus DOCTYPE';
          !!!next-input-character;
          redo A;
-Line 1497 
 sub _get_next_token ($) {
+Line 1516 
 sub _get_next_token ($) {
          $self->{state} = 'data';
          !!!next-input-character;
+         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
          undef $self->{current_token};
-Line 1506 
 sub _get_next_token ($) {
+Line 1526 
 sub _get_next_token ($) {
          $self->{state} = 'data';
          ## reconsume
+         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
          undef $self->{current_token};
-Line 1525 
 sub _get_next_token ($) {
+Line 1546 
 sub _get_next_token ($) {
  sub _tokenize_attempt_to_consume_an_entity ($) {
    my $self = shift;
-   if ($self->{next_input_character} == 0x0023) { # #
+   if ({
+     0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
+     0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
+       }->{$self->{next_input_character}}) {
+     ## Don't consume
+     ## No error
+     return undef;
+   } elsif ($self->{next_input_character} == 0x0023) { # #
      !!!next-input-character;
      if ($self->{next_input_character} == 0x0078 or # x
          $self->{next_input_character} == 0x0058) { # X
-Line 1621 
 sub _tokenize_attempt_to_consume_an_enti
+Line 1649 
 sub _tokenize_attempt_to_consume_an_enti
      my $value = $entity_name;
      my $match;
+     require Whatpm::_NamedEntityList;
+     our $EntityChar;
      while (length $entity_name < 10 and
             ## NOTE: Some number greater than the maximum length of entity name
-            ((0x0041 <= $self->{next_input_character} and
+            ((0x0041 <= $self->{next_input_character} and # A
-              $self->{next_input_character} <= 0x005A) or
+              $self->{next_input_character} <= 0x005A) or # Z
-             (0x0061 <= $self->{next_input_character} and
+             (0x0061 <= $self->{next_input_character} and # a
-              $self->{next_input_character} <= 0x007A) or
+              $self->{next_input_character} <= 0x007A) or # z
-             (0x0030 <= $self->{next_input_character} and
+             (0x0030 <= $self->{next_input_character} and # 0
-              $self->{next_input_character} <= 0x0039))) {
+              $self->{next_input_character} <= 0x0039) or # 9
+             $self->{next_input_character} == 0x003B)) { # ;
        $entity_name .= chr $self->{next_input_character};
-       if (defined $entity_char->{$entity_name}) {
+       if (defined $EntityChar->{$entity_name}) {
-         $value = $entity_char->{$entity_name};
+         $value = $EntityChar->{$entity_name};
-         $match = 1;
+         if ($self->{next_input_character} == 0x003B) { # ;
+           $match = 1;
+           !!!next-input-character;
+           last;
+         } else {
+           $match = -1;
+         }
        } else {
          $value .= chr $self->{next_input_character};
        }
        !!!next-input-character;
      }
-     if ($match) {
+     if ($match > 0) {
-       if ($self->{next_input_character} == 0x003B) { # ;
+       return {type => 'character', data => $value};
-         !!!next-input-character;
+     } elsif ($match < 0) {
-       } else {
+       !!!parse-error (type => 'refc');
-         !!!parse-error (type => 'refc');
-       }
        return {type => 'character', data => $value};
      } else {
        !!!parse-error (type => 'bare ero');
-Line 1667 
 sub _initialize_tree_constructor ($) {
+Line 1701 
 sub _initialize_tree_constructor ($) {
    $self->{document}->strict_error_checking (0);
    ## TODO: Turn mutation events off # MUST
    ## TODO: Turn loose Document option (manakai extension) on
-   ## TODO: Mark the Document as an HTML document # MUST
+   $self->{document}->manakai_is_html (1); # MUST
  } # _initialize_tree_constructor
  sub _terminate_tree_constructor ($) {
-Line 1707 
 sub _construct_tree ($) {
+Line 1741 
 sub _construct_tree ($) {
  sub _tree_construction_initial ($) {
    my $self = shift;
-   B: {
+   INITIAL: {
-       if ($token->{type} eq 'DOCTYPE') {
+     if ($token->{type} eq 'DOCTYPE') {
-         if ($token->{error}) {
+       ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
-           ## ISSUE: Spec currently left this case undefined.
+       ## error, switch to a conformance checking mode for another
-           !!!parse-error (type => 'bogus DOCTYPE');
+       ## language.
-         }
+       my $doctype_name = $token->{name};
-         my $doctype = $self->{document}->create_document_type_definition
+       $doctype_name = '' unless defined $doctype_name;
-           ($token->{name});
+       $doctype_name =~ tr/a-z/A-Z/;
-         $self->{document}->append_child ($doctype);
+       if (not defined $token->{name} or # <!DOCTYPE>
-         #$phase = 'root element';
+           defined $token->{public_identifier} or
-         !!!next-token;
+           defined $token->{system_identifier}) {
-         #redo B;
+         !!!parse-error (type => 'not HTML5');
-         return;
+       } elsif ($doctype_name ne 'HTML') {
-       } elsif ({
+         ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
-                 comment => 1,
+         !!!parse-error (type => 'not HTML5');
-                 'start tag' => 1,
+       }
-                 'end tag' => 1,
-                 'end-of-file' => 1,
+       my $doctype = $self->{document}->create_document_type_definition
-                }->{$token->{type}}) {
+         ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
-         ## ISSUE: Spec currently left this case undefined.
+       $doctype->public_id ($token->{public_identifier})
-         !!!parse-error (type => 'missing DOCTYPE');
+           if defined $token->{public_identifier};
-         #$phase = 'root element';
+       $doctype->system_id ($token->{system_identifier})
-         ## reprocess
+           if defined $token->{system_identifier};
-         #redo B;
+       ## NOTE: Other DocumentType attributes are null or empty lists.
-         return;
+       ## ISSUE: internalSubset = null??
-       } elsif ($token->{type} eq 'character') {
+       $self->{document}->append_child ($doctype);
-         if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
-           $self->{document}->manakai_append_text ($1);
+       if (not $token->{correct} or $doctype_name ne 'HTML') {
-           ## ISSUE: DOM3 Core does not allow Document > Text
+         $self->{document}->manakai_compat_mode ('quirks');
-           unless (length $token->{data}) {
+       } elsif (defined $token->{public_identifier}) {
-             ## Stay in the phase
+         my $pubid = $token->{public_identifier};
-             !!!next-token;
+         $pubid =~ tr/a-z/A-Z/;
-             redo B;
+         if ({
+           "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
+           "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
+           "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
+           "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
+           "-//IETF//DTD HTML 2.0//EN" => 1,
+           "-//IETF//DTD HTML 2.1E//EN" => 1,
+           "-//IETF//DTD HTML 3.0//EN" => 1,
+           "-//IETF//DTD HTML 3.0//EN//" => 1,
+           "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
+           "-//IETF//DTD HTML 3.2//EN" => 1,
+           "-//IETF//DTD HTML 3//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 0//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
+           "-//IETF//DTD HTML LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
+           "-//IETF//DTD HTML LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
+           "-//IETF//DTD HTML LEVEL 3//EN" => 1,
+           "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
+           "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
+           "-//IETF//DTD HTML STRICT//EN" => 1,
+           "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
+           "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
+           "-//IETF//DTD HTML//EN" => 1,
+           "-//IETF//DTD HTML//EN//2.0" => 1,
+           "-//IETF//DTD HTML//EN//3.0" => 1,
+           "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
+           "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
+           "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
+           "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
+           "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
+           "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
+           "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
+           "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
+           "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
+           "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
+           "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
+           "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
+           "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
+           "-//W3C//DTD HTML 3.2//EN" => 1,
+           "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
+           "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
+           "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
+           "-//W3C//DTD HTML EXPERIMENTAL 19960712//EN" => 1,
+           "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
+           "-//W3C//DTD W3 HTML//EN" => 1,
+           "-//W3O//DTD W3 HTML 3.0//EN" => 1,
+           "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
+           "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
+           "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
+           "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
+           "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
+           "HTML" => 1,
+         }->{$pubid}) {
+           $self->{document}->manakai_compat_mode ('quirks');
+         } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
+                  $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
+           if (defined $token->{system_identifier}) {
+             $self->{document}->manakai_compat_mode ('quirks');
+           } else {
+             $self->{document}->manakai_compat_mode ('limited quirks');
            }
+         } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 FRAMESET//EN" or
+                  $pubid eq "-//W3C//DTD XHTML 1.0 TRANSITIONAL//EN") {
+           $self->{document}->manakai_compat_mode ('limited quirks');
+         }
+       }
+       if (defined $token->{system_identifier}) {
+         my $sysid = $token->{system_identifier};
+         $sysid =~ tr/A-Z/a-z/;
+         if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
+           $self->{document}->manakai_compat_mode ('quirks');
          }
-         ## ISSUE: Spec currently left this case undefined.
-         !!!parse-error (type => 'missing DOCTYPE');
-         #$phase = 'root element';
-         ## reprocess
-         #redo B;
-         return;
-       } else {
-         die "$0: $token->{type}: Unknown token";
        }
-     } # B
+       ## Go to the root element phase.
+       !!!next-token;
+       return;
+     } elsif ({
+               'start tag' => 1,
+               'end tag' => 1,
+               'end-of-file' => 1,
+              }->{$token->{type}}) {
+       !!!parse-error (type => 'no DOCTYPE');
+       $self->{document}->manakai_compat_mode ('quirks');
+       ## Go to the root element phase
+       ## reprocess
+       return;
+     } elsif ($token->{type} eq 'character') {
+       if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
+         ## Ignore the token
+         unless (length $token->{data}) {
+           ## Stay in the phase
+           !!!next-token;
+           redo INITIAL;
+         }
+       }
+       !!!parse-error (type => 'no DOCTYPE');
+       $self->{document}->manakai_compat_mode ('quirks');
+       ## Go to the root element phase
+       ## reprocess
+       return;
+     } elsif ($token->{type} eq 'comment') {
+       my $comment = $self->{document}->create_comment ($token->{data});
+       $self->{document}->append_child ($comment);
+       ## Stay in the phase.
+       !!!next-token;
+       redo INITIAL;
+     } else {
+       die "$0: $token->{type}: Unknown token";
+     }
+   } # INITIAL
  } # _tree_construction_initial
  sub _tree_construction_root_element ($) {
-Line 2574 
 sub _tree_construction_main ($) {
+Line 2726 
 sub _tree_construction_main ($) {
          return;
        } elsif ({
                  b => 1, big => 1, em => 1, font => 1, i => 1,
-                 nobr => 1, s => 1, small => 1, strile => 1,
+                 s => 1, small => 1, strike => 1,
                  strong => 1, tt => 1, u => 1,
                 }->{$token->{tag_name}}) {
          $reconstruct_active_formatting_elements->($insert_to_current);
-Line 2584 
 sub _tree_construction_main ($) {
+Line 2736 
 sub _tree_construction_main ($) {
          !!!next-token;
          return;
+       } elsif ($token->{tag_name} eq 'nobr') {
+         $reconstruct_active_formatting_elements->($insert_to_current);
+         ## has a |nobr| element in scope
+         INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
+           my $node = $self->{open_elements}->[$_];
+           if ($node->[1] eq 'nobr') {
+             !!!back-token;
+             $token = {type => 'end tag', tag_name => 'nobr'};
+             return;
+           } elsif ({
+                     table => 1, caption => 1, td => 1, th => 1,
+                     button => 1, marquee => 1, object => 1, html => 1,
+                    }->{$node->[1]}) {
+             last INSCOPE;
+           }
+         } # INSCOPE
+         !!!insert-element-t ($token->{tag_name}, $token->{attributes});
+         push @$active_formatting_elements, $self->{open_elements}->[-1];
+         !!!next-token;
+         return;
        } elsif ($token->{tag_name} eq 'button') {
          ## has a button element in scope
          INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
-Line 2703 
 sub _tree_construction_main ($) {
+Line 2878 
 sub _tree_construction_main ($) {
            return;
          } else {
            my $at = $token->{attributes};
+           my $form_attrs;
+           $form_attrs->{action} = $at->{action} if $at->{action};
+           my $prompt_attr = $at->{prompt};
            $at->{name} = {name => 'name', value => 'isindex'};
+           delete $at->{action};
+           delete $at->{prompt};
            my @tokens = (
-                         {type => 'start tag', tag_name => 'form'},
+                         {type => 'start tag', tag_name => 'form',
+                          attributes => $form_attrs},
                          {type => 'start tag', tag_name => 'hr'},
                          {type => 'start tag', tag_name => 'p'},
                          {type => 'start tag', tag_name => 'label'},
-                         {type => 'character',
+                        );
-                          data => 'This is a searchable index. Insert your search keywords here: '}, # SHOULD
+           if ($prompt_attr) {
-                         ## TODO: make this configurable
+             push @tokens, {type => 'character', data => $prompt_attr->{value}};
+           } else {
+             push @tokens, {type => 'character',
+                            data => 'This is a searchable index. Insert your search keywords here: '}; # SHOULD
+             ## TODO: make this configurable
+           }
+           push @tokens,
                          {type => 'start tag', tag_name => 'input', attributes => $at},
                          #{type => 'character', data => ''}, # SHOULD
                          {type => 'end tag', tag_name => 'label'},
                          {type => 'end tag', tag_name => 'p'},
                          {type => 'start tag', tag_name => 'hr'},
-                         {type => 'end tag', tag_name => 'form'},
+                         {type => 'end tag', tag_name => 'form'};
-                        );
            $token = shift @tokens;
            !!!back-token (@tokens);
            return;
-Line 2809 
 sub _tree_construction_main ($) {
+Line 2995 
 sub _tree_construction_main ($) {
        }
      } elsif ($token->{type} eq 'end tag') {
        if ($token->{tag_name} eq 'body') {
-         if (@{$self->{open_elements}} > 1 and $self->{open_elements}->[1]->[1] eq 'body') {
+         if (@{$self->{open_elements}} > 1 and
-           ## ISSUE: There is an issue in the spec.
+             $self->{open_elements}->[1]->[1] eq 'body') {
-           if ($self->{open_elements}->[-1]->[1] ne 'body') {
+           for (@{$self->{open_elements}}) {
-             !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
+             unless ({
+                        dd => 1, dt => 1, li => 1, p => 1, td => 1,
+                        th => 1, tr => 1, body => 1, html => 1,
+                     }->{$_->[1]}) {
+               !!!parse-error (type => 'not closed:'.$_->[1]);
+             }
            }
            $self->{insertion_mode} = 'after body';
            !!!next-token;
            return;
-Line 3122 
 sub _tree_construction_main ($) {
+Line 3314 
 sub _tree_construction_main ($) {
              }
              redo B;
            } elsif ($token->{type} eq 'end tag') {
-             if ($token->{tag_name} eq 'html') {
+             if ({head => 1, body => 1, html => 1}->{$token->{tag_name}}) {
                ## As if <head>
                !!!create-element ($self->{head_element}, 'head');
                $self->{open_elements}->[-1]->[0]->append_child ($self->{head_element});
-Line 3132 
 sub _tree_construction_main ($) {
+Line 3324 
 sub _tree_construction_main ($) {
                redo B;
              } else {
                !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
-               ## Ignore the token
+               ## Ignore the token ## ISSUE: An issue in the spec.
                !!!next-token;
                redo B;
              }
-Line 3219 
 sub _tree_construction_main ($) {
+Line 3411 
 sub _tree_construction_main ($) {
                $self->{insertion_mode} = 'after head';
                !!!next-token;
                redo B;
-             } elsif ($token->{tag_name} eq 'html') {
+             } elsif ($token->{tag_name} eq 'body' or
+                      $token->{tag_name} eq 'html') {
                #
              } else {
                !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
-Line 4935 
 sub set_inner_html ($$$) {
+Line 5128 
 sub set_inner_html ($$$) {
      ## NOTE: Most of this code is copied from |parse_string|
      ## Step 1 # MUST
-     my $doc = $node->owner_document->implementation->create_document;
+     my $this_doc = $node->owner_document;
-     ## TODO: Mark as HTML document
+     my $doc = $this_doc->implementation->create_document;
+     $doc->manakai_is_html (1);
      my $p = $class->new;
      $p->{document} = $doc;
-Line 4946 
 sub set_inner_html ($$$) {
+Line 5140 
 sub set_inner_html ($$$) {
      my $column = 0;
      $p->{set_next_input_character} = sub {
        my $self = shift;
+       pop @{$self->{prev_input_character}};
+       unshift @{$self->{prev_input_character}}, $self->{next_input_character};
        $self->{next_input_character} = -1 and return if $i >= length $$s;
        $self->{next_input_character} = ord substr $$s, $i++, 1;
        $column++;
-Line 4954 
 sub set_inner_html ($$$) {
+Line 5152 
 sub set_inner_html ($$$) {
          $line++;
          $column = 0;
        } elsif ($self->{next_input_character} == 0x000D) { # CR
-         if ($i >= length $$s) {
+         $i++ if substr ($$s, $i, 1) eq "\x0A";
-           #
-         } else {
-           my $next_char = ord substr $$s, $i++, 1;
-           if ($next_char == 0x000A) { # LF
-             #
-           } else {
-             push @{$self->{char}}, $next_char;
-           }
-         }
          $self->{next_input_character} = 0x000A; # LF # MUST
          $line++;
          $column = 0;
        } elsif ($self->{next_input_character} > 0x10FFFF) {
          $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        } elsif ($self->{next_input_character} == 0x0000) { # NULL
+         !!!parse-error (type => 'NULL');
          $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        }
      };
+     $p->{prev_input_character} = [-1, -1, -1];
+     $p->{next_input_character} = -1;
      my $ponerror = $onerror || sub {
        my (%opt) = @_;
-Line 5051 
 sub set_inner_html ($$$) {
+Line 5243 
 sub set_inner_html ($$$) {
      ## Step 12 # MUST
      @cn = @{$root->child_nodes};
      for (@cn) {
+       $this_doc->adopt_node ($_);
        $node->append_child ($_);
      }
-     ## ISSUE: adopt_node? mutation events?
+     ## ISSUE: mutation events?
      $p->_terminate_tree_constructor;
    } else {

 Legend:



Removed from v.1.13
 


changed lines


 
Added in v.1.22
 Legend:



Removed from v.1.13
 


changed lines


 
Added in v.1.22
-Removed from v.1.13
+Added in v.1.22

admin@suikawiki.org	ViewVC Help
Powered by ViewVC 1.1.24