/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch Patch

-revision 1.56 by wakaba,
Sat Aug 11 07:19:18 2007 UTC
+revision 1.75 by wakaba,
Mon Mar  3 00:13:22 2008 UTC
 Line 1
  package Whatpm::HTML;
  use strict;
  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
+ use Error qw(:try);
  ## ISSUE:
  ## var doc = implementation.createDocument (null, null, null);
  ## doc.write ('');
  ## alert (doc.compatMode);
- ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT
+ ## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263)
- ## strip BOM and the HTML layer MUST ignore it.  Whether we can do it
+ ## TODO: 1252 parse error (revision 1264)
- ## is not yet clear.
+ ## TODO: 8859-11 = 874 (revision 1271)
- ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?
- ## "{U+FEFF}..." in GB18030?
  my $permitted_slash_tag_name = {
    base => 1,
-Line 19 
 my $permitted_slash_tag_name = {
+Line 18 
 my $permitted_slash_tag_name = {
    meta => 1,
    hr => 1,
    br => 1,
-   img=> 1,
+   img => 1,
    embed => 1,
    param => 1,
    area => 1,
-Line 84 
 my $formatting_category = {
+Line 83 
 my $formatting_category = {
  };
  # $phrasing_category: all other elements
+ ## Parse a byte string as an HTML document.
+ ##
+ ## Arguments, after the invocant (a class name, in which case a new
+ ## parser is created, or an existing parser object):
+ ##   $charset - encoding label used to decode the bytes; if undefined,
+ ##              the encoding is guessed from the first 1024 bytes and
+ ##              defaults to windows-1252 when nothing is detected
+ ##   $bytes   - the byte string, or a reference to it
+ ##   The remaining arguments are passed through to |parse_char_string|
+ ##   unchanged.
+ ## Returns whatever |parse_char_string| returns.
+ ##
+ ## If the parser later asks for a different encoding through the
+ ## |change_encoding| callback, the input is decoded again with the new
+ ## encoding and parsed from the beginning.
+ sub parse_byte_string ($$$$;$) {
+   my $self = ref $_[0] ? shift : shift->new;
+   my $charset = shift;
+   my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
+   ## Load Encode up front: both branches below and the restart handler
+   ## call |Encode::decode|.
+   require Encode; ## TODO: decode(utf8) doesn't delete BOM
+   my $s;
+   if (defined $charset) {
+     $s = \ (Encode::decode ($charset, $$bytes_s));
+     $self->{input_encoding} = lc $charset; ## TODO: normalize name
+     $self->{confident} = 1;
+   } else {
+     ## TODO: Implement HTML5 detection algorithm
+     require Whatpm::Charset::UniversalCharDet;
+     $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
+         (substr ($$bytes_s, 0, 1024));
+     $charset ||= 'windows-1252';
+     $s = \ (Encode::decode ($charset, $$bytes_s));
+     ## Store the name in lowercase, as the explicit-charset branch does;
+     ## |change_encoding| compares it with a lowercased label.
+     $self->{input_encoding} = lc $charset; ## TODO: normalize name
+     $self->{confident} = 0;
+   }
+   ## Callback invoked with the parser object and a new charset label.
+   $self->{change_encoding} = sub {
+     my $self = shift;
+     my $charset = lc shift;
+     ## TODO: if $charset is supported
+     ## TODO: normalize charset name
+     ## "Change the encoding" algorithm:
+     ## Step 1
+     if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
+       $charset = 'utf-8';
+     }
+     ## Step 2
+     if (defined $self->{input_encoding} and
+         $self->{input_encoding} eq $charset) {
+       ## Same encoding as the one in use: nothing to redo.
+       $self->{confident} = 1;
+       return;
+     }
+     !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
+         ':'.$charset, level => 'w');
+     ## Step 3
+     # if (can) {
+       ## change the encoding on the fly.
+       #$self->{confident} = 1;
+       #return;
+     # }
+     ## Step 4
+     throw Whatpm::HTML::RestartParser (charset => $charset);
+   }; # $self->{change_encoding}
+   my @args = @_; shift @args; # $s
+   my $return;
+   try {
+     $return = $self->parse_char_string ($s, @args);
+   } catch Whatpm::HTML::RestartParser with {
+     ## Decode the original bytes again with the requested encoding and
+     ## parse from the start.
+     my $charset = shift->{charset};
+     $s = \ (Encode::decode ($charset, $$bytes_s));
+     $self->{input_encoding} = $charset; ## TODO: normalize
+     $self->{confident} = 1;
+     $return = $self->parse_char_string ($s, @args);
+   };
+   return $return;
+ } # parse_byte_string
+ ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
+ ## and the HTML layer MUST ignore it.  However, we do strip the BOM in
+ ## the encoding layer and the HTML layer does not ignore any U+FEFF,
+ ## because the core part of our HTML parser expects a string of characters,
+ ## not a string of bytes or code units or anything which might contain a BOM.
+ ## Therefore, any parser interface that accepts a string of bytes,
+ ## such as |parse_byte_string| in this module, must ensure that it does
+ ## strip the BOM and never strips any ZWNBSP.
+ *parse_char_string = \&parse_string;
  sub parse_string ($$$;$) {
-   my $self = shift->new;
+   my $self = ref $_[0] ? shift : shift->new;
-   my $s = \$_[0];
+   my $s = ref $_[0] ? $_[0] : \($_[0]);
    $self->{document} = $_[1];
+   @{$self->{document}->child_nodes} = ();
    ## NOTE: |set_inner_html| copies most of this method's code
+   $self->{confident} = 1 unless exists $self->{confident};
+   $self->{document}->input_encoding ($self->{input_encoding})
+       if defined $self->{input_encoding};
    my $i = 0;
    my $line = 1;
    my $column = 0;
-Line 147 
 sub new ($) {
+Line 232 
 sub new ($) {
    $self->{parse_error} = sub {
      #
    };
+   $self->{change_encoding} = sub {
+     # if ($_[0] is a supported encoding) {
+     #   run "change the encoding" algorithm;
+     #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
+     # }
+   };
+   $self->{application_cache_selection} = sub {
+     #
+   };
    return $self;
  } # new
-Line 159 
 sub CDATA_CONTENT_MODEL () { CM_LIMITED_
+Line 253 
 sub CDATA_CONTENT_MODEL () { CM_LIMITED_
  sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
  sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
+ sub DATA_STATE () { 0 } # state assigned by _initialize_tokenizer
+ sub ENTITY_DATA_STATE () { 1 } # entered from DATA_STATE on "&" when entities are allowed
+ sub TAG_OPEN_STATE () { 2 }
+ sub CLOSE_TAG_OPEN_STATE () { 3 }
+ sub TAG_NAME_STATE () { 4 }
+ sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
+ sub ATTRIBUTE_NAME_STATE () { 6 }
+ sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
+ sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
+ sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
+ sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
+ sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
+ sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } # on "&" in an attribute value; returns to {last_attribute_value_state}
+ sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
+ sub COMMENT_START_STATE () { 14 }
+ sub COMMENT_START_DASH_STATE () { 15 }
+ sub COMMENT_STATE () { 16 }
+ sub COMMENT_END_STATE () { 17 }
+ sub COMMENT_END_DASH_STATE () { 18 }
+ sub BOGUS_COMMENT_STATE () { 19 }
+ sub DOCTYPE_STATE () { 20 }
+ sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
+ sub DOCTYPE_NAME_STATE () { 22 }
+ sub AFTER_DOCTYPE_NAME_STATE () { 23 }
+ sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
+ sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
+ sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
+ sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
+ sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
+ sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
+ sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
+ sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
+ sub BOGUS_DOCTYPE_STATE () { 32 }
+ sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 } # entered when a quoted attribute value is closed by its quote
  sub DOCTYPE_TOKEN () { 1 }
  sub COMMENT_TOKEN () { 2 }
  sub START_TAG_TOKEN () { 3 }
-Line 197 
 sub IN_COLUMN_GROUP_IM () { 0b10 }
+Line 326 
 sub IN_COLUMN_GROUP_IM () { 0b10 }
  sub _initialize_tokenizer ($) {
    my $self = shift;
-   $self->{state} = 'data'; # MUST
+   $self->{state} = DATA_STATE; # MUST
    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
    undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE
    undef $self->{current_attribute};
-Line 217 
 sub _initialize_tokenizer ($) {
+Line 346 
 sub _initialize_tokenizer ($) {
  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
  ##   ->{public_identifier} (DOCTYPE_TOKEN)
  ##   ->{system_identifier} (DOCTYPE_TOKEN)
- ##   ->{correct} == 1 or 0 (DOCTYPE_TOKEN)
+ ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
  ##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
+ ##        ->{name}
+ ##        ->{value}
+ ##        ->{has_reference} == 1 or 0
  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
  ## Emitted token MUST immediately be handled by the tree construction state.
-Line 229 
 sub _initialize_tokenizer ($) {
+Line 361 
 sub _initialize_tokenizer ($) {
  ## has completed loading.  If one has, then it MUST be executed
  ## and removed from the list.
+ ## NOTE: HTML5 "Writing HTML documents" section, applied to
+ ## documents and not to user agents and conformance checkers,
+ ## contains some requirements that are not detected by the
+ ## parsing algorithm:
+ ## - Some requirements on character encoding declarations. ## TODO
+ ## - "Elements MUST NOT contain content that their content model disallows."
+ ##   ... Some are parse error, some are not (will be reported by c.c.).
+ ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
+ ## - Text (in elements, attributes, and comments) SHOULD NOT contain
+ ##   control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL?  Unicode control character?)
+ ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
+ ## be detected by the HTML5 parsing algorithm:
+ ## - Text,
  sub _get_next_token ($) {
    my $self = shift;
    if (@{$self->{token}}) {
-Line 236 
 sub _get_next_token ($) {
+Line 383 
 sub _get_next_token ($) {
    }
    A: {
-     if ($self->{state} eq 'data') {
+     if ($self->{state} == DATA_STATE) {
        if ($self->{next_input_character} == 0x0026) { # &
-         if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA
+         if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
-           $self->{state} = 'entity data';
+             not $self->{escape}) {
+           $self->{state} = ENTITY_DATA_STATE;
            !!!next-input-character;
            redo A;
          } else {
-Line 261 
 sub _get_next_token ($) {
+Line 409 
 sub _get_next_token ($) {
          if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
              (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
               not $self->{escape})) {
-           $self->{state} = 'tag open';
+           $self->{state} = TAG_OPEN_STATE;
            !!!next-input-character;
            redo A;
          } else {
-Line 290 
 sub _get_next_token ($) {
+Line 438 
 sub _get_next_token ($) {
        !!!emit ($token);
        redo A;
-     } elsif ($self->{state} eq 'entity data') {
+     } elsif ($self->{state} == ENTITY_DATA_STATE) {
        ## (cannot happen in CDATA state)
-       my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
+       my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
-       $self->{state} = 'data';
+       $self->{state} = DATA_STATE;
        # next-input-character is already done
        unless (defined $token) {
-Line 305 
 sub _get_next_token ($) {
+Line 453 
 sub _get_next_token ($) {
        }
        redo A;
-     } elsif ($self->{state} eq 'tag open') {
+     } elsif ($self->{state} == TAG_OPEN_STATE) {
        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
          if ($self->{next_input_character} == 0x002F) { # /
            !!!next-input-character;
-           $self->{state} = 'close tag open';
+           $self->{state} = CLOSE_TAG_OPEN_STATE;
            redo A;
          } else {
            ## reconsume
-           $self->{state} = 'data';
+           $self->{state} = DATA_STATE;
            !!!emit ({type => CHARACTER_TOKEN, data => '<'});
-Line 321 
 sub _get_next_token ($) {
+Line 469 
 sub _get_next_token ($) {
          }
        } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
          if ($self->{next_input_character} == 0x0021) { # !
-           $self->{state} = 'markup declaration open';
+           $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
            !!!next-input-character;
            redo A;
          } elsif ($self->{next_input_character} == 0x002F) { # /
-           $self->{state} = 'close tag open';
+           $self->{state} = CLOSE_TAG_OPEN_STATE;
            !!!next-input-character;
            redo A;
          } elsif (0x0041 <= $self->{next_input_character} and
-Line 333 
 sub _get_next_token ($) {
+Line 481 
 sub _get_next_token ($) {
            $self->{current_token}
              = {type => START_TAG_TOKEN,
                 tag_name => chr ($self->{next_input_character} + 0x0020)};
-           $self->{state} = 'tag name';
+           $self->{state} = TAG_NAME_STATE;
            !!!next-input-character;
            redo A;
          } elsif (0x0061 <= $self->{next_input_character} and
                   $self->{next_input_character} <= 0x007A) { # a..z
            $self->{current_token} = {type => START_TAG_TOKEN,
                              tag_name => chr ($self->{next_input_character})};
-           $self->{state} = 'tag name';
+           $self->{state} = TAG_NAME_STATE;
            !!!next-input-character;
            redo A;
          } elsif ($self->{next_input_character} == 0x003E) { # >
            !!!parse-error (type => 'empty start tag');
-           $self->{state} = 'data';
+           $self->{state} = DATA_STATE;
            !!!next-input-character;
            !!!emit ({type => CHARACTER_TOKEN, data => '<>'});
-Line 353 
 sub _get_next_token ($) {
+Line 501 
 sub _get_next_token ($) {
            redo A;
          } elsif ($self->{next_input_character} == 0x003F) { # ?
            !!!parse-error (type => 'pio');
-           $self->{state} = 'bogus comment';
+           $self->{state} = BOGUS_COMMENT_STATE;
            ## $self->{next_input_character} is intentionally left as is
            redo A;
          } else {
            !!!parse-error (type => 'bare stago');
-           $self->{state} = 'data';
+           $self->{state} = DATA_STATE;
            ## reconsume
            !!!emit ({type => CHARACTER_TOKEN, data => '<'});
-Line 368 
 sub _get_next_token ($) {
+Line 516 
 sub _get_next_token ($) {
        } else {
          die "$0: $self->{content_model} in tag open";
        }
-     } elsif ($self->{state} eq 'close tag open') {
+     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
          if (defined $self->{last_emitted_start_tag_name}) {
            ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
-Line 383 
 sub _get_next_token ($) {
+Line 531 
 sub _get_next_token ($) {
              } else {
                $self->{next_input_character} = shift @next_char; # reconsume
                !!!back-next-input-character (@next_char);
-               $self->{state} = 'data';
+               $self->{state} = DATA_STATE;
                !!!emit ({type => CHARACTER_TOKEN, data => '</'});
-Line 402 
 sub _get_next_token ($) {
+Line 550 
 sub _get_next_token ($) {
                    $self->{next_input_character} == -1) {
              $self->{next_input_character} = shift @next_char; # reconsume
              !!!back-next-input-character (@next_char);
-             $self->{state} = 'data';
+             $self->{state} = DATA_STATE;
              !!!emit ({type => CHARACTER_TOKEN, data => '</'});
              redo A;
            } else {
-Line 413 
 sub _get_next_token ($) {
+Line 561 
 sub _get_next_token ($) {
          } else {
            ## No start tag token has ever been emitted
            # next-input-character is already done
-           $self->{state} = 'data';
+           $self->{state} = DATA_STATE;
            !!!emit ({type => CHARACTER_TOKEN, data => '</'});
            redo A;
          }
-Line 423 
 sub _get_next_token ($) {
+Line 571 
 sub _get_next_token ($) {
            $self->{next_input_character} <= 0x005A) { # A..Z
          $self->{current_token} = {type => END_TAG_TOKEN,
                            tag_name => chr ($self->{next_input_character} + 0x0020)};
-         $self->{state} = 'tag name';
+         $self->{state} = TAG_NAME_STATE;
          !!!next-input-character;
          redo A;
        } elsif (0x0061 <= $self->{next_input_character} and
                 $self->{next_input_character} <= 0x007A) { # a..z
          $self->{current_token} = {type => END_TAG_TOKEN,
                            tag_name => chr ($self->{next_input_character})};
-         $self->{state} = 'tag name';
+         $self->{state} = TAG_NAME_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          !!!parse-error (type => 'empty end tag');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'bare etago');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          # reconsume
          !!!emit ({type => CHARACTER_TOKEN, data => '</'});
-Line 448 
 sub _get_next_token ($) {
+Line 596 
 sub _get_next_token ($) {
          redo A;
        } else {
          !!!parse-error (type => 'bogus end tag');
-         $self->{state} = 'bogus comment';
+         $self->{state} = BOGUS_COMMENT_STATE;
          ## $self->{next_input_character} is intentionally left as is
          redo A;
        }
-     } elsif ($self->{state} eq 'tag name') {
+     } elsif ($self->{state} == TAG_NAME_STATE) {
        if ($self->{next_input_character} == 0x0009 or # HT
            $self->{next_input_character} == 0x000A or # LF
            $self->{next_input_character} == 0x000B or # VT
            $self->{next_input_character} == 0x000C or # FF
            $self->{next_input_character} == 0x0020) { # SP
-         $self->{state} = 'before attribute name';
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
-Line 474 
 sub _get_next_token ($) {
+Line 622 
 sub _get_next_token ($) {
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-Line 501 
 sub _get_next_token ($) {
+Line 649 
 sub _get_next_token ($) {
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-Line 517 
 sub _get_next_token ($) {
+Line 665 
 sub _get_next_token ($) {
          } else {
            !!!parse-error (type => 'nestc');
          }
-         $self->{state} = 'before attribute name';
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
          # next-input-character is already done
          redo A;
        } else {
-Line 527 
 sub _get_next_token ($) {
+Line 675 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'before attribute name') {
+     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
        if ($self->{next_input_character} == 0x0009 or # HT
            $self->{next_input_character} == 0x000A or # LF
            $self->{next_input_character} == 0x000B or # VT
-Line 549 
 sub _get_next_token ($) {
+Line 697 
 sub _get_next_token ($) {
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-Line 559 
 sub _get_next_token ($) {
+Line 707 
 sub _get_next_token ($) {
                 $self->{next_input_character} <= 0x005A) { # A..Z
          $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
                                value => ''};
-         $self->{state} = 'attribute name';
+         $self->{state} = ATTRIBUTE_NAME_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x002F) { # /
-Line 589 
 sub _get_next_token ($) {
+Line 737 
 sub _get_next_token ($) {
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
        } else {
+         if ({
+x0022 => 1, # "
+x0027 => 1, # '
+x003D => 1, # =
+             }->{$self->{next_input_character}}) {
+           !!!parse-error (type => 'bad attribute name');
+         }
          $self->{current_attribute} = {name => chr ($self->{next_input_character}),
                                value => ''};
-         $self->{state} = 'attribute name';
+         $self->{state} = ATTRIBUTE_NAME_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'attribute name') {
+     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
        my $before_leave = sub {
          if (exists $self->{current_token}->{attributes} # start tag or end tag
              ->{$self->{current_attribute}->{name}}) { # MUST
-Line 620 
 sub _get_next_token ($) {
+Line 775 
 sub _get_next_token ($) {
            $self->{next_input_character} == 0x000C or # FF
            $self->{next_input_character} == 0x0020) { # SP
          $before_leave->();
-         $self->{state} = 'after attribute name';
+         $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003D) { # =
          $before_leave->();
-         $self->{state} = 'before attribute value';
+         $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
-Line 642 
 sub _get_next_token ($) {
+Line 797 
 sub _get_next_token ($) {
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-Line 665 
 sub _get_next_token ($) {
+Line 820 
 sub _get_next_token ($) {
          } else {
            !!!parse-error (type => 'nestc');
          }
-         $self->{state} = 'before attribute name';
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
          # next-input-character is already done
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 683 
 sub _get_next_token ($) {
+Line 838 
 sub _get_next_token ($) {
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
        } else {
+         if ($self->{next_input_character} == 0x0022 or # "
+             $self->{next_input_character} == 0x0027) { # '
+           !!!parse-error (type => 'bad attribute name');
+         }
          $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
          ## Stay in the state
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'after attribute name') {
+     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
        if ($self->{next_input_character} == 0x0009 or # HT
            $self->{next_input_character} == 0x000A or # LF
            $self->{next_input_character} == 0x000B or # VT
-Line 705 
 sub _get_next_token ($) {
+Line 864 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003D) { # =
-         $self->{state} = 'before attribute value';
+         $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
-Line 721 
 sub _get_next_token ($) {
+Line 880 
 sub _get_next_token ($) {
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-Line 731 
 sub _get_next_token ($) {
+Line 890 
 sub _get_next_token ($) {
                 $self->{next_input_character} <= 0x005A) { # A..Z
          $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
                                value => ''};
-         $self->{state} = 'attribute name';
+         $self->{state} = ATTRIBUTE_NAME_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x002F) { # /
-Line 745 
 sub _get_next_token ($) {
+Line 904 
 sub _get_next_token ($) {
            !!!parse-error (type => 'nestc');
            ## TODO: Different error type for <aa / bb> than <aa/>
          }
-         $self->{state} = 'before attribute name';
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
          # next-input-character is already done
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 762 
 sub _get_next_token ($) {
+Line 921 
 sub _get_next_token ($) {
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-Line 771 
 sub _get_next_token ($) {
+Line 930 
 sub _get_next_token ($) {
        } else {
          $self->{current_attribute} = {name => chr ($self->{next_input_character}),
                                value => ''};
-         $self->{state} = 'attribute name';
+         $self->{state} = ATTRIBUTE_NAME_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'before attribute value') {
+     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
        if ($self->{next_input_character} == 0x0009 or # HT
            $self->{next_input_character} == 0x000A or # LF
            $self->{next_input_character} == 0x000B or # VT
-Line 785 
 sub _get_next_token ($) {
+Line 944 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x0022) { # "
-         $self->{state} = 'attribute value (double-quoted)';
+         $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x0026) { # &
-         $self->{state} = 'attribute value (unquoted)';
+         $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
          ## reconsume
          redo A;
        } elsif ($self->{next_input_character} == 0x0027) { # '
-         $self->{state} = 'attribute value (single-quoted)';
+         $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
-Line 809 
 sub _get_next_token ($) {
+Line 968 
 sub _get_next_token ($) {
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-Line 829 
 sub _get_next_token ($) {
+Line 988 
 sub _get_next_token ($) {
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
        } else {
+         if ($self->{next_input_character} == 0x003D) { # =
+           !!!parse-error (type => 'bad attribute value');
+         }
          $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
-         $self->{state} = 'attribute value (unquoted)';
+         $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'attribute value (double-quoted)') {
+     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
        if ($self->{next_input_character} == 0x0022) { # "
-         $self->{state} = 'before attribute name';
+         $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x0026) { # &
-         $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
+         $self->{last_attribute_value_state} = $self->{state};
-         $self->{state} = 'entity in attribute value';
+         $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 865 
 sub _get_next_token ($) {
+Line 1027 
 sub _get_next_token ($) {
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-Line 877 
 sub _get_next_token ($) {
+Line 1039 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'attribute value (single-quoted)') {
+     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
        if ($self->{next_input_character} == 0x0027) { # '
-         $self->{state} = 'before attribute name';
+         $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x0026) { # &
-         $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
+         $self->{last_attribute_value_state} = $self->{state};
-         $self->{state} = 'entity in attribute value';
+         $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == -1) {
-Line 901 
 sub _get_next_token ($) {
+Line 1063 
 sub _get_next_token ($) {
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
-Line 913 
 sub _get_next_token ($) {
+Line 1075 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'attribute value (unquoted)') {
+     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
        if ($self->{next_input_character} == 0x0009 or # HT
            $self->{next_input_character} == 0x000A or # LF
           $self->{next_input_character} == 0x000B or # VT
            $self->{next_input_character} == 0x000C or # FF
            $self->{next_input_character} == 0x0020) { # SP
-         $self->{state} = 'before attribute name';
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x0026) { # &
-         $self->{last_attribute_value_state} = 'attribute value (unquoted)';
+         $self->{last_attribute_value_state} = $self->{state};
-         $self->{state} = 'entity in attribute value';
+         $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
-Line 940 
 sub _get_next_token ($) {
+Line 1102 
 sub _get_next_token ($) {
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
-Line 960 
 sub _get_next_token ($) {
+Line 1122 
 sub _get_next_token ($) {
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
        } else {
+         if ({
+x0022 => 1, # "
+x0027 => 1, # '
+x003D => 1, # =
+             }->{$self->{next_input_character}}) {
+           !!!parse-error (type => 'bad attribute value');
+         }
          $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
          ## Stay in the state
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'entity in attribute value') {
+     } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
-       my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
+       my $token = $self->_tokenize_attempt_to_consume_an_entity
+           (1,
+            $self->{last_attribute_value_state}
+              == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
+            $self->{last_attribute_value_state}
+              == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
+            -1);
        unless (defined $token) {
          $self->{current_attribute}->{value} .= '&';
        } else {
          $self->{current_attribute}->{value} .= $token->{data};
+         $self->{current_attribute}->{has_reference} = $token->{has_reference};
          ## ISSUE: spec says "append the returned character token to the current attribute's value"
        }
        $self->{state} = $self->{last_attribute_value_state};
        # next-input-character is already done
        redo A;
-     } elsif ($self->{state} eq 'bogus comment') {
+     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
+       if ($self->{next_input_character} == 0x0009 or # HT
+           $self->{next_input_character} == 0x000A or # LF
+           $self->{next_input_character} == 0x000B or # VT
+           $self->{next_input_character} == 0x000C or # FF
+           $self->{next_input_character} == 0x0020) { # SP
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
+           $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
+         } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
+           $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
+           if ($self->{current_token}->{attributes}) {
+             !!!parse-error (type => 'end tag attribute');
+           }
+         } else {
+           die "$0: $self->{current_token}->{type}: Unknown token type";
+         }
+         $self->{state} = DATA_STATE;
+         !!!next-input-character;
+         !!!emit ($self->{current_token}); # start tag or end tag
+         redo A;
+       } elsif ($self->{next_input_character} == 0x002F) { # /
+         !!!next-input-character;
+         if ($self->{next_input_character} == 0x003E and # >
+             $self->{current_token}->{type} == START_TAG_TOKEN and
+             $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
+           # permitted slash
+           #
+         } else {
+           !!!parse-error (type => 'nestc');
+         }
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
+         # next-input-character is already done
+         redo A;
+       } else {
+         !!!parse-error (type => 'no space between attributes');
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
+         ## reconsume
+         redo A;
+       }
+     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
        ## (only happen if PCDATA state)
        my $token = {type => COMMENT_TOKEN, data => ''};
        BC: {
          if ($self->{next_input_character} == 0x003E) { # >
-           $self->{state} = 'data';
+           $self->{state} = DATA_STATE;
            !!!next-input-character;
            !!!emit ($token);
            redo A;
          } elsif ($self->{next_input_character} == -1) {
-           $self->{state} = 'data';
+           $self->{state} = DATA_STATE;
            ## reconsume
            !!!emit ($token);
-Line 1011 
 sub _get_next_token ($) {
+Line 1234 
 sub _get_next_token ($) {
            redo BC;
          }
        } # BC
-     } elsif ($self->{state} eq 'markup declaration open') {
+     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
        ## (only happen if PCDATA state)
        my @next_char;
-Line 1022 
 sub _get_next_token ($) {
+Line 1245 
 sub _get_next_token ($) {
          push @next_char, $self->{next_input_character};
          if ($self->{next_input_character} == 0x002D) { # -
            $self->{current_token} = {type => COMMENT_TOKEN, data => ''};
-           $self->{state} = 'comment start';
+           $self->{state} = COMMENT_START_STATE;
            !!!next-input-character;
            redo A;
          }
-Line 1053 
 sub _get_next_token ($) {
+Line 1276 
 sub _get_next_token ($) {
                    if ($self->{next_input_character} == 0x0045 or # E
                        $self->{next_input_character} == 0x0065) { # e
                      ## ISSUE: What a stupid code this is!
-                     $self->{state} = 'DOCTYPE';
+                     $self->{state} = DOCTYPE_STATE;
                      !!!next-input-character;
                      redo A;
                    }
-Line 1067 
 sub _get_next_token ($) {
+Line 1290 
 sub _get_next_token ($) {
        !!!parse-error (type => 'bogus comment');
        $self->{next_input_character} = shift @next_char;
        !!!back-next-input-character (@next_char);
-       $self->{state} = 'bogus comment';
+       $self->{state} = BOGUS_COMMENT_STATE;
        redo A;
        ## ISSUE: typos in spec: chacacters, is is a parse error
        ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
-     } elsif ($self->{state} eq 'comment start') {
+     } elsif ($self->{state} == COMMENT_START_STATE) {
        if ($self->{next_input_character} == 0x002D) { # -
-         $self->{state} = 'comment start dash';
+         $self->{state} = COMMENT_START_DASH_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          !!!parse-error (type => 'bogus comment');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # comment
-Line 1087 
 sub _get_next_token ($) {
+Line 1310 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed comment');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # comment
-Line 1096 
 sub _get_next_token ($) {
+Line 1319 
 sub _get_next_token ($) {
        } else {
          $self->{current_token}->{data} # comment
              .= chr ($self->{next_input_character});
-         $self->{state} = 'comment';
+         $self->{state} = COMMENT_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'comment start dash') {
+     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
        if ($self->{next_input_character} == 0x002D) { # -
-         $self->{state} = 'comment end';
+         $self->{state} = COMMENT_END_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          !!!parse-error (type => 'bogus comment');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # comment
-Line 1115 
 sub _get_next_token ($) {
+Line 1338 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed comment');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # comment
-Line 1124 
 sub _get_next_token ($) {
+Line 1347 
 sub _get_next_token ($) {
        } else {
          $self->{current_token}->{data} # comment
              .= '-' . chr ($self->{next_input_character});
-         $self->{state} = 'comment';
+         $self->{state} = COMMENT_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'comment') {
+     } elsif ($self->{state} == COMMENT_STATE) {
        if ($self->{next_input_character} == 0x002D) { # -
-         $self->{state} = 'comment end dash';
+         $self->{state} = COMMENT_END_DASH_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed comment');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # comment
-Line 1147 
 sub _get_next_token ($) {
+Line 1370 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'comment end dash') {
+     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
        if ($self->{next_input_character} == 0x002D) { # -
-         $self->{state} = 'comment end';
+         $self->{state} = COMMENT_END_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed comment');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # comment
-Line 1162 
 sub _get_next_token ($) {
+Line 1385 
 sub _get_next_token ($) {
          redo A;
        } else {
          $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
-         $self->{state} = 'comment';
+         $self->{state} = COMMENT_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'comment end') {
+     } elsif ($self->{state} == COMMENT_END_STATE) {
        if ($self->{next_input_character} == 0x003E) { # >
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # comment
-Line 1182 
 sub _get_next_token ($) {
+Line 1405 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed comment');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # comment
-Line 1191 
 sub _get_next_token ($) {
+Line 1414 
 sub _get_next_token ($) {
        } else {
          !!!parse-error (type => 'dash in comment');
          $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
-         $self->{state} = 'comment';
+         $self->{state} = COMMENT_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'DOCTYPE') {
+     } elsif ($self->{state} == DOCTYPE_STATE) {
        if ($self->{next_input_character} == 0x0009 or # HT
            $self->{next_input_character} == 0x000A or # LF
            $self->{next_input_character} == 0x000B or # VT
            $self->{next_input_character} == 0x000C or # FF
            $self->{next_input_character} == 0x0020) { # SP
-         $self->{state} = 'before DOCTYPE name';
+         $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
          !!!next-input-character;
          redo A;
        } else {
          !!!parse-error (type => 'no space before DOCTYPE name');
-         $self->{state} = 'before DOCTYPE name';
+         $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
          ## reconsume
          redo A;
        }
-     } elsif ($self->{state} eq 'before DOCTYPE name') {
+     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
        if ($self->{next_input_character} == 0x0009 or # HT
            $self->{next_input_character} == 0x000A or # LF
            $self->{next_input_character} == 0x000B or # VT
-Line 1221 
 sub _get_next_token ($) {
+Line 1444 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          !!!parse-error (type => 'no DOCTYPE name');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
-         !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
+         !!!emit ({type => DOCTYPE_TOKEN, quirks => 1});
          redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'no DOCTYPE name');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
+         !!!emit ({type => DOCTYPE_TOKEN, quirks => 1});
          redo A;
        } else {
          $self->{current_token}
              = {type => DOCTYPE_TOKEN,
                 name => chr ($self->{next_input_character}),
-                correct => 1};
+                #quirks => 0,
+               };
  ## ISSUE: "Set the token's name name to the" in the spec
-         $self->{state} = 'DOCTYPE name';
+         $self->{state} = DOCTYPE_NAME_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'DOCTYPE name') {
+     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
  ## ISSUE: Redundant "First," in the spec.
        if ($self->{next_input_character} == 0x0009 or # HT
            $self->{next_input_character} == 0x000A or # LF
            $self->{next_input_character} == 0x000B or # VT
            $self->{next_input_character} == 0x000C or # FF
            $self->{next_input_character} == 0x0020) { # SP
-         $self->{state} = 'after DOCTYPE name';
+         $self->{state} = AFTER_DOCTYPE_NAME_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # DOCTYPE
-Line 1264 
 sub _get_next_token ($) {
+Line 1488 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1278 
 sub _get_next_token ($) {
+Line 1502 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'after DOCTYPE name') {
+     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
        if ($self->{next_input_character} == 0x0009 or # HT
            $self->{next_input_character} == 0x000A or # LF
            $self->{next_input_character} == 0x000B or # VT
-Line 1288 
 sub _get_next_token ($) {
+Line 1512 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # DOCTYPE
-Line 1296 
 sub _get_next_token ($) {
+Line 1520 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1320 
 sub _get_next_token ($) {
+Line 1544 
 sub _get_next_token ($) {
                  !!!next-input-character;
                  if ($self->{next_input_character} == 0x0043 or # C
                      $self->{next_input_character} == 0x0063) { # c
-                   $self->{state} = 'before DOCTYPE public identifier';
+                   $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
                    !!!next-input-character;
                    redo A;
                  }
-Line 1347 
 sub _get_next_token ($) {
+Line 1571 
 sub _get_next_token ($) {
                  !!!next-input-character;
                  if ($self->{next_input_character} == 0x004D or # M
                      $self->{next_input_character} == 0x006D) { # m
-                   $self->{state} = 'before DOCTYPE system identifier';
+                   $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
                    !!!next-input-character;
                    redo A;
                  }
-Line 1363 
 sub _get_next_token ($) {
+Line 1587 
 sub _get_next_token ($) {
        }
        !!!parse-error (type => 'string after DOCTYPE name');
-       $self->{state} = 'bogus DOCTYPE';
+       $self->{current_token}->{quirks} = 1;
+       $self->{state} = BOGUS_DOCTYPE_STATE;
        # next-input-character is already done
        redo A;
-     } elsif ($self->{state} eq 'before DOCTYPE public identifier') {
+     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
        if ({
 x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
              #0x000D => 1, # HT, LF, VT, FF, SP, CR
-Line 1376 
 sub _get_next_token ($) {
+Line 1602 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} eq 0x0022) { # "
          $self->{current_token}->{public_identifier} = ''; # DOCTYPE
-         $self->{state} = 'DOCTYPE public identifier (double-quoted)';
+         $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} eq 0x0027) { # '
          $self->{current_token}->{public_identifier} = ''; # DOCTYPE
-         $self->{state} = 'DOCTYPE public identifier (single-quoted)';
+         $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} eq 0x003E) { # >
          !!!parse-error (type => 'no PUBLIC literal');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
          !!!parse-error (type => 'string after PUBLIC');
-         $self->{state} = 'bogus DOCTYPE';
+         $self->{current_token}->{quirks} = 1;
+         $self->{state} = BOGUS_DOCTYPE_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'DOCTYPE public identifier (double-quoted)') {
+     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
        if ($self->{next_input_character} == 0x0022) { # "
-         $self->{state} = 'after DOCTYPE public identifier';
+         $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         !!!parse-error (type => 'unclosed PUBLIC literal');
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
+         $self->{current_token}->{quirks} = 1;
+         !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed PUBLIC literal');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1432 
 sub _get_next_token ($) {
+Line 1670 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'DOCTYPE public identifier (single-quoted)') {
+     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
        if ($self->{next_input_character} == 0x0027) { # '
-         $self->{state} = 'after DOCTYPE public identifier';
+         $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
          !!!next-input-character;
          redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         !!!parse-error (type => 'unclosed PUBLIC literal');
+         $self->{state} = DATA_STATE;
+         !!!next-input-character;
+         $self->{current_token}->{quirks} = 1;
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed PUBLIC literal');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1454 
 sub _get_next_token ($) {
+Line 1702 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'after DOCTYPE public identifier') {
+     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
        if ({
 x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
              #0x000D => 1, # HT, LF, VT, FF, SP, CR
-Line 1464 
 sub _get_next_token ($) {
+Line 1712 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == 0x0022) { # "
          $self->{current_token}->{system_identifier} = ''; # DOCTYPE
-         $self->{state} = 'DOCTYPE system identifier (double-quoted)';
+         $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x0027) { # '
          $self->{current_token}->{system_identifier} = ''; # DOCTYPE
-         $self->{state} = 'DOCTYPE system identifier (single-quoted)';
+         $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # DOCTYPE
-Line 1482 
 sub _get_next_token ($) {
+Line 1730 
 sub _get_next_token ($) {
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
          !!!parse-error (type => 'string after PUBLIC literal');
-         $self->{state} = 'bogus DOCTYPE';
+         $self->{current_token}->{quirks} = 1;
+         $self->{state} = BOGUS_DOCTYPE_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'before DOCTYPE system identifier') {
+     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
        if ({
 x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
              #0x000D => 1, # HT, LF, VT, FF, SP, CR
-Line 1505 
 sub _get_next_token ($) {
+Line 1755 
 sub _get_next_token ($) {
          redo A;
        } elsif ($self->{next_input_character} == 0x0022) { # "
          $self->{current_token}->{system_identifier} = ''; # DOCTYPE
-         $self->{state} = 'DOCTYPE system identifier (double-quoted)';
+         $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x0027) { # '
          $self->{current_token}->{system_identifier} = ''; # DOCTYPE
-         $self->{state} = 'DOCTYPE system identifier (single-quoted)';
+         $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
          !!!parse-error (type => 'no SYSTEM literal');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
          !!!parse-error (type => 'string after SYSTEM');
-         $self->{state} = 'bogus DOCTYPE';
+         $self->{current_token}->{quirks} = 1;
+         $self->{state} = BOGUS_DOCTYPE_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'DOCTYPE system identifier (double-quoted)') {
+     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
        if ($self->{next_input_character} == 0x0022) { # "
-         $self->{state} = 'after DOCTYPE system identifier';
+         $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
          !!!next-input-character;
          redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         !!!parse-error (type => 'unclosed PUBLIC literal');
+         $self->{state} = DATA_STATE;
+         !!!next-input-character;
+         $self->{current_token}->{quirks} = 1;
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed SYSTEM literal');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1560 
 sub _get_next_token ($) {
+Line 1822 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'DOCTYPE system identifier (single-quoted)') {
+     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
        if ($self->{next_input_character} == 0x0027) { # '
-         $self->{state} = 'after DOCTYPE system identifier';
+         $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_input_character} == 0x003E) { # >
+         !!!parse-error (type => 'unclosed PUBLIC literal');
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
+         $self->{current_token}->{quirks} = 1;
+         !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed SYSTEM literal');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1582 
 sub _get_next_token ($) {
+Line 1854 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'after DOCTYPE system identifier') {
+     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
        if ({
 x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
              #0x000D => 1, # HT, LF, VT, FF, SP, CR
-Line 1591 
 sub _get_next_token ($) {
+Line 1863 
 sub _get_next_token ($) {
          !!!next-input-character;
          redo A;
        } elsif ($self->{next_input_character} == 0x003E) { # >
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # DOCTYPE
-Line 1600 
 sub _get_next_token ($) {
+Line 1872 
 sub _get_next_token ($) {
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
          !!!parse-error (type => 'string after SYSTEM literal');
-         $self->{state} = 'bogus DOCTYPE';
+         #$self->{current_token}->{quirks} = 1;
+         $self->{state} = BOGUS_DOCTYPE_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'bogus DOCTYPE') {
+     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
        if ($self->{next_input_character} == 0x003E) { # >
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
-         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } elsif ($self->{next_input_character} == -1) {
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-Line 1644 
 sub _get_next_token ($) {
+Line 1916 
 sub _get_next_token ($) {
    die "$0: _get_next_token: unexpected case";
  } # _get_next_token
- sub _tokenize_attempt_to_consume_an_entity ($$) {
+ sub _tokenize_attempt_to_consume_an_entity ($$$) {
-   my ($self, $in_attr) = @_;
+   my ($self, $in_attr, $additional) = @_;
    if ({
 x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
 x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
+        $additional => 1,
        }->{$self->{next_input_character}}) {
      ## Don't consume
      ## No error
-Line 1705 
 sub _tokenize_attempt_to_consume_an_enti
+Line 1978 
 sub _tokenize_attempt_to_consume_an_enti
            $code = $c1_entity_char->{$code};
          }
-         return {type => CHARACTER_TOKEN, data => chr $code};
+         return {type => CHARACTER_TOKEN, data => chr $code,
+                 has_reference => 1};
        } # X
      } elsif (0x0030 <= $self->{next_input_character} and
               $self->{next_input_character} <= 0x0039) { # 0..9
-Line 1740 
 sub _tokenize_attempt_to_consume_an_enti
+Line 2014 
 sub _tokenize_attempt_to_consume_an_enti
          $code = $c1_entity_char->{$code};
        }
-       return {type => CHARACTER_TOKEN, data => chr $code};
+       return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1};
      } else {
        !!!parse-error (type => 'bare nero');
        !!!back-next-input-character ($self->{next_input_character});
-Line 1788 
 sub _tokenize_attempt_to_consume_an_enti
+Line 2062 
 sub _tokenize_attempt_to_consume_an_enti
      }
      if ($match > 0) {
-       return {type => CHARACTER_TOKEN, data => $value};
+       return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
      } elsif ($match < 0) {
        !!!parse-error (type => 'no refc');
        if ($in_attr and $match < -1) {
          return {type => CHARACTER_TOKEN, data => '&'.$entity_name};
        } else {
-         return {type => CHARACTER_TOKEN, data => $value};
+         return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
        }
      } else {
        !!!parse-error (type => 'bare ero');
-       ## NOTE: No characters are consumed in the spec.
+       ## NOTE: "No characters are consumed" in the spec.
        return {type => CHARACTER_TOKEN, data => '&'.$value};
      }
    } else {
-Line 1881 
 sub _tree_construction_initial ($) {
+Line 2155 
 sub _tree_construction_initial ($) {
        ## ISSUE: internalSubset = null??
        $self->{document}->append_child ($doctype);
-       if (not $token->{correct} or $doctype_name ne 'HTML') {
+       if ($token->{quirks} or $doctype_name ne 'HTML') {
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif (defined $token->{public_identifier}) {
          my $pubid = $token->{public_identifier};
-Line 1935 
 sub _tree_construction_initial ($) {
+Line 2209 
 sub _tree_construction_initial ($) {
            "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
            "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
            "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
+           "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,
+           "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,
+           "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,
            "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
            "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
            "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
-Line 2046 
 sub _tree_construction_root_element ($)
+Line 2323 
 sub _tree_construction_root_element ($)
              redo B;
            }
          }
+         $self->{application_cache_selection}->(undef);
+         #
+       } elsif ($token->{type} == START_TAG_TOKEN) {
+         if ($token->{tag_name} eq 'html' and
+             $token->{attributes}->{manifest}) {
+           $self->{application_cache_selection}
+                ->($token->{attributes}->{manifest}->{value});
+           ## ISSUE: No relative reference resolution?
+         } else {
+           $self->{application_cache_selection}->(undef);
+         }
+         ## ISSUE: There is an issue in the spec
          #
        } elsif ({
-                 START_TAG_TOKEN, 1,
                  END_TAG_TOKEN, 1,
                  END_OF_FILE_TOKEN, 1,
                 }->{$token->{type}}) {
+         $self->{application_cache_selection}->(undef);
          ## ISSUE: There is an issue in the spec
          #
        } else {
          die "$0: $token->{type}: Unknown token type";
        }
        my $root_element; !!!create-element ($root_element, 'html');
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}}, [$root_element, 'html'];
-Line 2713 
 sub _tree_construction_main ($) {
+Line 3007 
 sub _tree_construction_main ($) {
                  push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
                }
                !!!insert-element ($token->{tag_name}, $token->{attributes});
-               pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
+               my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
                unless ($self->{confident}) {
-                 my $charset;
                  if ($token->{attributes}->{charset}) { ## TODO: And if supported
-                   $charset = $token->{attributes}->{charset}->{value};
+                   $self->{change_encoding}
-                 }
+                       ->($self, $token->{attributes}->{charset}->{value});
-                 if ($token->{attributes}->{'http-equiv'}) {
+                   $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
+                       ->set_user_data (manakai_has_reference =>
+                                            $token->{attributes}->{charset}
+                                                ->{has_reference});
+                 } elsif ($token->{attributes}->{content}) {
                    ## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to the definition.
-                   if ($token->{attributes}->{'http-equiv'}->{value}
+                   if ($token->{attributes}->{content}->{value}
-                       =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
+                       =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
+                           [\x09-\x0D\x20]*=
                            [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
                            ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
-                     $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
+                     $self->{change_encoding}
-                   } ## TODO: And if supported
+                         ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
+                     $meta_el->[0]->get_attribute_node_ns (undef, 'content')
+                         ->set_user_data (manakai_has_reference =>
+                                              $token->{attributes}->{content}
+                                                    ->{has_reference});
+                   }
+                 }
+               } else {
+                 if ($token->{attributes}->{charset}) {
+                   $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
+                       ->set_user_data (manakai_has_reference =>
+                                            $token->{attributes}->{charset}
+                                                ->{has_reference});
+                 }
+                 if ($token->{attributes}->{content}) {
+                   $meta_el->[0]->get_attribute_node_ns (undef, 'content')
+                       ->set_user_data (manakai_has_reference =>
+                                            $token->{attributes}->{content}
+                                                ->{has_reference});
                  }
-                 ## TODO: Change the encoding
                }
-               ## TODO: Extracting |charset| from |meta|.
                pop @{$self->{open_elements}}
                    if $self->{insertion_mode} == AFTER_HEAD_IM;
                !!!next-token;
-Line 3302 
 sub _tree_construction_main ($) {
+Line 3617 
 sub _tree_construction_main ($) {
        $insert = $insert_to_current;
        #
      } elsif ($self->{insertion_mode} & TABLE_IMS) {
-           if ($token->{type} == CHARACTER_TOKEN) {
+       if ($token->{type} == CHARACTER_TOKEN) {
-             ## NOTE: There are "character in table" code clones.
              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
                $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
-Line 3360 
 sub _tree_construction_main ($) {
+Line 3674 
 sub _tree_construction_main ($) {
              !!!next-token;
              redo B;
-           } elsif ($token->{type} == START_TAG_TOKEN) {
+       } elsif ($token->{type} == START_TAG_TOKEN) {
              if ({
                   tr => ($self->{insertion_mode} != IN_ROW_IM),
                   th => 1, td => 1,
-Line 3546 
 sub _tree_construction_main ($) {
+Line 3860 
 sub _tree_construction_main ($) {
                  die "$0: in table: <>: $token->{tag_name}";
                }
              } elsif ($token->{tag_name} eq 'table') {
-               ## NOTE: There are code clones for this "table in table"
                !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
                ## As if </table>
-Line 3594 
 sub _tree_construction_main ($) {
+Line 3907 
 sub _tree_construction_main ($) {
                ## reprocess
                redo B;
-             } else {
+         } else {
-               #
+           !!!parse-error (type => 'in table:'.$token->{tag_name});
-             }
-           } elsif ($token->{type} == END_TAG_TOKEN) {
+           $insert = $insert_to_foster;
+           #
+         }
+       } elsif ($token->{type} == END_TAG_TOKEN) {
              if ($token->{tag_name} eq 'tr' and
                  $self->{insertion_mode} == IN_ROW_IM) {
                ## have an element in table scope
-Line 3854 
 sub _tree_construction_main ($) {
+Line 4170 
 sub _tree_construction_main ($) {
                ## Ignore the token
                !!!next-token;
                redo B;
-             } else {
+         } else {
-               #
+           !!!parse-error (type => 'in table:/'.$token->{tag_name});
-             }
-           } else {
-             die "$0: $token->{type}: Unknown token type";
-           }
-       !!!parse-error (type => 'in table:'.$token->{tag_name});
-       $insert = $insert_to_foster;
+           $insert = $insert_to_foster;
-       #
+           #
+         }
+       } else {
+         die "$0: $token->{type}: Unknown token type";
+       }
      } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
            if ($token->{type} == CHARACTER_TOKEN) {
              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
-Line 3923 
 sub _tree_construction_main ($) {
+Line 4237 
 sub _tree_construction_main ($) {
              redo B;
            }
      } elsif ($self->{insertion_mode} == IN_SELECT_IM) {
-           if ($token->{type} == CHARACTER_TOKEN) {
+       if ($token->{type} == CHARACTER_TOKEN) {
-             $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
+         $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
-             !!!next-token;
+         !!!next-token;
-             redo B;
+         redo B;
-           } elsif ($token->{type} == START_TAG_TOKEN) {
+       } elsif ($token->{type} == START_TAG_TOKEN) {
              if ($token->{tag_name} eq 'option') {
                if ($self->{open_elements}->[-1]->[1] eq 'option') {
                  ## As if </option>
-Line 3980 
 sub _tree_construction_main ($) {
+Line 4294 
 sub _tree_construction_main ($) {
                !!!next-token;
                redo B;
-             } else {
+         } else {
-               #
+           !!!parse-error (type => 'in select:'.$token->{tag_name});
-             }
+           ## Ignore the token
-           } elsif ($token->{type} == END_TAG_TOKEN) {
+           !!!next-token;
+           redo B;
+         }
+       } elsif ($token->{type} == END_TAG_TOKEN) {
              if ($token->{tag_name} eq 'optgroup') {
                if ($self->{open_elements}->[-1]->[1] eq 'option' and
                    $self->{open_elements}->[-2]->[1] eq 'optgroup') {
-Line 4085 
 sub _tree_construction_main ($) {
+Line 4402 
 sub _tree_construction_main ($) {
                ## reprocess
                redo B;
-             } else {
+         } else {
-               #
+           !!!parse-error (type => 'in select:/'.$token->{tag_name});
-             }
-           } else {
-             #
-           }
-           !!!parse-error (type => 'in select:'.$token->{tag_name});
            ## Ignore the token
            !!!next-token;
            redo B;
+         }
+       } else {
+         die "$0: $token->{type}: Unknown token type";
+       }
      } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
        if ($token->{type} == CHARACTER_TOKEN) {
          if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
-Line 4303 
 sub _tree_construction_main ($) {
+Line 4618 
 sub _tree_construction_main ($) {
        } elsif ($token->{tag_name} eq 'meta') {
          ## NOTE: This is an "as if in head" code clone, only "-t" differs
          !!!insert-element-t ($token->{tag_name}, $token->{attributes});
-         pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
+         my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
          unless ($self->{confident}) {
-           my $charset;
            if ($token->{attributes}->{charset}) { ## TODO: And if supported
-             $charset = $token->{attributes}->{charset}->{value};
+             $self->{change_encoding}
-           }
+                 ->($self, $token->{attributes}->{charset}->{value});
-           if ($token->{attributes}->{'http-equiv'}) {
+             $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
+                 ->set_user_data (manakai_has_reference =>
+                                      $token->{attributes}->{charset}
+                                          ->{has_reference});
+           } elsif ($token->{attributes}->{content}) {
              ## ISSUE: The algorithm name in the spec was incorrect, so it was not linked to the definition.
-             if ($token->{attributes}->{'http-equiv'}->{value}
+             if ($token->{attributes}->{content}->{value}
-                 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
+                 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
+                     [\x09-\x0D\x20]*=
                      [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
                      ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
-               $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
+               $self->{change_encoding}
-             } ## TODO: And if supported
+                   ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
+               $meta_el->[0]->get_attribute_node_ns (undef, 'content')
+                   ->set_user_data (manakai_has_reference =>
+                                        $token->{attributes}->{content}
+                                              ->{has_reference});
+             }
+           }
+         } else {
+           if ($token->{attributes}->{charset}) {
+             $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
+                 ->set_user_data (manakai_has_reference =>
+                                      $token->{attributes}->{charset}
+                                          ->{has_reference});
+           }
+           if ($token->{attributes}->{content}) {
+             $meta_el->[0]->get_attribute_node_ns (undef, 'content')
+                 ->set_user_data (manakai_has_reference =>
+                                      $token->{attributes}->{content}
+                                          ->{has_reference});
            }
-           ## TODO: Change the encoding
          }
          !!!next-token;
-Line 4626 
 sub _tree_construction_main ($) {
+Line 4963 
 sub _tree_construction_main ($) {
          INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
            my $node = $self->{open_elements}->[$_];
            if ($node->[1] eq 'nobr') {
-             !!!parse-error (type => 'not closed:nobr');
+             !!!parse-error (type => 'in nobr:nobr');
              !!!back-token;
              $token = {type => END_TAG_TOKEN, tag_name => 'nobr'};
              redo B;
-Line 4831 
 sub _tree_construction_main ($) {
+Line 5168 
 sub _tree_construction_main ($) {
                  noframes => 1,
                  noscript => 0, ## TODO: 1 if scripting is enabled
                 }->{$token->{tag_name}}) {
-         ## NOTE: There are two "as if in body" code clones.
+         ## NOTE: There is an "as if in body" code clone.
          $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
          redo B;
        } elsif ($token->{tag_name} eq 'select') {
-Line 4987 
 sub _tree_construction_main ($) {
+Line 5324 
 sub _tree_construction_main ($) {
          if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
            pop @{$self->{open_elements}};
          } else {
-           !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
+           !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
          }
          undef $self->{form_element};
-Line 5025 
 sub _tree_construction_main ($) {
+Line 5362 
 sub _tree_construction_main ($) {
          } # INSCOPE
          if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
-           !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
+           !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
          }
          splice @{$self->{open_elements}}, $i if defined $i;
-Line 5094 
 sub _tree_construction_main ($) {
+Line 5431 
 sub _tree_construction_main ($) {
              ## Step 2
              if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
+               ## NOTE: <x><y></x>
                !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
              }
-Line 5144 
 sub set_inner_html ($$$) {
+Line 5482 
 sub set_inner_html ($$$) {
    my $s = \$_[0];
    my $onerror = $_[1];
+   ## ISSUE: Should {confident} be true?
    my $nt = $node->node_type;
    if ($nt == 9) {
      # MUST
-Line 5216 
 sub set_inner_html ($$$) {
+Line 5556 
 sub set_inner_html ($$$) {
      $p->_initialize_tree_constructor;
      ## Step 2
-     my $node_ln = $node->local_name;
+     my $node_ln = $node->manakai_local_name;
      $p->{content_model} = {
        title => RCDATA_CONTENT_MODEL,
        textarea => RCDATA_CONTENT_MODEL,
-Line 5256 
 sub set_inner_html ($$$) {
+Line 5596 
 sub set_inner_html ($$$) {
        if ($anode->node_type == 1) {
          my $nsuri = $anode->namespace_uri;
          if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
-           if ($anode->local_name eq 'form') { ## TODO: case?
+           if ($anode->manakai_local_name eq 'form') {
              $p->{form_element} = $anode;
              last AN;
            }
-Line 5296 
 sub set_inner_html ($$$) {
+Line 5636 
 sub set_inner_html ($$$) {
  } # tree construction stage
- sub get_inner_html ($$$) {
+ package Whatpm::HTML::RestartParser;
-   my (undef, $node, $on_error) = @_;
+ push our @ISA, 'Error';
-   ## Step 1
-   my $s = '';
-   my $in_cdata;
-   my $parent = $node;
-   while (defined $parent) {
-     if ($parent->node_type == 1 and
-         $parent->namespace_uri eq 'http://www.w3.org/1999/xhtml' and
-         {
-           style => 1, script => 1, xmp => 1, iframe => 1,
-           noembed => 1, noframes => 1, noscript => 1,
-         }->{$parent->local_name}) { ## TODO: case thingy
-       $in_cdata = 1;
-     }
-     $parent = $parent->parent_node;
-   }
-   ## Step 2
-   my @node = @{$node->child_nodes};
-   C: while (@node) {
-     my $child = shift @node;
-     unless (ref $child) {
-       if ($child eq 'cdata-out') {
-         $in_cdata = 0;
-       } else {
-         $s .= $child; # end tag
-       }
-       next C;
-     }
-     my $nt = $child->node_type;
-     if ($nt == 1) { # Element
-       my $tag_name = $child->tag_name; ## TODO: manakai_tag_name
-       $s .= '<' . $tag_name;
-       ## NOTE: Non-HTML case:
-       ## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191>
-       my @attrs = @{$child->attributes}; # sort order MUST be stable
-       for my $attr (@attrs) { # order is implementation dependent
-         my $attr_name = $attr->name; ## TODO: manakai_name
-         $s .= ' ' . $attr_name . '="';
-         my $attr_value = $attr->value;
-         ## escape
-         $attr_value =~ s/&/&amp;/g;
-         $attr_value =~ s/</&lt;/g;
-         $attr_value =~ s/>/&gt;/g;
-         $attr_value =~ s/"/&quot;/g;
-         $s .= $attr_value . '"';
-       }
-       $s .= '>';
-       next C if {
-         area => 1, base => 1, basefont => 1, bgsound => 1,
-         br => 1, col => 1, embed => 1, frame => 1, hr => 1,
-         img => 1, input => 1, link => 1, meta => 1, param => 1,
-         spacer => 1, wbr => 1,
-       }->{$tag_name};
-       $s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea';
-       if (not $in_cdata and {
-         style => 1, script => 1, xmp => 1, iframe => 1,
-         noembed => 1, noframes => 1, noscript => 1,
-         plaintext => 1,
-       }->{$tag_name}) {
-         unshift @node, 'cdata-out';
-         $in_cdata = 1;
-       }
-       unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
-     } elsif ($nt == 3 or $nt == 4) {
-       if ($in_cdata) {
-         $s .= $child->data;
-       } else {
-         my $value = $child->data;
-         $value =~ s/&/&amp;/g;
-         $value =~ s/</&lt;/g;
-         $value =~ s/>/&gt;/g;
-         $value =~ s/"/&quot;/g;
-         $s .= $value;
-       }
-     } elsif ($nt == 8) {
-       $s .= '<!--' . $child->data . '-->';
-     } elsif ($nt == 10) {
-       $s .= '<!DOCTYPE ' . $child->name . '>';
-     } elsif ($nt == 5) { # entrefs
-       push @node, @{$child->child_nodes};
-     } else {
-       $on_error->($child) if defined $on_error;
-     }
-     ## ISSUE: This code does not support PIs.
-   } # C
-   ## Step 3
-   return \$s;
- } # get_inner_html
 ;
  # $Date$

 Legend:



Removed from v.1.56
 


changed lines


 
Added in v.1.75
 Legend:



Removed from v.1.56
 


changed lines


 
Added in v.1.75
-Removed from v.1.56
+Added in v.1.75

admin@suikawiki.org	ViewVC Help
Powered by ViewVC 1.1.24