/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch Patch

-revision 1.55 by wakaba,
Sat Aug 11 06:53:38 2007 UTC
+revision 1.77 by wakaba,
Mon Mar  3 10:20:19 2008 UTC
 Line 1
  package Whatpm::HTML;
  use strict;
  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
+ use Error qw(:try);
  ## ISSUE:
  ## var doc = implementation.createDocument (null, null, null);
  ## doc.write ('');
  ## alert (doc.compatMode);
- ## ISSUE: HTML5 revision 967 says that the encoding layer MUST NOT
+ ## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263)
- ## strip BOM and the HTML layer MUST ignore it.  Whether we can do it
+ ## TODO: 1252 parse error (revision 1264)
- ## is not yet clear.
+ ## TODO: 8859-11 = 874 (revision 1271)
- ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?
- ## "{U+FEFF}..." in GB18030?
  my $permitted_slash_tag_name = {
    base => 1,
-Line 19 
 my $permitted_slash_tag_name = {
+Line 18 
 my $permitted_slash_tag_name = {
    meta => 1,
    hr => 1,
    br => 1,
-   img=> 1,
+   img => 1,
    embed => 1,
    param => 1,
    area => 1,
-Line 84 
 my $formatting_category = {
+Line 83 
 my $formatting_category = {
  };
  # $phrasing_category: all other elements
+ ## Parse a string of bytes as an HTML document.
+ ##
+ ## Invocant: a class name (a new parser is created) or a parser object.
+ ## Arguments:
+ ##   $charset - character encoding label, or |undef| to sniff the
+ ##              encoding from the first 1024 bytes (falling back to
+ ##              |windows-1252| if the detector returns nothing)
+ ##   $bytes   - the byte string, or a reference to it
+ ##   ...      - remaining arguments are handed to |parse_char_string|
+ ##              unchanged
+ ## Returns: the value returned by |parse_char_string|.
+ ##
+ ## If the parser later invokes |$self->{change_encoding}| with a label
+ ## that differs from the encoding in use, a |Whatpm::HTML::RestartParser|
+ ## exception is thrown; it is caught here, the bytes are decoded again
+ ## in the new encoding, and parsing is restarted once.
+ sub parse_byte_string ($$$$;$) {
+   my $self = ref $_[0] ? shift : shift->new;
+   my $charset = shift;
+   my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
+   my $s;
+   ## |Encode::decode| is used by both branches below and by the restart
+   ## handler, so the module is loaded unconditionally.
+   require Encode; ## TODO: decode(utf8) doesn't delete BOM
+   if (defined $charset) {
+     $s = \ (Encode::decode ($charset, $$bytes_s));
+     $self->{input_encoding} = lc $charset; ## TODO: normalize name
+     $self->{confident} = 1;
+   } else {
+     ## TODO: Implement HTML5 detection algorithm
+     require Whatpm::Charset::UniversalCharDet;
+     $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
+         (substr ($$bytes_s, 0, 1024));
+     $charset ||= 'windows-1252';
+     $s = \ (Encode::decode ($charset, $$bytes_s));
+     ## Stored in lowercase, as in the other branch, because
+     ## |change_encoding| compares it with a lowercased label.
+     $self->{input_encoding} = lc $charset; ## TODO: normalize name
+     $self->{confident} = 0;
+   }
+   $self->{change_encoding} = sub {
+     my $self = shift;
+     my $charset = lc shift;
+     ## TODO: if $charset is supported
+     ## TODO: normalize charset name
+     ## "Change the encoding" algorithm:
+     ## Step 1
+     if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
+       $charset = 'utf-8';
+     }
+     ## Step 2
+     if (defined $self->{input_encoding} and
+         $self->{input_encoding} eq $charset) {
+       $self->{confident} = 1;
+       return;
+     }
+     !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
+         ':'.$charset, level => 'w');
+     ## Step 3
+     # if (can) {
+       ## change the encoding on the fly.
+       #$self->{confident} = 1;
+       #return;
+     # }
+     ## Step 4
+     throw Whatpm::HTML::RestartParser (charset => $charset);
+   }; # $self->{change_encoding}
+   ## |@_| now begins with the byte string; drop it and keep the rest
+   ## for |parse_char_string|.
+   my @args = @_; shift @args; # $s
+   my $return;
+   try {
+     $return = $self->parse_char_string ($s, @args);
+   } catch Whatpm::HTML::RestartParser with {
+     ## Decode the original bytes again in the requested encoding and
+     ## parse from the start.
+     my $charset = shift->{charset};
+     $s = \ (Encode::decode ($charset, $$bytes_s));
+     $self->{input_encoding} = $charset; ## TODO: normalize
+     $self->{confident} = 1;
+     $return = $self->parse_char_string ($s, @args);
+   };
+   return $return;
+ } # parse_byte_string
+ ## NOTE: The HTML5 spec says that the encoding layer MUST NOT strip the BOM
+ ## and the HTML layer MUST ignore it.  However, we do strip the BOM in
+ ## the encoding layer and the HTML layer does not ignore any U+FEFF,
+ ## because the core part of our HTML parser expects a string of characters,
+ ## not a string of bytes or code units or anything which might contain a BOM.
+ ## Therefore, any parser interface that accepts a string of bytes,
+ ## such as |parse_byte_string| in this module, must ensure that it does
+ ## strip the BOM and never strips any ZWNBSP.
+ *parse_char_string = \&parse_string;
  sub parse_string ($$$;$) {
-   my $self = shift->new;
+   my $self = ref $_[0] ? shift : shift->new;
-   my $s = \$_[0];
+   my $s = ref $_[0] ? $_[0] : \($_[0]);
    $self->{document} = $_[1];
+   @{$self->{document}->child_nodes} = ();
    ## NOTE: |set_inner_html| copies most of this method's code
+   $self->{confident} = 1 unless exists $self->{confident};
+   $self->{document}->input_encoding ($self->{input_encoding})
+       if defined $self->{input_encoding};
    my $i = 0;
    my $line = 1;
    my $column = 0;
-   $self->{set_next_input_character} = sub {
+   $self->{set_next_char} = sub {
      my $self = shift;
-     pop @{$self->{prev_input_character}};
+     pop @{$self->{prev_char}};
-     unshift @{$self->{prev_input_character}}, $self->{next_input_character};
+     unshift @{$self->{prev_char}}, $self->{next_char};
-     $self->{next_input_character} = -1 and return if $i >= length $$s;
+     $self->{next_char} = -1 and return if $i >= length $$s;
-     $self->{next_input_character} = ord substr $$s, $i++, 1;
+     $self->{next_char} = ord substr $$s, $i++, 1;
      $column++;
-     if ($self->{next_input_character} == 0x000A) { # LF
+     if ($self->{next_char} == 0x000A) { # LF
        $line++;
        $column = 0;
-     } elsif ($self->{next_input_character} == 0x000D) { # CR
+     } elsif ($self->{next_char} == 0x000D) { # CR
        $i++ if substr ($$s, $i, 1) eq "\x0A";
-       $self->{next_input_character} = 0x000A; # LF # MUST
+       $self->{next_char} = 0x000A; # LF # MUST
        $line++;
        $column = 0;
-     } elsif ($self->{next_input_character} > 0x10FFFF) {
+     } elsif ($self->{next_char} > 0x10FFFF) {
-       $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
+       $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
-     } elsif ($self->{next_input_character} == 0x0000) { # NULL
+     } elsif ($self->{next_char} == 0x0000) { # NULL
        !!!parse-error (type => 'NULL');
-       $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
+       $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
      }
    };
-   $self->{prev_input_character} = [-1, -1, -1];
+   $self->{prev_char} = [-1, -1, -1];
-   $self->{next_input_character} = -1;
+   $self->{next_char} = -1;
    my $onerror = $_[2] || sub {
      my (%opt) = @_;
-Line 141 
 sub parse_string ($$$;$) {
+Line 226 
 sub parse_string ($$$;$) {
  sub new ($) {
    my $class = shift;
    my $self = bless {}, $class;
-   $self->{set_next_input_character} = sub {
+   $self->{set_next_char} = sub {
-     $self->{next_input_character} = -1;
+     $self->{next_char} = -1;
    };
    $self->{parse_error} = sub {
      #
    };
+   $self->{change_encoding} = sub {
+     # if ($_[0] is a supported encoding) {
+     #   run "change the encoding" algorithm;
+     #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
+     # }
+   };
+   $self->{application_cache_selection} = sub {
+     #
+   };
    return $self;
  } # new
-Line 159 
 sub CDATA_CONTENT_MODEL () { CM_LIMITED_
+Line 253 
 sub CDATA_CONTENT_MODEL () { CM_LIMITED_
  sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
  sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
+ sub DATA_STATE () { 0 } # tokenizer state IDs, compared numerically with $self->{state}; this is the initial state
+ sub ENTITY_DATA_STATE () { 1 } # entered from the data state on "&" when entities are recognized
+ sub TAG_OPEN_STATE () { 2 } # entered from the data state on "<" when markup is recognized
+ sub CLOSE_TAG_OPEN_STATE () { 3 } # entered from the tag open state on "/"
+ sub TAG_NAME_STATE () { 4 } # entered once the first letter of a tag name has been read
+ sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 } # entered from the tag name state on white space or "/"
+ sub ATTRIBUTE_NAME_STATE () { 6 }
+ sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
+ sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
+ sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
+ sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
+ sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
+ sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
+ sub MARKUP_DECLARATION_OPEN_STATE () { 13 } # entered from the tag open state on "!"
+ sub COMMENT_START_STATE () { 14 }
+ sub COMMENT_START_DASH_STATE () { 15 }
+ sub COMMENT_STATE () { 16 }
+ sub COMMENT_END_STATE () { 17 }
+ sub COMMENT_END_DASH_STATE () { 18 }
+ sub BOGUS_COMMENT_STATE () { 19 } # entered on "<?" and on a bogus end tag
+ sub DOCTYPE_STATE () { 20 }
+ sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
+ sub DOCTYPE_NAME_STATE () { 22 }
+ sub AFTER_DOCTYPE_NAME_STATE () { 23 }
+ sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
+ sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
+ sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
+ sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
+ sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
+ sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
+ sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
+ sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
+ sub BOGUS_DOCTYPE_STATE () { 32 }
+ sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
  sub DOCTYPE_TOKEN () { 1 }
  sub COMMENT_TOKEN () { 2 }
  sub START_TAG_TOKEN () { 3 }
-Line 169 
 sub CHARACTER_TOKEN () { 6 }
+Line 298 
 sub CHARACTER_TOKEN () { 6 }
  sub AFTER_HTML_IMS () { 0b100 }
  sub HEAD_IMS ()       { 0b1000 }
  sub BODY_IMS ()       { 0b10000 }
- sub BODY_TABLE_IMS () { 0b100000 | BODY_IMS }
+ sub BODY_TABLE_IMS () { 0b100000 }
  sub TABLE_IMS ()      { 0b1000000 }
- sub ROW_IMS ()        { 0b10000000 | TABLE_IMS }
+ sub ROW_IMS ()        { 0b10000000 }
  sub BODY_AFTER_IMS () { 0b100000000 }
  sub FRAME_IMS ()      { 0b1000000000 }
-Line 182 
 sub IN_HEAD_NOSCRIPT_IM () { HEAD_IMS |
+Line 311 
 sub IN_HEAD_NOSCRIPT_IM () { HEAD_IMS |
  sub AFTER_HEAD_IM () { HEAD_IMS | 0b10 }
  sub BEFORE_HEAD_IM () { HEAD_IMS | 0b11 }
  sub IN_BODY_IM () { BODY_IMS }
- sub IN_CELL_IM () { BODY_TABLE_IMS | 0b01 }
+ sub IN_CELL_IM () { BODY_IMS | BODY_TABLE_IMS | 0b01 }
- sub IN_CAPTION_IM () { BODY_TABLE_IMS | 0b10 }
+ sub IN_CAPTION_IM () { BODY_IMS | BODY_TABLE_IMS | 0b10 }
- sub IN_ROW_IM () { ROW_IMS | 0b01 }
+ sub IN_ROW_IM () { TABLE_IMS | ROW_IMS | 0b01 }
- sub IN_TABLE_BODY_IM () { ROW_IMS | 0b10 }
+ sub IN_TABLE_BODY_IM () { TABLE_IMS | ROW_IMS | 0b10 }
  sub IN_TABLE_IM () { TABLE_IMS }
  sub AFTER_BODY_IM () { BODY_AFTER_IMS }
  sub IN_FRAMESET_IM () { FRAME_IMS | 0b01 }
-Line 197 
 sub IN_COLUMN_GROUP_IM () { 0b10 }
+Line 326 
 sub IN_COLUMN_GROUP_IM () { 0b10 }
  sub _initialize_tokenizer ($) {
    my $self = shift;
-   $self->{state} = 'data'; # MUST
+   $self->{state} = DATA_STATE; # MUST
    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
    undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE
    undef $self->{current_attribute};
    undef $self->{last_emitted_start_tag_name};
    undef $self->{last_attribute_value_state};
    $self->{char} = [];
-   # $self->{next_input_character}
+   # $self->{next_char}
    !!!next-input-character;
    $self->{token} = [];
    # $self->{escape}
-Line 217 
 sub _initialize_tokenizer ($) {
+Line 346 
 sub _initialize_tokenizer ($) {
  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
  ##   ->{public_identifier} (DOCTYPE_TOKEN)
  ##   ->{system_identifier} (DOCTYPE_TOKEN)
- ##   ->{correct} == 1 or 0 (DOCTYPE_TOKEN)
+ ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
  ##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
+ ##        ->{name}
+ ##        ->{value}
+ ##        ->{has_reference} == 1 or 0
  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
  ## Emitted token MUST immediately be handled by the tree construction state.
-Line 229 
 sub _initialize_tokenizer ($) {
+Line 361 
 sub _initialize_tokenizer ($) {
  ## has completed loading.  If one has, then it MUST be executed
  ## and removed from the list.
+ ## NOTE: HTML5 "Writing HTML documents" section, applied to
+ ## documents and not to user agents and conformance checkers,
+ ## contains some requirements that are not detected by the
+ ## parsing algorithm:
+ ## - Some requirements on character encoding declarations. ## TODO
+ ## - "Elements MUST NOT contain content that their content model disallows."
+ ##   ... Some are parse error, some are not (will be reported by c.c.).
+ ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
+ ## - Text (in elements, attributes, and comments) SHOULD NOT contain
+ ##   control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL?  Unicode control character?)
+ ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot
+ ## be detected by the HTML5 parsing algorithm:
+ ## - Text,
  sub _get_next_token ($) {
    my $self = shift;
    if (@{$self->{token}}) {
-Line 236 
 sub _get_next_token ($) {
+Line 383 
 sub _get_next_token ($) {
    }
    A: {
-     if ($self->{state} eq 'data') {
+     if ($self->{state} == DATA_STATE) {
-       if ($self->{next_input_character} == 0x0026) { # &
+       if ($self->{next_char} == 0x0026) { # &
-         if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA
+         if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
-           $self->{state} = 'entity data';
+             not $self->{escape}) {
+           !!!cp (1);
+           $self->{state} = ENTITY_DATA_STATE;
            !!!next-input-character;
            redo A;
          } else {
+           !!!cp (2);
            #
          }
-       } elsif ($self->{next_input_character} == 0x002D) { # -
+       } elsif ($self->{next_char} == 0x002D) { # -
          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
            unless ($self->{escape}) {
-             if ($self->{prev_input_character}->[0] == 0x002D and # -
+             if ($self->{prev_char}->[0] == 0x002D and # -
-                 $self->{prev_input_character}->[1] == 0x0021 and # !
+                 $self->{prev_char}->[1] == 0x0021 and # !
-                 $self->{prev_input_character}->[2] == 0x003C) { # <
+                 $self->{prev_char}->[2] == 0x003C) { # <
+               !!!cp (3);
                $self->{escape} = 1;
+             } else {
+               !!!cp (4);
              }
+           } else {
+             !!!cp (5);
            }
          }
          #
-       } elsif ($self->{next_input_character} == 0x003C) { # <
+       } elsif ($self->{next_char} == 0x003C) { # <
          if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
              (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
               not $self->{escape})) {
-           $self->{state} = 'tag open';
+           !!!cp (6);
+           $self->{state} = TAG_OPEN_STATE;
            !!!next-input-character;
            redo A;
          } else {
+           !!!cp (7);
            #
          }
-       } elsif ($self->{next_input_character} == 0x003E) { # >
+       } elsif ($self->{next_char} == 0x003E) { # >
          if ($self->{escape} and
              ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
-           if ($self->{prev_input_character}->[0] == 0x002D and # -
+           if ($self->{prev_char}->[0] == 0x002D and # -
-               $self->{prev_input_character}->[1] == 0x002D) { # -
+               $self->{prev_char}->[1] == 0x002D) { # -
+             !!!cp (8);
              delete $self->{escape};
+           } else {
+             !!!cp (9);
            }
+         } else {
+           !!!cp (10);
          }
          #
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (11);
          !!!emit ({type => END_OF_FILE_TOKEN});
          last A; ## TODO: ok?
+       } else {
+         !!!cp (12);
        }
        # Anything else
        my $token = {type => CHARACTER_TOKEN,
-                    data => chr $self->{next_input_character}};
+                    data => chr $self->{next_char}};
        ## Stay in the data state
        !!!next-input-character;
        !!!emit ($token);
        redo A;
-     } elsif ($self->{state} eq 'entity data') {
+     } elsif ($self->{state} == ENTITY_DATA_STATE) {
        ## (cannot happen in CDATA state)
-       my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
+       my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
-       $self->{state} = 'data';
+       $self->{state} = DATA_STATE;
        # next-input-character is already done
        unless (defined $token) {
+         !!!cp (13);
          !!!emit ({type => CHARACTER_TOKEN, data => '&'});
        } else {
+         !!!cp (14);
          !!!emit ($token);
        }
        redo A;
-     } elsif ($self->{state} eq 'tag open') {
+     } elsif ($self->{state} == TAG_OPEN_STATE) {
        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
-         if ($self->{next_input_character} == 0x002F) { # /
+         if ($self->{next_char} == 0x002F) { # /
+           !!!cp (15);
            !!!next-input-character;
-           $self->{state} = 'close tag open';
+           $self->{state} = CLOSE_TAG_OPEN_STATE;
            redo A;
          } else {
+           !!!cp (16);
            ## reconsume
-           $self->{state} = 'data';
+           $self->{state} = DATA_STATE;
            !!!emit ({type => CHARACTER_TOKEN, data => '<'});
            redo A;
          }
        } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
-         if ($self->{next_input_character} == 0x0021) { # !
+         if ($self->{next_char} == 0x0021) { # !
-           $self->{state} = 'markup declaration open';
+           !!!cp (17);
+           $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
            !!!next-input-character;
            redo A;
-         } elsif ($self->{next_input_character} == 0x002F) { # /
+         } elsif ($self->{next_char} == 0x002F) { # /
-           $self->{state} = 'close tag open';
+           !!!cp (18);
+           $self->{state} = CLOSE_TAG_OPEN_STATE;
            !!!next-input-character;
            redo A;
-         } elsif (0x0041 <= $self->{next_input_character} and
+         } elsif (0x0041 <= $self->{next_char} and
-                  $self->{next_input_character} <= 0x005A) { # A..Z
+                  $self->{next_char} <= 0x005A) { # A..Z
+           !!!cp (19);
            $self->{current_token}
              = {type => START_TAG_TOKEN,
-                tag_name => chr ($self->{next_input_character} + 0x0020)};
+                tag_name => chr ($self->{next_char} + 0x0020)};
-           $self->{state} = 'tag name';
+           $self->{state} = TAG_NAME_STATE;
            !!!next-input-character;
            redo A;
-         } elsif (0x0061 <= $self->{next_input_character} and
+         } elsif (0x0061 <= $self->{next_char} and
-                  $self->{next_input_character} <= 0x007A) { # a..z
+                  $self->{next_char} <= 0x007A) { # a..z
+           !!!cp (20);
            $self->{current_token} = {type => START_TAG_TOKEN,
-                             tag_name => chr ($self->{next_input_character})};
+                             tag_name => chr ($self->{next_char})};
-           $self->{state} = 'tag name';
+           $self->{state} = TAG_NAME_STATE;
            !!!next-input-character;
            redo A;
-         } elsif ($self->{next_input_character} == 0x003E) { # >
+         } elsif ($self->{next_char} == 0x003E) { # >
+           !!!cp (21);
            !!!parse-error (type => 'empty start tag');
-           $self->{state} = 'data';
+           $self->{state} = DATA_STATE;
            !!!next-input-character;
            !!!emit ({type => CHARACTER_TOKEN, data => '<>'});
            redo A;
-         } elsif ($self->{next_input_character} == 0x003F) { # ?
+         } elsif ($self->{next_char} == 0x003F) { # ?
+           !!!cp (22);
            !!!parse-error (type => 'pio');
-           $self->{state} = 'bogus comment';
+           $self->{state} = BOGUS_COMMENT_STATE;
-           ## $self->{next_input_character} is intentionally left as is
+           ## $self->{next_char} is intentionally left as is
            redo A;
          } else {
+           !!!cp (23);
            !!!parse-error (type => 'bare stago');
-           $self->{state} = 'data';
+           $self->{state} = DATA_STATE;
            ## reconsume
            !!!emit ({type => CHARACTER_TOKEN, data => '<'});
-Line 368 
 sub _get_next_token ($) {
+Line 544 
 sub _get_next_token ($) {
        } else {
          die "$0: $self->{content_model} in tag open";
        }
-     } elsif ($self->{state} eq 'close tag open') {
+     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
          if (defined $self->{last_emitted_start_tag_name}) {
            ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
            my @next_char;
            TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
-             push @next_char, $self->{next_input_character};
+             push @next_char, $self->{next_char};
              my $c = ord substr ($self->{last_emitted_start_tag_name}, $i, 1);
              my $C = 0x0061 <= $c && $c <= 0x007A ? $c - 0x0020 : $c;
-             if ($self->{next_input_character} == $c or $self->{next_input_character} == $C) {
+             if ($self->{next_char} == $c or $self->{next_char} == $C) {
+               !!!cp (24);
                !!!next-input-character;
                next TAGNAME;
              } else {
-               $self->{next_input_character} = shift @next_char; # reconsume
+               !!!cp (25);
+               $self->{next_char} = shift @next_char; # reconsume
                !!!back-next-input-character (@next_char);
-               $self->{state} = 'data';
+               $self->{state} = DATA_STATE;
                !!!emit ({type => CHARACTER_TOKEN, data => '</'});
                redo A;
              }
            }
-           push @next_char, $self->{next_input_character};
+           push @next_char, $self->{next_char};
-           unless ($self->{next_input_character} == 0x0009 or # HT
+           unless ($self->{next_char} == 0x0009 or # HT
-                   $self->{next_input_character} == 0x000A or # LF
+                   $self->{next_char} == 0x000A or # LF
-                   $self->{next_input_character} == 0x000B or # VT
+                   $self->{next_char} == 0x000B or # VT
-                   $self->{next_input_character} == 0x000C or # FF
+                   $self->{next_char} == 0x000C or # FF
-                   $self->{next_input_character} == 0x0020 or # SP
+                   $self->{next_char} == 0x0020 or # SP
-                   $self->{next_input_character} == 0x003E or # >
+                   $self->{next_char} == 0x003E or # >
-                   $self->{next_input_character} == 0x002F or # /
+                   $self->{next_char} == 0x002F or # /
-                   $self->{next_input_character} == -1) {
+                   $self->{next_char} == -1) {
-             $self->{next_input_character} = shift @next_char; # reconsume
+             !!!cp (26);
+             $self->{next_char} = shift @next_char; # reconsume
              !!!back-next-input-character (@next_char);
-             $self->{state} = 'data';
+             $self->{state} = DATA_STATE;
              !!!emit ({type => CHARACTER_TOKEN, data => '</'});
              redo A;
            } else {
-             $self->{next_input_character} = shift @next_char;
+             !!!cp (27);
+             $self->{next_char} = shift @next_char;
              !!!back-next-input-character (@next_char);
              # and consume...
            }
          } else {
            ## No start tag token has ever been emitted
+           !!!cp (28);
            # next-input-character is already done
-           $self->{state} = 'data';
+           $self->{state} = DATA_STATE;
            !!!emit ({type => CHARACTER_TOKEN, data => '</'});
            redo A;
          }
        }
-       if (0x0041 <= $self->{next_input_character} and
+       if (0x0041 <= $self->{next_char} and
-           $self->{next_input_character} <= 0x005A) { # A..Z
+           $self->{next_char} <= 0x005A) { # A..Z
+         !!!cp (29);
          $self->{current_token} = {type => END_TAG_TOKEN,
-                           tag_name => chr ($self->{next_input_character} + 0x0020)};
+                           tag_name => chr ($self->{next_char} + 0x0020)};
-         $self->{state} = 'tag name';
+         $self->{state} = TAG_NAME_STATE;
          !!!next-input-character;
          redo A;
-       } elsif (0x0061 <= $self->{next_input_character} and
+       } elsif (0x0061 <= $self->{next_char} and
-                $self->{next_input_character} <= 0x007A) { # a..z
+                $self->{next_char} <= 0x007A) { # a..z
+         !!!cp (30);
          $self->{current_token} = {type => END_TAG_TOKEN,
-                           tag_name => chr ($self->{next_input_character})};
+                           tag_name => chr ($self->{next_char})};
-         $self->{state} = 'tag name';
+         $self->{state} = TAG_NAME_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003E) { # >
+       } elsif ($self->{next_char} == 0x003E) { # >
+         !!!cp (31);
          !!!parse-error (type => 'empty end tag');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (32);
          !!!parse-error (type => 'bare etago');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          # reconsume
          !!!emit ({type => CHARACTER_TOKEN, data => '</'});
          redo A;
        } else {
+         !!!cp (33);
          !!!parse-error (type => 'bogus end tag');
-         $self->{state} = 'bogus comment';
+         $self->{state} = BOGUS_COMMENT_STATE;
-         ## $self->{next_input_character} is intentionally left as is
+         ## $self->{next_char} is intentionally left as is
          redo A;
        }
-     } elsif ($self->{state} eq 'tag name') {
+     } elsif ($self->{state} == TAG_NAME_STATE) {
-       if ($self->{next_input_character} == 0x0009 or # HT
+       if ($self->{next_char} == 0x0009 or # HT
-           $self->{next_input_character} == 0x000A or # LF
+           $self->{next_char} == 0x000A or # LF
-           $self->{next_input_character} == 0x000B or # VT
+           $self->{next_char} == 0x000B or # VT
-           $self->{next_input_character} == 0x000C or # FF
+           $self->{next_char} == 0x000C or # FF
-           $self->{next_input_character} == 0x0020) { # SP
+           $self->{next_char} == 0x0020) { # SP
-         $self->{state} = 'before attribute name';
+         !!!cp (34);
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003E) { # >
+       } elsif ($self->{next_char} == 0x003E) { # >
          if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           !!!cp (35);
            $self->{current_token}->{first_start_tag}
                = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
+             !!!cp (36);
              !!!parse-error (type => 'end tag attribute');
+           } else {
+             !!!cp (37);
            }
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
-       } elsif (0x0041 <= $self->{next_input_character} and
+       } elsif (0x0041 <= $self->{next_char} and
-                $self->{next_input_character} <= 0x005A) { # A..Z
+                $self->{next_char} <= 0x005A) { # A..Z
-         $self->{current_token}->{tag_name} .= chr ($self->{next_input_character} + 0x0020);
+         !!!cp (38);
+         $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);
            # start tag or end tag
          ## Stay in this state
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           !!!cp (39);
            $self->{current_token}->{first_start_tag}
                = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
+             !!!cp (40);
              !!!parse-error (type => 'end tag attribute');
+           } else {
+             !!!cp (41);
            }
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
-       } elsif ($self->{next_input_character} == 0x002F) { # /
+       } elsif ($self->{next_char} == 0x002F) { # /
          !!!next-input-character;
-         if ($self->{next_input_character} == 0x003E and # >
+         if ($self->{next_char} == 0x003E and # >
              $self->{current_token}->{type} == START_TAG_TOKEN and
              $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
            # permitted slash
+           !!!cp (42);
            #
          } else {
+           !!!cp (43);
            !!!parse-error (type => 'nestc');
          }
-         $self->{state} = 'before attribute name';
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
          # next-input-character is already done
          redo A;
        } else {
-         $self->{current_token}->{tag_name} .= chr $self->{next_input_character};
+         !!!cp (44);
+         $self->{current_token}->{tag_name} .= chr $self->{next_char};
            # start tag or end tag
          ## Stay in the state
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'before attribute name') {
+     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
-       if ($self->{next_input_character} == 0x0009 or # HT
+       if ($self->{next_char} == 0x0009 or # HT
-           $self->{next_input_character} == 0x000A or # LF
+           $self->{next_char} == 0x000A or # LF
-           $self->{next_input_character} == 0x000B or # VT
+           $self->{next_char} == 0x000B or # VT
-           $self->{next_input_character} == 0x000C or # FF
+           $self->{next_char} == 0x000C or # FF
-           $self->{next_input_character} == 0x0020) { # SP
+           $self->{next_char} == 0x0020) { # SP
+         !!!cp (45);
          ## Stay in the state
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003E) { # >
+       } elsif ($self->{next_char} == 0x003E) { # >
          if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           !!!cp (46);
            $self->{current_token}->{first_start_tag}
                = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
+             !!!cp (47);
              !!!parse-error (type => 'end tag attribute');
+           } else {
+             !!!cp (48);
            }
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
-       } elsif (0x0041 <= $self->{next_input_character} and
+       } elsif (0x0041 <= $self->{next_char} and
-                $self->{next_input_character} <= 0x005A) { # A..Z
+                $self->{next_char} <= 0x005A) { # A..Z
-         $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
+         !!!cp (49);
+         $self->{current_attribute} = {name => chr ($self->{next_char} + 0x0020),
                                value => ''};
-         $self->{state} = 'attribute name';
+         $self->{state} = ATTRIBUTE_NAME_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x002F) { # /
+       } elsif ($self->{next_char} == 0x002F) { # /
          !!!next-input-character;
-         if ($self->{next_input_character} == 0x003E and # >
+         if ($self->{next_char} == 0x003E and # >
              $self->{current_token}->{type} == START_TAG_TOKEN and
              $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
            # permitted slash
+           !!!cp (50);
            #
          } else {
+           !!!cp (51);
            !!!parse-error (type => 'nestc');
          }
          ## Stay in the state
          # next-input-character is already done
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           !!!cp (52);
            $self->{current_token}->{first_start_tag}
                = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
+             !!!cp (53);
              !!!parse-error (type => 'end tag attribute');
+           } else {
+             !!!cp (54);
            }
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
        } else {
-         $self->{current_attribute} = {name => chr ($self->{next_input_character}),
+         if ({
+x0022 => 1, # "
+x0027 => 1, # '
+x003D => 1, # =
+             }->{$self->{next_char}}) {
+           !!!cp (55);
+           !!!parse-error (type => 'bad attribute name');
+         } else {
+           !!!cp (56);
+         }
+         $self->{current_attribute} = {name => chr ($self->{next_char}),
                                value => ''};
-         $self->{state} = 'attribute name';
+         $self->{state} = ATTRIBUTE_NAME_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'attribute name') {
+     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
        my $before_leave = sub {
          if (exists $self->{current_token}->{attributes} # start tag or end tag
              ->{$self->{current_attribute}->{name}}) { # MUST
+           !!!cp (57);
            !!!parse-error (type => 'duplicate attribute:'.$self->{current_attribute}->{name});
            ## Discard $self->{current_attribute} # MUST
          } else {
+           !!!cp (58);
            $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}
              = $self->{current_attribute};
          }
        }; # $before_leave
-       if ($self->{next_input_character} == 0x0009 or # HT
+       if ($self->{next_char} == 0x0009 or # HT
-           $self->{next_input_character} == 0x000A or # LF
+           $self->{next_char} == 0x000A or # LF
-           $self->{next_input_character} == 0x000B or # VT
+           $self->{next_char} == 0x000B or # VT
-           $self->{next_input_character} == 0x000C or # FF
+           $self->{next_char} == 0x000C or # FF
-           $self->{next_input_character} == 0x0020) { # SP
+           $self->{next_char} == 0x0020) { # SP
+         !!!cp (59);
          $before_leave->();
-         $self->{state} = 'after attribute name';
+         $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003D) { # =
+       } elsif ($self->{next_char} == 0x003D) { # =
+         !!!cp (60);
          $before_leave->();
-         $self->{state} = 'before attribute value';
+         $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003E) { # >
+       } elsif ($self->{next_char} == 0x003E) { # >
          $before_leave->();
          if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           !!!cp (61);
            $self->{current_token}->{first_start_tag}
                = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
+           !!!cp (62);
            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
              !!!parse-error (type => 'end tag attribute');
-Line 642 
 sub _get_next_token ($) {
+Line 869 
 sub _get_next_token ($) {
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
-       } elsif (0x0041 <= $self->{next_input_character} and
+       } elsif (0x0041 <= $self->{next_char} and
-                $self->{next_input_character} <= 0x005A) { # A..Z
+                $self->{next_char} <= 0x005A) { # A..Z
-         $self->{current_attribute}->{name} .= chr ($self->{next_input_character} + 0x0020);
+         !!!cp (63);
+         $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);
          ## Stay in the state
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x002F) { # /
+       } elsif ($self->{next_char} == 0x002F) { # /
          $before_leave->();
          !!!next-input-character;
-         if ($self->{next_input_character} == 0x003E and # >
+         if ($self->{next_char} == 0x003E and # >
              $self->{current_token}->{type} == START_TAG_TOKEN and
              $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
            # permitted slash
+           !!!cp (64);
            #
          } else {
+           !!!cp (65);
            !!!parse-error (type => 'nestc');
          }
-         $self->{state} = 'before attribute name';
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
          # next-input-character is already done
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
          !!!parse-error (type => 'unclosed tag');
          $before_leave->();
          if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           !!!cp (66);
            $self->{current_token}->{first_start_tag}
                = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
+             !!!cp (67);
              !!!parse-error (type => 'end tag attribute');
+           } else {
+             !!!cp (68);
            }
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
        } else {
-         $self->{current_attribute}->{name} .= chr ($self->{next_input_character});
+         if ($self->{next_char} == 0x0022 or # "
+             $self->{next_char} == 0x0027) { # '
+           !!!cp (69);
+           !!!parse-error (type => 'bad attribute name');
+         } else {
+           !!!cp (70);
+         }
+         $self->{current_attribute}->{name} .= chr ($self->{next_char});
          ## Stay in the state
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'after attribute name') {
+     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
-       if ($self->{next_input_character} == 0x0009 or # HT
+       if ($self->{next_char} == 0x0009 or # HT
-           $self->{next_input_character} == 0x000A or # LF
+           $self->{next_char} == 0x000A or # LF
-           $self->{next_input_character} == 0x000B or # VT
+           $self->{next_char} == 0x000B or # VT
-           $self->{next_input_character} == 0x000C or # FF
+           $self->{next_char} == 0x000C or # FF
-           $self->{next_input_character} == 0x0020) { # SP
+           $self->{next_char} == 0x0020) { # SP
+         !!!cp (71);
          ## Stay in the state
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003D) { # =
+       } elsif ($self->{next_char} == 0x003D) { # =
-         $self->{state} = 'before attribute value';
+         !!!cp (72);
+         $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003E) { # >
+       } elsif ($self->{next_char} == 0x003E) { # >
          if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           !!!cp (73);
            $self->{current_token}->{first_start_tag}
                = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
+             !!!cp (74);
              !!!parse-error (type => 'end tag attribute');
+           } else {
+             !!!cp (75);
            }
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
-       } elsif (0x0041 <= $self->{next_input_character} and
+       } elsif (0x0041 <= $self->{next_char} and
-                $self->{next_input_character} <= 0x005A) { # A..Z
+                $self->{next_char} <= 0x005A) { # A..Z
-         $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
+         !!!cp (76);
+         $self->{current_attribute} = {name => chr ($self->{next_char} + 0x0020),
                                value => ''};
-         $self->{state} = 'attribute name';
+         $self->{state} = ATTRIBUTE_NAME_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x002F) { # /
+       } elsif ($self->{next_char} == 0x002F) { # /
          !!!next-input-character;
-         if ($self->{next_input_character} == 0x003E and # >
+         if ($self->{next_char} == 0x003E and # >
              $self->{current_token}->{type} == START_TAG_TOKEN and
              $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
            # permitted slash
+           !!!cp (77);
            #
          } else {
+           !!!cp (78);
            !!!parse-error (type => 'nestc');
            ## TODO: Different error type for <aa / bb> than <aa/>
          }
-         $self->{state} = 'before attribute name';
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
          # next-input-character is already done
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           !!!cp (79);
            $self->{current_token}->{first_start_tag}
                = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
+             !!!cp (80);
              !!!parse-error (type => 'end tag attribute');
+           } else {
+             !!!cp (81);
            }
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          # reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
        } else {
-         $self->{current_attribute} = {name => chr ($self->{next_input_character}),
+         !!!cp (82);
+         $self->{current_attribute} = {name => chr ($self->{next_char}),
                                value => ''};
-         $self->{state} = 'attribute name';
+         $self->{state} = ATTRIBUTE_NAME_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'before attribute value') {
+     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
-       if ($self->{next_input_character} == 0x0009 or # HT
+       if ($self->{next_char} == 0x0009 or # HT
-           $self->{next_input_character} == 0x000A or # LF
+           $self->{next_char} == 0x000A or # LF
-           $self->{next_input_character} == 0x000B or # VT
+           $self->{next_char} == 0x000B or # VT
-           $self->{next_input_character} == 0x000C or # FF
+           $self->{next_char} == 0x000C or # FF
-           $self->{next_input_character} == 0x0020) { # SP
+           $self->{next_char} == 0x0020) { # SP
+         !!!cp (83);
          ## Stay in the state
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x0022) { # "
+       } elsif ($self->{next_char} == 0x0022) { # "
-         $self->{state} = 'attribute value (double-quoted)';
+         !!!cp (84);
+         $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x0026) { # &
+       } elsif ($self->{next_char} == 0x0026) { # &
-         $self->{state} = 'attribute value (unquoted)';
+         !!!cp (85);
+         $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
          ## reconsume
          redo A;
-       } elsif ($self->{next_input_character} == 0x0027) { # '
+       } elsif ($self->{next_char} == 0x0027) { # '
-         $self->{state} = 'attribute value (single-quoted)';
+         !!!cp (86);
+         $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003E) { # >
+       } elsif ($self->{next_char} == 0x003E) { # >
          if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           !!!cp (87);
            $self->{current_token}->{first_start_tag}
                = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
+             !!!cp (88);
              !!!parse-error (type => 'end tag attribute');
+           } else {
+             !!!cp (89);
            }
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           !!!cp (90);
            $self->{current_token}->{first_start_tag}
                = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
+             !!!cp (91);
              !!!parse-error (type => 'end tag attribute');
+           } else {
+             !!!cp (92);
            }
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
        } else {
-         $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
+         if ($self->{next_char} == 0x003D) { # =
-         $self->{state} = 'attribute value (unquoted)';
+           !!!cp (93);
+           !!!parse-error (type => 'bad attribute value');
+         } else {
+           !!!cp (94);
+         }
+         $self->{current_attribute}->{value} .= chr ($self->{next_char});
+         $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'attribute value (double-quoted)') {
+     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
-       if ($self->{next_input_character} == 0x0022) { # "
+       if ($self->{next_char} == 0x0022) { # "
-         $self->{state} = 'before attribute name';
+         !!!cp (95);
+         $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x0026) { # &
+       } elsif ($self->{next_char} == 0x0026) { # &
-         $self->{last_attribute_value_state} = 'attribute value (double-quoted)';
+         !!!cp (96);
-         $self->{state} = 'entity in attribute value';
+         $self->{last_attribute_value_state} = $self->{state};
+         $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
          !!!parse-error (type => 'unclosed attribute value');
          if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           !!!cp (97);
            $self->{current_token}->{first_start_tag}
                = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
+             !!!cp (98);
              !!!parse-error (type => 'end tag attribute');
+           } else {
+             !!!cp (99);
            }
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
        } else {
-         $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
+         !!!cp (100);
+         $self->{current_attribute}->{value} .= chr ($self->{next_char});
          ## Stay in the state
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'attribute value (single-quoted)') {
+     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
-       if ($self->{next_input_character} == 0x0027) { # '
+       if ($self->{next_char} == 0x0027) { # '
-         $self->{state} = 'before attribute name';
+         !!!cp (101);
+         $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x0026) { # &
+       } elsif ($self->{next_char} == 0x0026) { # &
-         $self->{last_attribute_value_state} = 'attribute value (single-quoted)';
+         !!!cp (102);
-         $self->{state} = 'entity in attribute value';
+         $self->{last_attribute_value_state} = $self->{state};
+         $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
          !!!parse-error (type => 'unclosed attribute value');
          if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           !!!cp (103);
            $self->{current_token}->{first_start_tag}
                = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
+             !!!cp (104);
              !!!parse-error (type => 'end tag attribute');
+           } else {
+             !!!cp (105);
            }
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
        } else {
-         $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
+         !!!cp (106);
+         $self->{current_attribute}->{value} .= chr ($self->{next_char});
          ## Stay in the state
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'attribute value (unquoted)') {
+     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
-       if ($self->{next_input_character} == 0x0009 or # HT
+       if ($self->{next_char} == 0x0009 or # HT
-           $self->{next_input_character} == 0x000A or # LF
+           $self->{next_char} == 0x000A or # LF
-           $self->{next_input_character} == 0x000B or # HT
+           $self->{next_char} == 0x000B or # VT
-           $self->{next_input_character} == 0x000C or # FF
+           $self->{next_char} == 0x000C or # FF
-           $self->{next_input_character} == 0x0020) { # SP
+           $self->{next_char} == 0x0020) { # SP
-         $self->{state} = 'before attribute name';
+         !!!cp (107);
-         !!!next-input-character;
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
-         redo A;
+         !!!next-input-character;
-       } elsif ($self->{next_input_character} == 0x0026) { # &
+         redo A;
-         $self->{last_attribute_value_state} = 'attribute value (unquoted)';
+       } elsif ($self->{next_char} == 0x0026) { # &
-         $self->{state} = 'entity in attribute value';
+         !!!cp (108);
+         $self->{last_attribute_value_state} = $self->{state};
+         $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003E) { # >
+       } elsif ($self->{next_char} == 0x003E) { # >
          if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           !!!cp (109);
            $self->{current_token}->{first_start_tag}
                = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
+             !!!cp (110);
              !!!parse-error (type => 'end tag attribute');
+           } else {
+             !!!cp (111);
            }
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
          !!!parse-error (type => 'unclosed tag');
          if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           !!!cp (112);
            $self->{current_token}->{first_start_tag}
                = not defined $self->{last_emitted_start_tag_name};
            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
            if ($self->{current_token}->{attributes}) {
+             !!!cp (113);
              !!!parse-error (type => 'end tag attribute');
+           } else {
+             !!!cp (114);
            }
          } else {
            die "$0: $self->{current_token}->{type}: Unknown token type";
          }
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # start tag or end tag
          redo A;
        } else {
-         $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
+         if ({
+x0022 => 1, # "
+x0027 => 1, # '
+x003D => 1, # =
+             }->{$self->{next_char}}) {
+           !!!cp (115);
+           !!!parse-error (type => 'bad attribute value');
+         } else {
+           !!!cp (116);
+         }
+         $self->{current_attribute}->{value} .= chr ($self->{next_char});
          ## Stay in the state
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'entity in attribute value') {
+     } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
-       my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
+       my $token = $self->_tokenize_attempt_to_consume_an_entity
+           (1,
+            $self->{last_attribute_value_state}
+              == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "
+            $self->{last_attribute_value_state}
+              == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '
+            -1);
        unless (defined $token) {
+         !!!cp (117);
          $self->{current_attribute}->{value} .= '&';
        } else {
+         !!!cp (118);
          $self->{current_attribute}->{value} .= $token->{data};
+         $self->{current_attribute}->{has_reference} = $token->{has_reference};
          ## ISSUE: spec says "append the returned character token to the current attribute's value"
        }
        $self->{state} = $self->{last_attribute_value_state};
        # next-input-character is already done
        redo A;
-     } elsif ($self->{state} eq 'bogus comment') {
+     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
+       if ($self->{next_char} == 0x0009 or # HT
+           $self->{next_char} == 0x000A or # LF
+           $self->{next_char} == 0x000B or # VT
+           $self->{next_char} == 0x000C or # FF
+           $self->{next_char} == 0x0020) { # SP
+         !!!cp (118);
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_char} == 0x003E) { # >
+         if ($self->{current_token}->{type} == START_TAG_TOKEN) {
+           !!!cp (119);
+           $self->{current_token}->{first_start_tag}
+               = not defined $self->{last_emitted_start_tag_name};
+           $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};
+         } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {
+           $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
+           if ($self->{current_token}->{attributes}) {
+             !!!cp (120);
+             !!!parse-error (type => 'end tag attribute');
+           } else {
+             !!!cp (121);
+           }
+         } else {
+           die "$0: $self->{current_token}->{type}: Unknown token type";
+         }
+         $self->{state} = DATA_STATE;
+         !!!next-input-character;
+         !!!emit ($self->{current_token}); # start tag or end tag
+         redo A;
+       } elsif ($self->{next_char} == 0x002F) { # /
+         !!!next-input-character;
+         if ($self->{next_char} == 0x003E and # >
+             $self->{current_token}->{type} == START_TAG_TOKEN and
+             $permitted_slash_tag_name->{$self->{current_token}->{tag_name}}) {
+           # permitted slash
+           !!!cp (122);
+           #
+         } else {
+           !!!cp (123);
+           !!!parse-error (type => 'nestc');
+         }
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
+         # next-input-character is already done
+         redo A;
+       } else {
+         !!!cp (124);
+         !!!parse-error (type => 'no space between attributes');
+         $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
+         ## reconsume
+         redo A;
+       }
+     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
        ## (only happens in the PCDATA state)
        my $token = {type => COMMENT_TOKEN, data => ''};
        BC: {
-         if ($self->{next_input_character} == 0x003E) { # >
+         if ($self->{next_char} == 0x003E) { # >
-           $self->{state} = 'data';
+           !!!cp (124);
+           $self->{state} = DATA_STATE;
            !!!next-input-character;
            !!!emit ($token);
            redo A;
-         } elsif ($self->{next_input_character} == -1) {
+         } elsif ($self->{next_char} == -1) {
-           $self->{state} = 'data';
+           !!!cp (125);
+           $self->{state} = DATA_STATE;
            ## reconsume
            !!!emit ($token);
            redo A;
          } else {
-           $token->{data} .= chr ($self->{next_input_character});
+           !!!cp (126);
+           $token->{data} .= chr ($self->{next_char});
            !!!next-input-character;
            redo BC;
          }
        } # BC
-     } elsif ($self->{state} eq 'markup declaration open') {
+       die "$0: _get_next_token: unexpected case [BC]";
+     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
        ## (only happens in the PCDATA state)
        my @next_char;
-       push @next_char, $self->{next_input_character};
+       push @next_char, $self->{next_char};
-       if ($self->{next_input_character} == 0x002D) { # -
+       if ($self->{next_char} == 0x002D) { # -
          !!!next-input-character;
-         push @next_char, $self->{next_input_character};
+         push @next_char, $self->{next_char};
-         if ($self->{next_input_character} == 0x002D) { # -
+         if ($self->{next_char} == 0x002D) { # -
+           !!!cp (127);
            $self->{current_token} = {type => COMMENT_TOKEN, data => ''};
-           $self->{state} = 'comment start';
+           $self->{state} = COMMENT_START_STATE;
            !!!next-input-character;
            redo A;
+         } else {
+           !!!cp (128);
          }
-       } elsif ($self->{next_input_character} == 0x0044 or # D
+       } elsif ($self->{next_char} == 0x0044 or # D
-                $self->{next_input_character} == 0x0064) { # d
+                $self->{next_char} == 0x0064) { # d
          !!!next-input-character;
-         push @next_char, $self->{next_input_character};
+         push @next_char, $self->{next_char};
-         if ($self->{next_input_character} == 0x004F or # O
+         if ($self->{next_char} == 0x004F or # O
-             $self->{next_input_character} == 0x006F) { # o
+             $self->{next_char} == 0x006F) { # o
            !!!next-input-character;
-           push @next_char, $self->{next_input_character};
+           push @next_char, $self->{next_char};
-           if ($self->{next_input_character} == 0x0043 or # C
+           if ($self->{next_char} == 0x0043 or # C
-               $self->{next_input_character} == 0x0063) { # c
+               $self->{next_char} == 0x0063) { # c
              !!!next-input-character;
-             push @next_char, $self->{next_input_character};
+             push @next_char, $self->{next_char};
-             if ($self->{next_input_character} == 0x0054 or # T
+             if ($self->{next_char} == 0x0054 or # T
-                 $self->{next_input_character} == 0x0074) { # t
+                 $self->{next_char} == 0x0074) { # t
                !!!next-input-character;
-               push @next_char, $self->{next_input_character};
+               push @next_char, $self->{next_char};
-               if ($self->{next_input_character} == 0x0059 or # Y
+               if ($self->{next_char} == 0x0059 or # Y
-                   $self->{next_input_character} == 0x0079) { # y
+                   $self->{next_char} == 0x0079) { # y
                  !!!next-input-character;
-                 push @next_char, $self->{next_input_character};
+                 push @next_char, $self->{next_char};
-                 if ($self->{next_input_character} == 0x0050 or # P
+                 if ($self->{next_char} == 0x0050 or # P
-                     $self->{next_input_character} == 0x0070) { # p
+                     $self->{next_char} == 0x0070) { # p
                    !!!next-input-character;
-                   push @next_char, $self->{next_input_character};
+                   push @next_char, $self->{next_char};
-                   if ($self->{next_input_character} == 0x0045 or # E
+                   if ($self->{next_char} == 0x0045 or # E
-                       $self->{next_input_character} == 0x0065) { # e
+                       $self->{next_char} == 0x0065) { # e
-                     ## ISSUE: What a stupid code this is!
+                     !!!cp (129);
-                     $self->{state} = 'DOCTYPE';
+                     ## TODO: What a stupid code this is!
+                     $self->{state} = DOCTYPE_STATE;
                      !!!next-input-character;
                      redo A;
+                   } else {
+                     !!!cp (130);
                    }
+                 } else {
+                   !!!cp (131);
                  }
+               } else {
+                 !!!cp (132);
                }
+             } else {
+               !!!cp (133);
              }
+           } else {
+             !!!cp (134);
            }
+         } else {
+           !!!cp (135);
          }
+       } else {
+         !!!cp (136);
        }
        !!!parse-error (type => 'bogus comment');
-       $self->{next_input_character} = shift @next_char;
+       $self->{next_char} = shift @next_char;
        !!!back-next-input-character (@next_char);
-       $self->{state} = 'bogus comment';
+       $self->{state} = BOGUS_COMMENT_STATE;
        redo A;
        ## ISSUE: typos in spec: chacacters, is is a parse error
        ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
-     } elsif ($self->{state} eq 'comment start') {
+     } elsif ($self->{state} == COMMENT_START_STATE) {
-       if ($self->{next_input_character} == 0x002D) { # -
+       if ($self->{next_char} == 0x002D) { # -
-         $self->{state} = 'comment start dash';
+         !!!cp (137);
+         $self->{state} = COMMENT_START_DASH_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003E) { # >
+       } elsif ($self->{next_char} == 0x003E) { # >
+         !!!cp (138);
          !!!parse-error (type => 'bogus comment');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # comment
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (139);
          !!!parse-error (type => 'unclosed comment');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # comment
          redo A;
        } else {
+         !!!cp (140);
          $self->{current_token}->{data} # comment
-             .= chr ($self->{next_input_character});
+             .= chr ($self->{next_char});
-         $self->{state} = 'comment';
+         $self->{state} = COMMENT_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'comment start dash') {
+     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
-       if ($self->{next_input_character} == 0x002D) { # -
+       if ($self->{next_char} == 0x002D) { # -
-         $self->{state} = 'comment end';
+         !!!cp (141);
+         $self->{state} = COMMENT_END_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003E) { # >
+       } elsif ($self->{next_char} == 0x003E) { # >
+         !!!cp (142);
          !!!parse-error (type => 'bogus comment');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # comment
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (143);
          !!!parse-error (type => 'unclosed comment');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # comment
          redo A;
        } else {
+         !!!cp (144);
          $self->{current_token}->{data} # comment
-             .= '-' . chr ($self->{next_input_character});
+             .= '-' . chr ($self->{next_char});
-         $self->{state} = 'comment';
+         $self->{state} = COMMENT_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'comment') {
+     } elsif ($self->{state} == COMMENT_STATE) {
-       if ($self->{next_input_character} == 0x002D) { # -
+       if ($self->{next_char} == 0x002D) { # -
-         $self->{state} = 'comment end dash';
+         !!!cp (145);
+         $self->{state} = COMMENT_END_DASH_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (146);
          !!!parse-error (type => 'unclosed comment');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # comment
          redo A;
        } else {
-         $self->{current_token}->{data} .= chr ($self->{next_input_character}); # comment
+         !!!cp (147);
+         $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
          ## Stay in the state
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'comment end dash') {
+     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
-       if ($self->{next_input_character} == 0x002D) { # -
+       if ($self->{next_char} == 0x002D) { # -
-         $self->{state} = 'comment end';
+         !!!cp (148);
+         $self->{state} = COMMENT_END_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (149);
          !!!parse-error (type => 'unclosed comment');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # comment
          redo A;
        } else {
-         $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
+         !!!cp (150);
-         $self->{state} = 'comment';
+         $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment
+         $self->{state} = COMMENT_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'comment end') {
+     } elsif ($self->{state} == COMMENT_END_STATE) {
-       if ($self->{next_input_character} == 0x003E) { # >
+       if ($self->{next_char} == 0x003E) { # >
-         $self->{state} = 'data';
+         !!!cp (151);
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # comment
          redo A;
-       } elsif ($self->{next_input_character} == 0x002D) { # -
+       } elsif ($self->{next_char} == 0x002D) { # -
+         !!!cp (152);
          !!!parse-error (type => 'dash in comment');
          $self->{current_token}->{data} .= '-'; # comment
          ## Stay in the state
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (153);
          !!!parse-error (type => 'unclosed comment');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
          !!!emit ($self->{current_token}); # comment
          redo A;
        } else {
+         !!!cp (154);
          !!!parse-error (type => 'dash in comment');
-         $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
+         $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment
-         $self->{state} = 'comment';
+         $self->{state} = COMMENT_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'DOCTYPE') {
+     } elsif ($self->{state} == DOCTYPE_STATE) {
-       if ($self->{next_input_character} == 0x0009 or # HT
+       if ($self->{next_char} == 0x0009 or # HT
-           $self->{next_input_character} == 0x000A or # LF
+           $self->{next_char} == 0x000A or # LF
-           $self->{next_input_character} == 0x000B or # VT
+           $self->{next_char} == 0x000B or # VT
-           $self->{next_input_character} == 0x000C or # FF
+           $self->{next_char} == 0x000C or # FF
-           $self->{next_input_character} == 0x0020) { # SP
+           $self->{next_char} == 0x0020) { # SP
-         $self->{state} = 'before DOCTYPE name';
+         !!!cp (155);
+         $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
          !!!next-input-character;
          redo A;
        } else {
+         !!!cp (156);
          !!!parse-error (type => 'no space before DOCTYPE name');
-         $self->{state} = 'before DOCTYPE name';
+         $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
          ## reconsume
          redo A;
        }
-     } elsif ($self->{state} eq 'before DOCTYPE name') {
+     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
-       if ($self->{next_input_character} == 0x0009 or # HT
+       if ($self->{next_char} == 0x0009 or # HT
-           $self->{next_input_character} == 0x000A or # LF
+           $self->{next_char} == 0x000A or # LF
-           $self->{next_input_character} == 0x000B or # VT
+           $self->{next_char} == 0x000B or # VT
-           $self->{next_input_character} == 0x000C or # FF
+           $self->{next_char} == 0x000C or # FF
-           $self->{next_input_character} == 0x0020) { # SP
+           $self->{next_char} == 0x0020) { # SP
+         !!!cp (157);
          ## Stay in the state
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003E) { # >
+       } elsif ($self->{next_char} == 0x003E) { # >
+         !!!cp (158);
          !!!parse-error (type => 'no DOCTYPE name');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
-         !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
+         !!!emit ({type => DOCTYPE_TOKEN, quirks => 1});
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (159);
          !!!parse-error (type => 'no DOCTYPE name');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
+         !!!emit ({type => DOCTYPE_TOKEN, quirks => 1});
          redo A;
        } else {
+         !!!cp (160);
          $self->{current_token}
              = {type => DOCTYPE_TOKEN,
-                name => chr ($self->{next_input_character}),
+                name => chr ($self->{next_char}),
-                correct => 1};
+                #quirks => 0,
+               };
  ## ISSUE: "Set the token's name name to the" in the spec
-         $self->{state} = 'DOCTYPE name';
+         $self->{state} = DOCTYPE_NAME_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'DOCTYPE name') {
+     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
  ## ISSUE: Redundant "First," in the spec.
-       if ($self->{next_input_character} == 0x0009 or # HT
+       if ($self->{next_char} == 0x0009 or # HT
-           $self->{next_input_character} == 0x000A or # LF
+           $self->{next_char} == 0x000A or # LF
-           $self->{next_input_character} == 0x000B or # VT
+           $self->{next_char} == 0x000B or # VT
-           $self->{next_input_character} == 0x000C or # FF
+           $self->{next_char} == 0x000C or # FF
-           $self->{next_input_character} == 0x0020) { # SP
+           $self->{next_char} == 0x0020) { # SP
-         $self->{state} = 'after DOCTYPE name';
+         !!!cp (161);
+         $self->{state} = AFTER_DOCTYPE_NAME_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003E) { # >
+       } elsif ($self->{next_char} == 0x003E) { # >
-         $self->{state} = 'data';
+         !!!cp (162);
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (163);
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
+         !!!cp (164);
          $self->{current_token}->{name}
-           .= chr ($self->{next_input_character}); # DOCTYPE
+           .= chr ($self->{next_char}); # DOCTYPE
          ## Stay in the state
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'after DOCTYPE name') {
+     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
-       if ($self->{next_input_character} == 0x0009 or # HT
+       if ($self->{next_char} == 0x0009 or # HT
-           $self->{next_input_character} == 0x000A or # LF
+           $self->{next_char} == 0x000A or # LF
-           $self->{next_input_character} == 0x000B or # VT
+           $self->{next_char} == 0x000B or # VT
-           $self->{next_input_character} == 0x000C or # FF
+           $self->{next_char} == 0x000C or # FF
-           $self->{next_input_character} == 0x0020) { # SP
+           $self->{next_char} == 0x0020) { # SP
+         !!!cp (165);
          ## Stay in the state
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003E) { # >
+       } elsif ($self->{next_char} == 0x003E) { # >
-         $self->{state} = 'data';
+         !!!cp (166);
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (167);
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-       } elsif ($self->{next_input_character} == 0x0050 or # P
+       } elsif ($self->{next_char} == 0x0050 or # P
-                $self->{next_input_character} == 0x0070) { # p
+                $self->{next_char} == 0x0070) { # p
          !!!next-input-character;
-         if ($self->{next_input_character} == 0x0055 or # U
+         if ($self->{next_char} == 0x0055 or # U
-             $self->{next_input_character} == 0x0075) { # u
+             $self->{next_char} == 0x0075) { # u
            !!!next-input-character;
-           if ($self->{next_input_character} == 0x0042 or # B
+           if ($self->{next_char} == 0x0042 or # B
-               $self->{next_input_character} == 0x0062) { # b
+               $self->{next_char} == 0x0062) { # b
              !!!next-input-character;
-             if ($self->{next_input_character} == 0x004C or # L
+             if ($self->{next_char} == 0x004C or # L
-                 $self->{next_input_character} == 0x006C) { # l
+                 $self->{next_char} == 0x006C) { # l
                !!!next-input-character;
-               if ($self->{next_input_character} == 0x0049 or # I
+               if ($self->{next_char} == 0x0049 or # I
-                   $self->{next_input_character} == 0x0069) { # i
+                   $self->{next_char} == 0x0069) { # i
                  !!!next-input-character;
-                 if ($self->{next_input_character} == 0x0043 or # C
+                 if ($self->{next_char} == 0x0043 or # C
-                     $self->{next_input_character} == 0x0063) { # c
+                     $self->{next_char} == 0x0063) { # c
-                   $self->{state} = 'before DOCTYPE public identifier';
+                   !!!cp (168);
+                   $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
                    !!!next-input-character;
                    redo A;
+                 } else {
+                   !!!cp (169);
                  }
+               } else {
+                 !!!cp (170);
                }
+             } else {
+               !!!cp (171);
              }
+           } else {
+             !!!cp (172);
            }
+         } else {
+           !!!cp (173);
          }
          #
-       } elsif ($self->{next_input_character} == 0x0053 or # S
+       } elsif ($self->{next_char} == 0x0053 or # S
-                $self->{next_input_character} == 0x0073) { # s
+                $self->{next_char} == 0x0073) { # s
          !!!next-input-character;
-         if ($self->{next_input_character} == 0x0059 or # Y
+         if ($self->{next_char} == 0x0059 or # Y
-             $self->{next_input_character} == 0x0079) { # y
+             $self->{next_char} == 0x0079) { # y
            !!!next-input-character;
-           if ($self->{next_input_character} == 0x0053 or # S
+           if ($self->{next_char} == 0x0053 or # S
-               $self->{next_input_character} == 0x0073) { # s
+               $self->{next_char} == 0x0073) { # s
              !!!next-input-character;
-             if ($self->{next_input_character} == 0x0054 or # T
+             if ($self->{next_char} == 0x0054 or # T
-                 $self->{next_input_character} == 0x0074) { # t
+                 $self->{next_char} == 0x0074) { # t
                !!!next-input-character;
-               if ($self->{next_input_character} == 0x0045 or # E
+               if ($self->{next_char} == 0x0045 or # E
-                   $self->{next_input_character} == 0x0065) { # e
+                   $self->{next_char} == 0x0065) { # e
                  !!!next-input-character;
-                 if ($self->{next_input_character} == 0x004D or # M
+                 if ($self->{next_char} == 0x004D or # M
-                     $self->{next_input_character} == 0x006D) { # m
+                     $self->{next_char} == 0x006D) { # m
-                   $self->{state} = 'before DOCTYPE system identifier';
+                   !!!cp (174);
+                   $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
                    !!!next-input-character;
                    redo A;
+                 } else {
+                   !!!cp (175);
                  }
+               } else {
+                 !!!cp (176);
                }
+             } else {
+               !!!cp (177);
              }
+           } else {
+             !!!cp (178);
            }
+         } else {
+           !!!cp (179);
          }
          #
        } else {
+         !!!cp (180);
          !!!next-input-character;
          #
        }
        !!!parse-error (type => 'string after DOCTYPE name');
-       $self->{state} = 'bogus DOCTYPE';
+       $self->{current_token}->{quirks} = 1;
+       $self->{state} = BOGUS_DOCTYPE_STATE;
        # next-input-character is already done
        redo A;
-     } elsif ($self->{state} eq 'before DOCTYPE public identifier') {
+     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
        if ({
              0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
              #0x000D => 1, # HT, LF, VT, FF, SP, CR
-           }->{$self->{next_input_character}}) {
+           }->{$self->{next_char}}) {
+         !!!cp (181);
          ## Stay in the state
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} eq 0x0022) { # "
+       } elsif ($self->{next_char} eq 0x0022) { # "
+         !!!cp (182);
          $self->{current_token}->{public_identifier} = ''; # DOCTYPE
-         $self->{state} = 'DOCTYPE public identifier (double-quoted)';
+         $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} eq 0x0027) { # '
+       } elsif ($self->{next_char} eq 0x0027) { # '
+         !!!cp (183);
          $self->{current_token}->{public_identifier} = ''; # DOCTYPE
-         $self->{state} = 'DOCTYPE public identifier (single-quoted)';
+         $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} eq 0x003E) { # >
+       } elsif ($self->{next_char} eq 0x003E) { # >
+         !!!cp (184);
          !!!parse-error (type => 'no PUBLIC literal');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (185);
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
+         !!!cp (186);
          !!!parse-error (type => 'string after PUBLIC');
-         $self->{state} = 'bogus DOCTYPE';
+         $self->{current_token}->{quirks} = 1;
+         $self->{state} = BOGUS_DOCTYPE_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'DOCTYPE public identifier (double-quoted)') {
+     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
-       if ($self->{next_input_character} == 0x0022) { # "
+       if ($self->{next_char} == 0x0022) { # "
-         $self->{state} = 'after DOCTYPE public identifier';
+         !!!cp (187);
+         $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == 0x003E) { # >
+         !!!cp (188);
          !!!parse-error (type => 'unclosed PUBLIC literal');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
+         !!!next-input-character;
+         $self->{current_token}->{quirks} = 1;
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (189);
+         !!!parse-error (type => 'unclosed PUBLIC literal');
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
+         !!!cp (190);
          $self->{current_token}->{public_identifier} # DOCTYPE
-             .= chr $self->{next_input_character};
+             .= chr $self->{next_char};
          ## Stay in the state
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'DOCTYPE public identifier (single-quoted)') {
+     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
-       if ($self->{next_input_character} == 0x0027) { # '
+       if ($self->{next_char} == 0x0027) { # '
-         $self->{state} = 'after DOCTYPE public identifier';
+         !!!cp (191);
+         $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_char} == 0x003E) { # >
+         !!!cp (192);
+         !!!parse-error (type => 'unclosed PUBLIC literal');
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
+         $self->{current_token}->{quirks} = 1;
+         !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (193);
          !!!parse-error (type => 'unclosed PUBLIC literal');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
+         !!!cp (194);
          $self->{current_token}->{public_identifier} # DOCTYPE
-             .= chr $self->{next_input_character};
+             .= chr $self->{next_char};
          ## Stay in the state
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'after DOCTYPE public identifier') {
+     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
        if ({
              0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
              #0x000D => 1, # HT, LF, VT, FF, SP, CR
-           }->{$self->{next_input_character}}) {
+           }->{$self->{next_char}}) {
+         !!!cp (195);
          ## Stay in the state
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x0022) { # "
+       } elsif ($self->{next_char} == 0x0022) { # "
+         !!!cp (196);
          $self->{current_token}->{system_identifier} = ''; # DOCTYPE
-         $self->{state} = 'DOCTYPE system identifier (double-quoted)';
+         $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x0027) { # '
+       } elsif ($self->{next_char} == 0x0027) { # '
+         !!!cp (197);
          $self->{current_token}->{system_identifier} = ''; # DOCTYPE
-         $self->{state} = 'DOCTYPE system identifier (single-quoted)';
+         $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003E) { # >
+       } elsif ($self->{next_char} == 0x003E) { # >
-         $self->{state} = 'data';
+         !!!cp (198);
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (199);
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
+         !!!cp (200);
          !!!parse-error (type => 'string after PUBLIC literal');
-         $self->{state} = 'bogus DOCTYPE';
+         $self->{current_token}->{quirks} = 1;
+         $self->{state} = BOGUS_DOCTYPE_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'before DOCTYPE system identifier') {
+     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
        if ({
              0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
              #0x000D => 1, # HT, LF, VT, FF, SP, CR
-           }->{$self->{next_input_character}}) {
+           }->{$self->{next_char}}) {
+         !!!cp (201);
          ## Stay in the state
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x0022) { # "
+       } elsif ($self->{next_char} == 0x0022) { # "
+         !!!cp (202);
          $self->{current_token}->{system_identifier} = ''; # DOCTYPE
-         $self->{state} = 'DOCTYPE system identifier (double-quoted)';
+         $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x0027) { # '
+       } elsif ($self->{next_char} == 0x0027) { # '
+         !!!cp (203);
          $self->{current_token}->{system_identifier} = ''; # DOCTYPE
-         $self->{state} = 'DOCTYPE system identifier (single-quoted)';
+         $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003E) { # >
+       } elsif ($self->{next_char} == 0x003E) { # >
+         !!!cp (204);
          !!!parse-error (type => 'no SYSTEM literal');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (205);
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
+         !!!cp (206);
          !!!parse-error (type => 'string after SYSTEM');
-         $self->{state} = 'bogus DOCTYPE';
+         $self->{current_token}->{quirks} = 1;
+         $self->{state} = BOGUS_DOCTYPE_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'DOCTYPE system identifier (double-quoted)') {
+     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
-       if ($self->{next_input_character} == 0x0022) { # "
+       if ($self->{next_char} == 0x0022) { # "
-         $self->{state} = 'after DOCTYPE system identifier';
+         !!!cp (207);
+         $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
+         !!!next-input-character;
+         redo A;
+       } elsif ($self->{next_char} == 0x003E) { # >
+         !!!cp (208);
+         !!!parse-error (type => 'unclosed PUBLIC literal');
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
+         $self->{current_token}->{quirks} = 1;
+         !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (209);
          !!!parse-error (type => 'unclosed SYSTEM literal');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
+         !!!cp (210);
          $self->{current_token}->{system_identifier} # DOCTYPE
-             .= chr $self->{next_input_character};
+             .= chr $self->{next_char};
          ## Stay in the state
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'DOCTYPE system identifier (single-quoted)') {
+     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
-       if ($self->{next_input_character} == 0x0027) { # '
+       if ($self->{next_char} == 0x0027) { # '
-         $self->{state} = 'after DOCTYPE system identifier';
+         !!!cp (211);
+         $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == 0x003E) { # >
+         !!!cp (212);
+         !!!parse-error (type => 'unclosed PUBLIC literal');
+         $self->{state} = DATA_STATE;
+         !!!next-input-character;
+         $self->{current_token}->{quirks} = 1;
+         !!!emit ($self->{current_token}); # DOCTYPE
+         redo A;
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (213);
          !!!parse-error (type => 'unclosed SYSTEM literal');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
+         !!!cp (214);
          $self->{current_token}->{system_identifier} # DOCTYPE
-             .= chr $self->{next_input_character};
+             .= chr $self->{next_char};
          ## Stay in the state
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'after DOCTYPE system identifier') {
+     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
        if ({
              0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
              #0x000D => 1, # HT, LF, VT, FF, SP, CR
-           }->{$self->{next_input_character}}) {
+           }->{$self->{next_char}}) {
+         !!!cp (215);
          ## Stay in the state
          !!!next-input-character;
          redo A;
-       } elsif ($self->{next_input_character} == 0x003E) { # >
+       } elsif ($self->{next_char} == 0x003E) { # >
-         $self->{state} = 'data';
+         !!!cp (216);
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (217);
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
+         $self->{current_token}->{quirks} = 1;
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
+         !!!cp (218);
          !!!parse-error (type => 'string after SYSTEM literal');
-         $self->{state} = 'bogus DOCTYPE';
+         #$self->{current_token}->{quirks} = 1;
+         $self->{state} = BOGUS_DOCTYPE_STATE;
          !!!next-input-character;
          redo A;
        }
-     } elsif ($self->{state} eq 'bogus DOCTYPE') {
+     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
-       if ($self->{next_input_character} == 0x003E) { # >
+       if ($self->{next_char} == 0x003E) { # >
-         $self->{state} = 'data';
+         !!!cp (219);
+         $self->{state} = DATA_STATE;
          !!!next-input-character;
-         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
-       } elsif ($self->{next_input_character} == -1) {
+       } elsif ($self->{next_char} == -1) {
+         !!!cp (220);
          !!!parse-error (type => 'unclosed DOCTYPE');
-         $self->{state} = 'data';
+         $self->{state} = DATA_STATE;
          ## reconsume
-         delete $self->{current_token}->{correct};
          !!!emit ($self->{current_token}); # DOCTYPE
          redo A;
        } else {
+         !!!cp (221);
          ## Stay in the state
          !!!next-input-character;
          redo A;
-Line 1644 
 sub _get_next_token ($) {
+Line 2182 
 sub _get_next_token ($) {
    die "$0: _get_next_token: unexpected case";
  } # _get_next_token
- sub _tokenize_attempt_to_consume_an_entity ($$) {
+ sub _tokenize_attempt_to_consume_an_entity ($$$) {
-   my ($self, $in_attr) = @_;
+   my ($self, $in_attr, $additional) = @_;
    if ({
         0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
         0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
-       }->{$self->{next_input_character}}) {
+        $additional => 1,
+       }->{$self->{next_char}}) {
      ## Don't consume
      ## No error
      return undef;
-   } elsif ($self->{next_input_character} == 0x0023) { # #
+   } elsif ($self->{next_char} == 0x0023) { # #
      !!!next-input-character;
-     if ($self->{next_input_character} == 0x0078 or # x
+     if ($self->{next_char} == 0x0078 or # x
-         $self->{next_input_character} == 0x0058) { # X
+         $self->{next_char} == 0x0058) { # X
        my $code;
        X: {
-         my $x_char = $self->{next_input_character};
+         my $x_char = $self->{next_char};
          !!!next-input-character;
-         if (0x0030 <= $self->{next_input_character} and
+         if (0x0030 <= $self->{next_char} and
-             $self->{next_input_character} <= 0x0039) { # 0..9
+             $self->{next_char} <= 0x0039) { # 0..9
            $code ||= 0;
            $code *= 0x10;
-           $code += $self->{next_input_character} - 0x0030;
+           $code += $self->{next_char} - 0x0030;
            redo X;
-         } elsif (0x0061 <= $self->{next_input_character} and
+         } elsif (0x0061 <= $self->{next_char} and
-                  $self->{next_input_character} <= 0x0066) { # a..f
+                  $self->{next_char} <= 0x0066) { # a..f
            $code ||= 0;
            $code *= 0x10;
-           $code += $self->{next_input_character} - 0x0060 + 9;
+           $code += $self->{next_char} - 0x0060 + 9;
            redo X;
-         } elsif (0x0041 <= $self->{next_input_character} and
+         } elsif (0x0041 <= $self->{next_char} and
-                  $self->{next_input_character} <= 0x0046) { # A..F
+                  $self->{next_char} <= 0x0046) { # A..F
            $code ||= 0;
            $code *= 0x10;
-           $code += $self->{next_input_character} - 0x0040 + 9;
+           $code += $self->{next_char} - 0x0040 + 9;
            redo X;
          } elsif (not defined $code) { # no hexadecimal digit
            !!!parse-error (type => 'bare hcro');
-           !!!back-next-input-character ($x_char, $self->{next_input_character});
+           !!!back-next-input-character ($x_char, $self->{next_char});
-           $self->{next_input_character} = 0x0023; # #
+           $self->{next_char} = 0x0023; # #
            return undef;
-         } elsif ($self->{next_input_character} == 0x003B) { # ;
+         } elsif ($self->{next_char} == 0x003B) { # ;
            !!!next-input-character;
          } else {
            !!!parse-error (type => 'no refc');
-Line 1705 
 sub _tokenize_attempt_to_consume_an_enti
+Line 2244 
 sub _tokenize_attempt_to_consume_an_enti
            $code = $c1_entity_char->{$code};
          }
-         return {type => CHARACTER_TOKEN, data => chr $code};
+         return {type => CHARACTER_TOKEN, data => chr $code,
+                 has_reference => 1};
        } # X
-     } elsif (0x0030 <= $self->{next_input_character} and
+     } elsif (0x0030 <= $self->{next_char} and
-              $self->{next_input_character} <= 0x0039) { # 0..9
+              $self->{next_char} <= 0x0039) { # 0..9
-       my $code = $self->{next_input_character} - 0x0030;
+       my $code = $self->{next_char} - 0x0030;
        !!!next-input-character;
-       while (0x0030 <= $self->{next_input_character} and
+       while (0x0030 <= $self->{next_char} and
-                 $self->{next_input_character} <= 0x0039) { # 0..9
+                 $self->{next_char} <= 0x0039) { # 0..9
          $code *= 10;
-         $code += $self->{next_input_character} - 0x0030;
+         $code += $self->{next_char} - 0x0030;
          !!!next-input-character;
        }
-       if ($self->{next_input_character} == 0x003B) { # ;
+       if ($self->{next_char} == 0x003B) { # ;
          !!!next-input-character;
        } else {
          !!!parse-error (type => 'no refc');
-Line 1740 
 sub _tokenize_attempt_to_consume_an_enti
+Line 2280 
 sub _tokenize_attempt_to_consume_an_enti
          $code = $c1_entity_char->{$code};
        }
-       return {type => CHARACTER_TOKEN, data => chr $code};
+       return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1};
      } else {
        !!!parse-error (type => 'bare nero');
-       !!!back-next-input-character ($self->{next_input_character});
+       !!!back-next-input-character ($self->{next_char});
-       $self->{next_input_character} = 0x0023; # #
+       $self->{next_char} = 0x0023; # #
        return undef;
      }
-   } elsif ((0x0041 <= $self->{next_input_character} and
+   } elsif ((0x0041 <= $self->{next_char} and
-             $self->{next_input_character} <= 0x005A) or
+             $self->{next_char} <= 0x005A) or
-            (0x0061 <= $self->{next_input_character} and
+            (0x0061 <= $self->{next_char} and
-             $self->{next_input_character} <= 0x007A)) {
+             $self->{next_char} <= 0x007A)) {
-     my $entity_name = chr $self->{next_input_character};
+     my $entity_name = chr $self->{next_char};
      !!!next-input-character;
      my $value = $entity_name;
-Line 1761 
 sub _tokenize_attempt_to_consume_an_enti
+Line 2301 
 sub _tokenize_attempt_to_consume_an_enti
      while (length $entity_name < 10 and
             ## NOTE: Some number greater than the maximum length of entity name
-            ((0x0041 <= $self->{next_input_character} and # a
+            ((0x0041 <= $self->{next_char} and # a
-              $self->{next_input_character} <= 0x005A) or # x
+              $self->{next_char} <= 0x005A) or # x
-             (0x0061 <= $self->{next_input_character} and # a
+             (0x0061 <= $self->{next_char} and # a
-              $self->{next_input_character} <= 0x007A) or # z
+              $self->{next_char} <= 0x007A) or # z
-             (0x0030 <= $self->{next_input_character} and # 0
+             (0x0030 <= $self->{next_char} and # 0
-              $self->{next_input_character} <= 0x0039) or # 9
+              $self->{next_char} <= 0x0039) or # 9
-             $self->{next_input_character} == 0x003B)) { # ;
+             $self->{next_char} == 0x003B)) { # ;
-       $entity_name .= chr $self->{next_input_character};
+       $entity_name .= chr $self->{next_char};
        if (defined $EntityChar->{$entity_name}) {
-         if ($self->{next_input_character} == 0x003B) { # ;
+         if ($self->{next_char} == 0x003B) { # ;
            $value = $EntityChar->{$entity_name};
            $match = 1;
            !!!next-input-character;
-Line 1781 
 sub _tokenize_attempt_to_consume_an_enti
+Line 2321 
 sub _tokenize_attempt_to_consume_an_enti
            !!!next-input-character;
          }
        } else {
-         $value .= chr $self->{next_input_character};
+         $value .= chr $self->{next_char};
          $match *= 2;
          !!!next-input-character;
        }
      }
      if ($match > 0) {
-       return {type => CHARACTER_TOKEN, data => $value};
+       return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
      } elsif ($match < 0) {
        !!!parse-error (type => 'no refc');
        if ($in_attr and $match < -1) {
          return {type => CHARACTER_TOKEN, data => '&'.$entity_name};
        } else {
-         return {type => CHARACTER_TOKEN, data => $value};
+         return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
        }
      } else {
        !!!parse-error (type => 'bare ero');
-       ## NOTE: No characters are consumed in the spec.
+       ## NOTE: "No characters are consumed" in the spec.
        return {type => CHARACTER_TOKEN, data => '&'.$value};
      }
    } else {
-Line 1881 
 sub _tree_construction_initial ($) {
+Line 2421 
 sub _tree_construction_initial ($) {
        ## ISSUE: internalSubset = null??
        $self->{document}->append_child ($doctype);
-       if (not $token->{correct} or $doctype_name ne 'HTML') {
+       if ($token->{quirks} or $doctype_name ne 'HTML') {
          $self->{document}->manakai_compat_mode ('quirks');
        } elsif (defined $token->{public_identifier}) {
          my $pubid = $token->{public_identifier};
-Line 1935 
 sub _tree_construction_initial ($) {
+Line 2475 
 sub _tree_construction_initial ($) {
            "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
            "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
            "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
+           "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//EN" => 1,
+           "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//EN" => 1,
+           "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//EN" => 1,
            "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
            "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
            "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
-Line 2046 
 sub _tree_construction_root_element ($)
+Line 2589 
 sub _tree_construction_root_element ($)
              redo B;
            }
          }
+         $self->{application_cache_selection}->(undef);
+         #
+       } elsif ($token->{type} == START_TAG_TOKEN) {
+         if ($token->{tag_name} eq 'html' and
+             $token->{attributes}->{manifest}) {
+           $self->{application_cache_selection}
+                ->($token->{attributes}->{manifest}->{value});
+           ## ISSUE: No relative reference resolution?
+         } else {
+           $self->{application_cache_selection}->(undef);
+         }
+         ## ISSUE: There is an issue in the spec
          #
        } elsif ({
-                 START_TAG_TOKEN, 1,
                  END_TAG_TOKEN, 1,
                  END_OF_FILE_TOKEN, 1,
                 }->{$token->{type}}) {
+         $self->{application_cache_selection}->(undef);
          ## ISSUE: There is an issue in the spec
          #
        } else {
          die "$0: $token->{type}: Unknown token type";
        }
        my $root_element; !!!create-element ($root_element, 'html');
        $self->{document}->append_child ($root_element);
        push @{$self->{open_elements}}, [$root_element, 'html'];
-Line 2539 
 sub _tree_construction_main ($) {
+Line 3099 
 sub _tree_construction_main ($) {
        !!!next-token;
        redo B;
      } elsif ($token->{type} == END_OF_FILE_TOKEN) {
-       if ($self->{insertion_mode} == AFTER_HTML_BODY_IM or
+       if ($self->{insertion_mode} & AFTER_HTML_IMS) {
-           $self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
          #
        } else {
          ## Generate implied end tags
-Line 2596 
 sub _tree_construction_main ($) {
+Line 3155 
 sub _tree_construction_main ($) {
        redo B;
      } elsif ($token->{type} == COMMENT_TOKEN) {
        my $comment = $self->{document}->create_comment ($token->{data});
-       if ($self->{insertion_mode} == AFTER_HTML_BODY_IM or
+       if ($self->{insertion_mode} & AFTER_HTML_IMS) {
-           $self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
          $self->{document}->append_child ($comment);
        } elsif ($self->{insertion_mode} == AFTER_BODY_IM) {
          $self->{open_elements}->[0]->[0]->append_child ($comment);
-Line 2606 
 sub _tree_construction_main ($) {
+Line 3164 
 sub _tree_construction_main ($) {
        }
        !!!next-token;
        redo B;
-     } elsif ($self->{insertion_mode} == IN_HEAD_IM or
+     } elsif ($self->{insertion_mode} & HEAD_IMS) {
-              $self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM or
-              $self->{insertion_mode} == AFTER_HEAD_IM or
-              $self->{insertion_mode} == BEFORE_HEAD_IM) {
        if ($token->{type} == CHARACTER_TOKEN) {
          if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
            $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
-Line 2718 
 sub _tree_construction_main ($) {
+Line 3273 
 sub _tree_construction_main ($) {
                  push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
                }
                !!!insert-element ($token->{tag_name}, $token->{attributes});
-               pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
+               my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
                unless ($self->{confident}) {
-                 my $charset;
                  if ($token->{attributes}->{charset}) { ## TODO: And if supported
-                   $charset = $token->{attributes}->{charset}->{value};
+                   $self->{change_encoding}
-                 }
+                       ->($self, $token->{attributes}->{charset}->{value});
-                 if ($token->{attributes}->{'http-equiv'}) {
+                   $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
+                       ->set_user_data (manakai_has_reference =>
+                                            $token->{attributes}->{charset}
+                                                ->{has_reference});
+                 } elsif ($token->{attributes}->{content}) {
                    ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
-                   if ($token->{attributes}->{'http-equiv'}->{value}
+                   if ($token->{attributes}->{content}->{value}
-                       =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
+                       =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
+                           [\x09-\x0D\x20]*=
                            [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
                            ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
-                     $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
+                     $self->{change_encoding}
-                   } ## TODO: And if supported
+                         ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
+                     $meta_el->[0]->get_attribute_node_ns (undef, 'content')
+                         ->set_user_data (manakai_has_reference =>
+                                              $token->{attributes}->{content}
+                                                    ->{has_reference});
+                   }
+                 }
+               } else {
+                 if ($token->{attributes}->{charset}) {
+                   $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
+                       ->set_user_data (manakai_has_reference =>
+                                            $token->{attributes}->{charset}
+                                                ->{has_reference});
+                 }
+                 if ($token->{attributes}->{content}) {
+                   $meta_el->[0]->get_attribute_node_ns (undef, 'content')
+                       ->set_user_data (manakai_has_reference =>
+                                            $token->{attributes}->{content}
+                                                ->{has_reference});
                  }
-                 ## TODO: Change the encoding
                }
-               ## TODO: Extracting |charset| from |meta|.
                pop @{$self->{open_elements}}
                    if $self->{insertion_mode} == AFTER_HEAD_IM;
                !!!next-token;
-Line 2986 
 sub _tree_construction_main ($) {
+Line 3562 
 sub _tree_construction_main ($) {
            }
            ## ISSUE: An issue in the spec.
-     } elsif ($self->{insertion_mode} == IN_BODY_IM or
+     } elsif ($self->{insertion_mode} & BODY_IMS) {
-              $self->{insertion_mode} == IN_CELL_IM or
-              $self->{insertion_mode} == IN_CAPTION_IM) {
            if ($token->{type} == CHARACTER_TOKEN) {
              ## NOTE: There is a code clone of "character in body".
              $reconstruct_active_formatting_elements->($insert_to_current);
-Line 3282 
 sub _tree_construction_main ($) {
+Line 3856 
 sub _tree_construction_main ($) {
              } elsif ({
                        body => 1, col => 1, colgroup => 1, html => 1,
                       }->{$token->{tag_name}}) {
-               if ($self->{insertion_mode} == IN_CELL_IM or
+               if ($self->{insertion_mode} & BODY_TABLE_IMS) {
-                   $self->{insertion_mode} == IN_CAPTION_IM) {
                  !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
                  ## Ignore the token
                  !!!next-token;
-Line 3309 
 sub _tree_construction_main ($) {
+Line 3882 
 sub _tree_construction_main ($) {
        $insert = $insert_to_current;
        #
-     } elsif ($self->{insertion_mode} == IN_ROW_IM or
+     } elsif ($self->{insertion_mode} & TABLE_IMS) {
-              $self->{insertion_mode} == IN_TABLE_BODY_IM or
+       if ($token->{type} == CHARACTER_TOKEN) {
-              $self->{insertion_mode} == IN_TABLE_IM) {
-           if ($token->{type} == CHARACTER_TOKEN) {
-             ## NOTE: There are "character in table" code clones.
              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
                $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
-Line 3370 
 sub _tree_construction_main ($) {
+Line 3940 
 sub _tree_construction_main ($) {
              !!!next-token;
              redo B;
-           } elsif ($token->{type} == START_TAG_TOKEN) {
+       } elsif ($token->{type} == START_TAG_TOKEN) {
              if ({
                   tr => ($self->{insertion_mode} != IN_ROW_IM),
                   th => 1, td => 1,
-Line 3556 
 sub _tree_construction_main ($) {
+Line 4126 
 sub _tree_construction_main ($) {
                  die "$0: in table: <>: $token->{tag_name}";
                }
              } elsif ($token->{tag_name} eq 'table') {
-               ## NOTE: There are code clones for this "table in table"
                !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
                ## As if </table>
-Line 3604 
 sub _tree_construction_main ($) {
+Line 4173 
 sub _tree_construction_main ($) {
                ## reprocess
                redo B;
-             } else {
+         } else {
-               #
+           !!!parse-error (type => 'in table:'.$token->{tag_name});
-             }
-           } elsif ($token->{type} == END_TAG_TOKEN) {
+           $insert = $insert_to_foster;
+           #
+         }
+       } elsif ($token->{type} == END_TAG_TOKEN) {
              if ($token->{tag_name} eq 'tr' and
                  $self->{insertion_mode} == IN_ROW_IM) {
                ## have an element in table scope
-Line 3766 
 sub _tree_construction_main ($) {
+Line 4338 
 sub _tree_construction_main ($) {
              } elsif ({
                        tbody => 1, tfoot => 1, thead => 1,
                       }->{$token->{tag_name}} and
-                      ($self->{insertion_mode} == IN_ROW_IM or
+                      $self->{insertion_mode} & ROW_IMS) {
-                       $self->{insertion_mode} == IN_TABLE_BODY_IM)) {
                if ($self->{insertion_mode} == IN_ROW_IM) {
                  ## have an element in table scope
                  my $i;
-Line 3865 
 sub _tree_construction_main ($) {
+Line 4436 
 sub _tree_construction_main ($) {
                ## Ignore the token
                !!!next-token;
                redo B;
-             } else {
+         } else {
-               #
+           !!!parse-error (type => 'in table:/'.$token->{tag_name});
-             }
-           } else {
-             die "$0: $token->{type}: Unknown token type";
-           }
-       !!!parse-error (type => 'in table:'.$token->{tag_name});
-       $insert = $insert_to_foster;
+           $insert = $insert_to_foster;
-       #
+           #
+         }
+       } else {
+         die "$0: $token->{type}: Unknown token type";
+       }
      } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
            if ($token->{type} == CHARACTER_TOKEN) {
              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
-Line 3934 
 sub _tree_construction_main ($) {
+Line 4503 
 sub _tree_construction_main ($) {
              redo B;
            }
      } elsif ($self->{insertion_mode} == IN_SELECT_IM) {
-           if ($token->{type} == CHARACTER_TOKEN) {
+       if ($token->{type} == CHARACTER_TOKEN) {
-             $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
+         $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
-             !!!next-token;
+         !!!next-token;
-             redo B;
+         redo B;
-           } elsif ($token->{type} == START_TAG_TOKEN) {
+       } elsif ($token->{type} == START_TAG_TOKEN) {
              if ($token->{tag_name} eq 'option') {
                if ($self->{open_elements}->[-1]->[1] eq 'option') {
                  ## As if </option>
-Line 3991 
 sub _tree_construction_main ($) {
+Line 4560 
 sub _tree_construction_main ($) {
                !!!next-token;
                redo B;
-             } else {
+         } else {
-               #
+           !!!parse-error (type => 'in select:'.$token->{tag_name});
-             }
+           ## Ignore the token
-           } elsif ($token->{type} == END_TAG_TOKEN) {
+           !!!next-token;
+           redo B;
+         }
+       } elsif ($token->{type} == END_TAG_TOKEN) {
              if ($token->{tag_name} eq 'optgroup') {
                if ($self->{open_elements}->[-1]->[1] eq 'option' and
                    $self->{open_elements}->[-2]->[1] eq 'optgroup') {
-Line 4096 
 sub _tree_construction_main ($) {
+Line 4668 
 sub _tree_construction_main ($) {
                ## reprocess
                redo B;
-             } else {
+         } else {
-               #
+           !!!parse-error (type => 'in select:/'.$token->{tag_name});
-             }
-           } else {
-             #
-           }
-           !!!parse-error (type => 'in select:'.$token->{tag_name});
            ## Ignore the token
            !!!next-token;
            redo B;
-     } elsif ($self->{insertion_mode} == AFTER_BODY_IM or
+         }
-              $self->{insertion_mode} == AFTER_HTML_BODY_IM) {
+       } else {
+         die "$0: $token->{type}: Unknown token type";
+       }
+     } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
        if ($token->{type} == CHARACTER_TOKEN) {
          if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
            my $data = $1;
-Line 4178 
 sub _tree_construction_main ($) {
+Line 4747 
 sub _tree_construction_main ($) {
        } else {
          die "$0: $token->{type}: Unknown token type";
        }
-     } elsif ($self->{insertion_mode} == IN_FRAMESET_IM or
+     } elsif ($self->{insertion_mode} & FRAME_IMS) {
-              $self->{insertion_mode} == AFTER_FRAMESET_IM or
-              $self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) {
        if ($token->{type} == CHARACTER_TOKEN) {
          if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
            $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
-Line 4317 
 sub _tree_construction_main ($) {
+Line 4884 
 sub _tree_construction_main ($) {
        } elsif ($token->{tag_name} eq 'meta') {
          ## NOTE: This is an "as if in head" code clone, only "-t" differs
          !!!insert-element-t ($token->{tag_name}, $token->{attributes});
-         pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
+         my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
          unless ($self->{confident}) {
-           my $charset;
            if ($token->{attributes}->{charset}) { ## TODO: And if supported
-             $charset = $token->{attributes}->{charset}->{value};
+             $self->{change_encoding}
-           }
+                 ->($self, $token->{attributes}->{charset}->{value});
-           if ($token->{attributes}->{'http-equiv'}) {
+             $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
+                 ->set_user_data (manakai_has_reference =>
+                                      $token->{attributes}->{charset}
+                                          ->{has_reference});
+           } elsif ($token->{attributes}->{content}) {
              ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
-             if ($token->{attributes}->{'http-equiv'}->{value}
+             if ($token->{attributes}->{content}->{value}
-                 =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=
+                 =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
+                     [\x09-\x0D\x20]*=
                      [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
                      ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
-               $charset = defined $1 ? $1 : defined $2 ? $2 : $3;
+               $self->{change_encoding}
-             } ## TODO: And if supported
+                   ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
+               $meta_el->[0]->get_attribute_node_ns (undef, 'content')
+                   ->set_user_data (manakai_has_reference =>
+                                        $token->{attributes}->{content}
+                                              ->{has_reference});
+             }
+           }
+         } else {
+           if ($token->{attributes}->{charset}) {
+             $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
+                 ->set_user_data (manakai_has_reference =>
+                                      $token->{attributes}->{charset}
+                                          ->{has_reference});
+           }
+           if ($token->{attributes}->{content}) {
+             $meta_el->[0]->get_attribute_node_ns (undef, 'content')
+                 ->set_user_data (manakai_has_reference =>
+                                      $token->{attributes}->{content}
+                                          ->{has_reference});
            }
-           ## TODO: Change the encoding
          }
          !!!next-token;
-Line 4640 
 sub _tree_construction_main ($) {
+Line 5229 
 sub _tree_construction_main ($) {
          INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
            my $node = $self->{open_elements}->[$_];
            if ($node->[1] eq 'nobr') {
-             !!!parse-error (type => 'not closed:nobr');
+             !!!parse-error (type => 'in nobr:nobr');
              !!!back-token;
              $token = {type => END_TAG_TOKEN, tag_name => 'nobr'};
              redo B;
-Line 4845 
 sub _tree_construction_main ($) {
+Line 5434 
 sub _tree_construction_main ($) {
                  noframes => 1,
                  noscript => 0, ## TODO: 1 if scripting is enabled
                 }->{$token->{tag_name}}) {
-         ## NOTE: There are two "as if in body" code clones.
+         ## NOTE: There is an "as if in body" code clone.
          $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
          redo B;
        } elsif ($token->{tag_name} eq 'select') {
-Line 5001 
 sub _tree_construction_main ($) {
+Line 5590 
 sub _tree_construction_main ($) {
          if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
            pop @{$self->{open_elements}};
          } else {
-           !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
+           !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
          }
          undef $self->{form_element};
-Line 5039 
 sub _tree_construction_main ($) {
+Line 5628 
 sub _tree_construction_main ($) {
          } # INSCOPE
          if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
-           !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
+           !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
          }
          splice @{$self->{open_elements}}, $i if defined $i;
-Line 5108 
 sub _tree_construction_main ($) {
+Line 5697 
 sub _tree_construction_main ($) {
              ## Step 2
              if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
+               ## NOTE: <x><y></x>
                !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
              }
-Line 5158 
 sub set_inner_html ($$$) {
+Line 5748 
 sub set_inner_html ($$$) {
    my $s = \$_[0];
    my $onerror = $_[1];
+   ## ISSUE: Should {confident} be true?
    my $nt = $node->node_type;
    if ($nt == 9) {
      # MUST
-Line 5190 
 sub set_inner_html ($$$) {
+Line 5782 
 sub set_inner_html ($$$) {
      my $i = 0;
      my $line = 1;
      my $column = 0;
-     $p->{set_next_input_character} = sub {
+     $p->{set_next_char} = sub {
        my $self = shift;
-       pop @{$self->{prev_input_character}};
+       pop @{$self->{prev_char}};
-       unshift @{$self->{prev_input_character}}, $self->{next_input_character};
+       unshift @{$self->{prev_char}}, $self->{next_char};
-       $self->{next_input_character} = -1 and return if $i >= length $$s;
+       $self->{next_char} = -1 and return if $i >= length $$s;
-       $self->{next_input_character} = ord substr $$s, $i++, 1;
+       $self->{next_char} = ord substr $$s, $i++, 1;
        $column++;
-       if ($self->{next_input_character} == 0x000A) { # LF
+       if ($self->{next_char} == 0x000A) { # LF
          $line++;
          $column = 0;
-       } elsif ($self->{next_input_character} == 0x000D) { # CR
+       } elsif ($self->{next_char} == 0x000D) { # CR
          $i++ if substr ($$s, $i, 1) eq "\x0A";
-         $self->{next_input_character} = 0x000A; # LF # MUST
+         $self->{next_char} = 0x000A; # LF # MUST
          $line++;
          $column = 0;
-       } elsif ($self->{next_input_character} > 0x10FFFF) {
+       } elsif ($self->{next_char} > 0x10FFFF) {
-         $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
+         $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
-       } elsif ($self->{next_input_character} == 0x0000) { # NULL
+       } elsif ($self->{next_char} == 0x0000) { # NULL
          !!!parse-error (type => 'NULL');
-         $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
+         $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
        }
      };
-     $p->{prev_input_character} = [-1, -1, -1];
+     $p->{prev_char} = [-1, -1, -1];
-     $p->{next_input_character} = -1;
+     $p->{next_char} = -1;
      my $ponerror = $onerror || sub {
        my (%opt) = @_;
-Line 5230 
 sub set_inner_html ($$$) {
+Line 5822 
 sub set_inner_html ($$$) {
      $p->_initialize_tree_constructor;
      ## Step 2
-     my $node_ln = $node->local_name;
+     my $node_ln = $node->manakai_local_name;
      $p->{content_model} = {
        title => RCDATA_CONTENT_MODEL,
        textarea => RCDATA_CONTENT_MODEL,
-Line 5270 
 sub set_inner_html ($$$) {
+Line 5862 
 sub set_inner_html ($$$) {
        if ($anode->node_type == 1) {
          my $nsuri = $anode->namespace_uri;
          if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') {
-           if ($anode->local_name eq 'form') { ## TODO: case?
+           if ($anode->manakai_local_name eq 'form') {
              $p->{form_element} = $anode;
              last AN;
            }
-Line 5310 
 sub set_inner_html ($$$) {
+Line 5902 
 sub set_inner_html ($$$) {
  } # tree construction stage
- sub get_inner_html ($$$) {
+ package Whatpm::HTML::RestartParser;
-   my (undef, $node, $on_error) = @_;
+ push our @ISA, 'Error';
-   ## Step 1
-   my $s = '';
-   my $in_cdata;
-   my $parent = $node;
-   while (defined $parent) {
-     if ($parent->node_type == 1 and
-         $parent->namespace_uri eq 'http://www.w3.org/1999/xhtml' and
-         {
-           style => 1, script => 1, xmp => 1, iframe => 1,
-           noembed => 1, noframes => 1, noscript => 1,
-         }->{$parent->local_name}) { ## TODO: case thingy
-       $in_cdata = 1;
-     }
-     $parent = $parent->parent_node;
-   }
-   ## Step 2
-   my @node = @{$node->child_nodes};
-   C: while (@node) {
-     my $child = shift @node;
-     unless (ref $child) {
-       if ($child eq 'cdata-out') {
-         $in_cdata = 0;
-       } else {
-         $s .= $child; # end tag
-       }
-       next C;
-     }
-     my $nt = $child->node_type;
-     if ($nt == 1) { # Element
-       my $tag_name = $child->tag_name; ## TODO: manakai_tag_name
-       $s .= '<' . $tag_name;
-       ## NOTE: Non-HTML case:
-       ## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191>
-       my @attrs = @{$child->attributes}; # sort order MUST be stable
-       for my $attr (@attrs) { # order is implementation dependent
-         my $attr_name = $attr->name; ## TODO: manakai_name
-         $s .= ' ' . $attr_name . '="';
-         my $attr_value = $attr->value;
-         ## escape
-         $attr_value =~ s/&/&amp;/g;
-         $attr_value =~ s/</&lt;/g;
-         $attr_value =~ s/>/&gt;/g;
-         $attr_value =~ s/"/&quot;/g;
-         $s .= $attr_value . '"';
-       }
-       $s .= '>';
-       next C if {
-         area => 1, base => 1, basefont => 1, bgsound => 1,
-         br => 1, col => 1, embed => 1, frame => 1, hr => 1,
-         img => 1, input => 1, link => 1, meta => 1, param => 1,
-         spacer => 1, wbr => 1,
-       }->{$tag_name};
-       $s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea';
-       if (not $in_cdata and {
-         style => 1, script => 1, xmp => 1, iframe => 1,
-         noembed => 1, noframes => 1, noscript => 1,
-         plaintext => 1,
-       }->{$tag_name}) {
-         unshift @node, 'cdata-out';
-         $in_cdata = 1;
-       }
-       unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';
-     } elsif ($nt == 3 or $nt == 4) {
-       if ($in_cdata) {
-         $s .= $child->data;
-       } else {
-         my $value = $child->data;
-         $value =~ s/&/&amp;/g;
-         $value =~ s/</&lt;/g;
-         $value =~ s/>/&gt;/g;
-         $value =~ s/"/&quot;/g;
-         $s .= $value;
-       }
-     } elsif ($nt == 8) {
-       $s .= '<!--' . $child->data . '-->';
-     } elsif ($nt == 10) {
-       $s .= '<!DOCTYPE ' . $child->name . '>';
-     } elsif ($nt == 5) { # entrefs
-       push @node, @{$child->child_nodes};
-     } else {
-       $on_error->($child) if defined $on_error;
-     }
-     ## ISSUE: This code does not support PIs.
-   } # C
-   ## Step 3
-   return \$s;
- } # get_inner_html
 ;
  # $Date$

 Legend:
-Removed from v.1.55
 changed lines
+Added in v.1.77

admin@suikawiki.org	ViewVC Help
Powered by ViewVC 1.1.24