/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch

revision 1.166 by wakaba, Sat Sep 13 08:21:35 2008 UTC revision 1.194 by wakaba, Sat Oct 4 05:53:45 2008 UTC
# Line 3  use strict; Line 3  use strict;
3  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4  use Error qw(:try);  use Error qw(:try);
5    
6    ## NOTE: This module doesn't check all HTML5 parse errors; character
7    ## encoding related parse errors are expected to be handled by relevant
8    ## modules.
9    ## Parse errors for control characters that are not allowed in HTML5
10    ## documents, for surrogate code points, and for noncharacter code
11    ## points, as well as U+FFFD substitutions for characters whose code point
12    ## is higher than U+10FFFF may be detected by combining the parser with
13    ## the checker implemented by Whatpm::Charset::UnicodeChecker (for its
14    ## usage example, see |t/HTML-tree.t| in the Whatpm package or the
15    ## WebHACC::Language::HTML module in the WebHACC package).
16    
17  ## ISSUE:  ## ISSUE:
18  ## var doc = implementation.createDocument (null, null, null);  ## var doc = implementation.createDocument (null, null, null);
19  ## doc.write ('');  ## doc.write ('');
# Line 55  sub TABLE_ROWS_EL () { Line 66  sub TABLE_ROWS_EL () {
66  }  }
67    
68  ## NOTE: Used in "generate implied end tags" algorithm.  ## NOTE: Used in "generate implied end tags" algorithm.
69  ## NOTE: There is a code where a modified version of END_TAG_OPTIONAL_EL  ## NOTE: There is a code where a modified version of
70  ## is used in "generate implied end tags" implementation (search for the  ## END_TAG_OPTIONAL_EL is used in "generate implied end tags"
71  ## function mae).  ## implementation (search for the algorithm name).
72  sub END_TAG_OPTIONAL_EL () {  sub END_TAG_OPTIONAL_EL () {
73    DD_EL |    DD_EL |
74    DT_EL |    DT_EL |
75    LI_EL |    LI_EL |
76      OPTION_EL |
77      OPTGROUP_EL |
78    P_EL |    P_EL |
79    RUBY_COMPONENT_EL    RUBY_COMPONENT_EL
80  }  }
# Line 130  my $el_category = { Line 143  my $el_category = {
143    address => ADDRESS_EL,    address => ADDRESS_EL,
144    applet => MISC_SCOPING_EL,    applet => MISC_SCOPING_EL,
145    area => MISC_SPECIAL_EL,    area => MISC_SPECIAL_EL,
146      article => MISC_SPECIAL_EL,
147      aside => MISC_SPECIAL_EL,
148    b => FORMATTING_EL,    b => FORMATTING_EL,
149    base => MISC_SPECIAL_EL,    base => MISC_SPECIAL_EL,
150    basefont => MISC_SPECIAL_EL,    basefont => MISC_SPECIAL_EL,
# Line 143  my $el_category = { Line 158  my $el_category = {
158    center => MISC_SPECIAL_EL,    center => MISC_SPECIAL_EL,
159    col => MISC_SPECIAL_EL,    col => MISC_SPECIAL_EL,
160    colgroup => MISC_SPECIAL_EL,    colgroup => MISC_SPECIAL_EL,
161      command => MISC_SPECIAL_EL,
162      datagrid => MISC_SPECIAL_EL,
163    dd => DD_EL,    dd => DD_EL,
164      details => MISC_SPECIAL_EL,
165      dialog => MISC_SPECIAL_EL,
166    dir => MISC_SPECIAL_EL,    dir => MISC_SPECIAL_EL,
167    div => DIV_EL,    div => DIV_EL,
168    dl => MISC_SPECIAL_EL,    dl => MISC_SPECIAL_EL,
169    dt => DT_EL,    dt => DT_EL,
170    em => FORMATTING_EL,    em => FORMATTING_EL,
171    embed => MISC_SPECIAL_EL,    embed => MISC_SPECIAL_EL,
172      eventsource => MISC_SPECIAL_EL,
173    fieldset => MISC_SPECIAL_EL,    fieldset => MISC_SPECIAL_EL,
174      figure => MISC_SPECIAL_EL,
175    font => FORMATTING_EL,    font => FORMATTING_EL,
176      footer => MISC_SPECIAL_EL,
177    form => FORM_EL,    form => FORM_EL,
178    frame => MISC_SPECIAL_EL,    frame => MISC_SPECIAL_EL,
179    frameset => FRAMESET_EL,    frameset => FRAMESET_EL,
# Line 162  my $el_category = { Line 184  my $el_category = {
184    h5 => HEADING_EL,    h5 => HEADING_EL,
185    h6 => HEADING_EL,    h6 => HEADING_EL,
186    head => MISC_SPECIAL_EL,    head => MISC_SPECIAL_EL,
187      header => MISC_SPECIAL_EL,
188    hr => MISC_SPECIAL_EL,    hr => MISC_SPECIAL_EL,
189    html => HTML_EL,    html => HTML_EL,
190    i => FORMATTING_EL,    i => FORMATTING_EL,
191    iframe => MISC_SPECIAL_EL,    iframe => MISC_SPECIAL_EL,
192    img => MISC_SPECIAL_EL,    img => MISC_SPECIAL_EL,
193      #image => MISC_SPECIAL_EL, ## NOTE: Commented out in the spec.
194    input => MISC_SPECIAL_EL,    input => MISC_SPECIAL_EL,
195    isindex => MISC_SPECIAL_EL,    isindex => MISC_SPECIAL_EL,
196    li => LI_EL,    li => LI_EL,
# Line 175  my $el_category = { Line 199  my $el_category = {
199    marquee => MISC_SCOPING_EL,    marquee => MISC_SCOPING_EL,
200    menu => MISC_SPECIAL_EL,    menu => MISC_SPECIAL_EL,
201    meta => MISC_SPECIAL_EL,    meta => MISC_SPECIAL_EL,
202      nav => MISC_SPECIAL_EL,
203    nobr => NOBR_EL | FORMATTING_EL,    nobr => NOBR_EL | FORMATTING_EL,
204    noembed => MISC_SPECIAL_EL,    noembed => MISC_SPECIAL_EL,
205    noframes => MISC_SPECIAL_EL,    noframes => MISC_SPECIAL_EL,
# Line 193  my $el_category = { Line 218  my $el_category = {
218    s => FORMATTING_EL,    s => FORMATTING_EL,
219    script => MISC_SPECIAL_EL,    script => MISC_SPECIAL_EL,
220    select => SELECT_EL,    select => SELECT_EL,
221      section => MISC_SPECIAL_EL,
222    small => FORMATTING_EL,    small => FORMATTING_EL,
223    spacer => MISC_SPECIAL_EL,    spacer => MISC_SPECIAL_EL,
224    strike => FORMATTING_EL,    strike => FORMATTING_EL,
# Line 312  my $foreign_attr_xname = { Line 338  my $foreign_attr_xname = {
338    
339  ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.  ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error.
340    
341  my $c1_entity_char = {  my $charref_map = {
342      0x0D => 0x000A,
343    0x80 => 0x20AC,    0x80 => 0x20AC,
344    0x81 => 0xFFFD,    0x81 => 0xFFFD,
345    0x82 => 0x201A,    0x82 => 0x201A,
# Line 345  my $c1_entity_char = { Line 372  my $c1_entity_char = {
372    0x9D => 0xFFFD,    0x9D => 0xFFFD,
373    0x9E => 0x017E,    0x9E => 0x017E,
374    0x9F => 0x0178,    0x9F => 0x0178,
375  }; # $c1_entity_char  }; # $charref_map
376    $charref_map->{$_} = 0xFFFD
377        for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
378            0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
379            0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
380            0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
381            0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
382            0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
383            0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
384    
385    ## TODO: Invoke the reset algorithm when a resettable element is
386    ## created (cf. HTML5 revision 2259).
387    
388  sub parse_byte_string ($$$$;$) {  sub parse_byte_string ($$$$;$) {
389    my $self = shift;    my $self = shift;
# Line 390  sub parse_byte_stream ($$$$;$$) { Line 428  sub parse_byte_stream ($$$$;$$) {
428            ## TODO: Is this ok?  Transfer protocol's parameter should be            ## TODO: Is this ok?  Transfer protocol's parameter should be
429            ## interpreted in its semantics?            ## interpreted in its semantics?
430    
       ## ISSUE: Unsupported encoding is not ignored according to the spec.  
431        ($char_stream, $e_status) = $charset->get_decode_handle        ($char_stream, $e_status) = $charset->get_decode_handle
432            ($byte_stream, allow_error_reporting => 1,            ($byte_stream, allow_error_reporting => 1,
433             allow_fallback => 1);             allow_fallback => 1);
# Line 398  sub parse_byte_stream ($$$$;$$) { Line 435  sub parse_byte_stream ($$$$;$$) {
435          $self->{confident} = 1;          $self->{confident} = 1;
436          last SNIFFING;          last SNIFFING;
437        } else {        } else {
438          ## TODO: unsupported error          !!!parse-error (type => 'charset:not supported',
439                            layer => 'encode',
440                            line => 1, column => 1,
441                            value => $charset_name,
442                            level => $self->{level}->{uncertain});
443        }        }
444      }      }
445    
# Line 496  sub parse_byte_stream ($$$$;$$) { Line 537  sub parse_byte_stream ($$$$;$$) {
537                      line => 1, column => 1,                      line => 1, column => 1,
538                      layer => 'encode');                      layer => 'encode');
539    } elsif (not ($e_status &    } elsif (not ($e_status &
540                  Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {                  Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL ())) {
541      $self->{input_encoding} = $charset->get_iana_name;      $self->{input_encoding} = $charset->get_iana_name;
542      !!!parse-error (type => 'chardecode:no error',      !!!parse-error (type => 'chardecode:no error',
543                      text => $self->{input_encoding},                      text => $self->{input_encoding},
# Line 561  sub parse_byte_stream ($$$$;$$) { Line 602  sub parse_byte_stream ($$$$;$$) {
602    my $char_onerror = sub {    my $char_onerror = sub {
603      my (undef, $type, %opt) = @_;      my (undef, $type, %opt) = @_;
604      !!!parse-error (layer => 'encode',      !!!parse-error (layer => 'encode',
605                      %opt, type => $type,                      line => $self->{line}, column => $self->{column} + 1,
606                      line => $self->{line}, column => $self->{column} + 1);                      %opt, type => $type);
607      if ($opt{octets}) {      if ($opt{octets}) {
608        ${$opt{octets}} = "\x{FFFD}"; # replacement character        ${$opt{octets}} = "\x{FFFD}"; # replacement character
609      }      }
# Line 571  sub parse_byte_stream ($$$$;$$) { Line 612  sub parse_byte_stream ($$$$;$$) {
612    my $wrapped_char_stream = $get_wrapper->($char_stream);    my $wrapped_char_stream = $get_wrapper->($char_stream);
613    $wrapped_char_stream->onerror ($char_onerror);    $wrapped_char_stream->onerror ($char_onerror);
614    
615    my @args = @_; shift @args; # $s    my @args = ($_[1], $_[2]); # $doc, $onerror - $get_wrapper = undef;
616    my $return;    my $return;
617    try {    try {
618      $return = $self->parse_char_stream ($wrapped_char_stream, @args);        $return = $self->parse_char_stream ($wrapped_char_stream, @args);  
# Line 586  sub parse_byte_stream ($$$$;$$) { Line 627  sub parse_byte_stream ($$$$;$$) {
627                        line => 1, column => 1,                        line => 1, column => 1,
628                        layer => 'encode');                        layer => 'encode');
629      } elsif (not ($e_status &      } elsif (not ($e_status &
630                    Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL())) {                    Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL ())) {
631        $self->{input_encoding} = $charset->get_iana_name;        $self->{input_encoding} = $charset->get_iana_name;
632        !!!parse-error (type => 'chardecode:no error',        !!!parse-error (type => 'chardecode:no error',
633                        text => $self->{input_encoding},                        text => $self->{input_encoding},
# Line 618  sub parse_byte_stream ($$$$;$$) { Line 659  sub parse_byte_stream ($$$$;$$) {
659  sub parse_char_string ($$$;$$) {  sub parse_char_string ($$$;$$) {
660    #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;    #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
661    my $self = shift;    my $self = shift;
   require utf8;  
662    my $s = ref $_[0] ? $_[0] : \($_[0]);    my $s = ref $_[0] ? $_[0] : \($_[0]);
663    open my $input, '<' . (utf8::is_utf8 ($$s) ? ':utf8' : ''), $s;    require Whatpm::Charset::DecodeHandle;
664    if ($_[3]) {    my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
     $input = $_[3]->($input);  
   }  
665    return $self->parse_char_stream ($input, @_[1..$#_]);    return $self->parse_char_stream ($input, @_[1..$#_]);
666  } # parse_char_string  } # parse_char_string
667  *parse_string = \&parse_char_string; ## NOTE: Alias for backward compatibility.  *parse_string = \&parse_char_string; ## NOTE: Alias for backward compatibility.
668    
669  sub parse_char_stream ($$$;$) {  sub parse_char_stream ($$$;$$) {
670    my $self = ref $_[0] ? shift : shift->new;    my $self = ref $_[0] ? shift : shift->new;
671    my $input = $_[0];    my $input = $_[0];
672    $self->{document} = $_[1];    $self->{document} = $_[1];
# Line 639  sub parse_char_stream ($$$;$) { Line 677  sub parse_char_stream ($$$;$) {
677    $self->{confident} = 1 unless exists $self->{confident};    $self->{confident} = 1 unless exists $self->{confident};
678    $self->{document}->input_encoding ($self->{input_encoding})    $self->{document}->input_encoding ($self->{input_encoding})
679        if defined $self->{input_encoding};        if defined $self->{input_encoding};
680    ## TODO: |{input_encoding}| is needless?
681    
   my $i = 0;  
682    $self->{line_prev} = $self->{line} = 1;    $self->{line_prev} = $self->{line} = 1;
683    $self->{column_prev} = $self->{column} = 0;    $self->{column_prev} = -1;
684    $self->{set_next_char} = sub {    $self->{column} = 0;
685      $self->{set_nc} = sub {
686      my $self = shift;      my $self = shift;
687    
688      pop @{$self->{prev_char}};      my $char = '';
689      unshift @{$self->{prev_char}}, $self->{next_char};      if (defined $self->{next_nc}) {
690          $char = $self->{next_nc};
691      my $char;        delete $self->{next_nc};
692      if (defined $self->{next_next_char}) {        $self->{nc} = ord $char;
       $char = $self->{next_next_char};  
       delete $self->{next_next_char};  
693      } else {      } else {
694        $char = $input->getc;        $self->{char_buffer} = '';
695          $self->{char_buffer_pos} = 0;
696    
697          my $count = $input->manakai_read_until
698             ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
699          if ($count) {
700            $self->{line_prev} = $self->{line};
701            $self->{column_prev} = $self->{column};
702            $self->{column}++;
703            $self->{nc}
704                = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
705            return;
706          }
707    
708          if ($input->read ($char, 1)) {
709            $self->{nc} = ord $char;
710          } else {
711            $self->{nc} = -1;
712            return;
713          }
714      }      }
     $self->{next_char} = -1 and return unless defined $char;  
     $self->{next_char} = ord $char;  
715    
716      ($self->{line_prev}, $self->{column_prev})      ($self->{line_prev}, $self->{column_prev})
717          = ($self->{line}, $self->{column});          = ($self->{line}, $self->{column});
718      $self->{column}++;      $self->{column}++;
719            
720      if ($self->{next_char} == 0x000A) { # LF      if ($self->{nc} == 0x000A) { # LF
721        !!!cp ('j1');        !!!cp ('j1');
722        $self->{line}++;        $self->{line}++;
723        $self->{column} = 0;        $self->{column} = 0;
724      } elsif ($self->{next_char} == 0x000D) { # CR      } elsif ($self->{nc} == 0x000D) { # CR
725        !!!cp ('j2');        !!!cp ('j2');
726        my $next = $input->getc;  ## TODO: support for abort/streaming
727        if (defined $next and $next ne "\x0A") {        my $next = '';
728          $self->{next_next_char} = $next;        if ($input->read ($next, 1) and $next ne "\x0A") {
729            $self->{next_nc} = $next;
730        }        }
731        $self->{next_char} = 0x000A; # LF # MUST        $self->{nc} = 0x000A; # LF # MUST
732        $self->{line}++;        $self->{line}++;
733        $self->{column} = 0;        $self->{column} = 0;
734      } elsif ($self->{next_char} > 0x10FFFF) {      } elsif ($self->{nc} == 0x0000) { # NULL
       !!!cp ('j3');  
       $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST  
     } elsif ($self->{next_char} == 0x0000) { # NULL  
735        !!!cp ('j4');        !!!cp ('j4');
736        !!!parse-error (type => 'NULL');        !!!parse-error (type => 'NULL');
737        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST        $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
738      } elsif ($self->{next_char} <= 0x0008 or      }
739               (0x000E <= $self->{next_char} and $self->{next_char} <= 0x001F) or    };
740               (0x007F <= $self->{next_char} and $self->{next_char} <= 0x009F) or  
741               (0xD800 <= $self->{next_char} and $self->{next_char} <= 0xDFFF) or    $self->{read_until} = sub {
742               (0xFDD0 <= $self->{next_char} and $self->{next_char} <= 0xFDDF) or      #my ($scalar, $specials_range, $offset) = @_;
743               {      return 0 if defined $self->{next_nc};
744                0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,  
745                0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,      my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
746                0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,      my $offset = $_[2] || 0;
747                0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,  
748                0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
749                0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,        pos ($self->{char_buffer}) = $self->{char_buffer_pos};
750                0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,        if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
751                0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,          substr ($_[0], $offset)
752                0x10FFFE => 1, 0x10FFFF => 1,              = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
753               }->{$self->{next_char}}) {          my $count = $+[0] - $-[0];
754        !!!cp ('j5');          if ($count) {
755        if ($self->{next_char} < 0x10000) {            $self->{column} += $count;
756          !!!parse-error (type => 'control char',            $self->{char_buffer_pos} += $count;
757                          text => (sprintf 'U+%04X', $self->{next_char}));            $self->{line_prev} = $self->{line};
758              $self->{column_prev} = $self->{column} - 1;
759              $self->{nc} = -1;
760            }
761            return $count;
762        } else {        } else {
763          !!!parse-error (type => 'control char',          return 0;
764                          text => (sprintf 'U-%08X', $self->{next_char}));        }
765        } else {
766          my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
767          if ($count) {
768            $self->{column} += $count;
769            $self->{line_prev} = $self->{line};
770            $self->{column_prev} = $self->{column} - 1;
771            $self->{nc} = -1;
772        }        }
773          return $count;
774      }      }
775    };    }; # $self->{read_until}
   $self->{prev_char} = [-1, -1, -1];  
   $self->{next_char} = -1;  
776    
777    my $onerror = $_[2] || sub {    my $onerror = $_[2] || sub {
778      my (%opt) = @_;      my (%opt) = @_;
# Line 722  sub parse_char_stream ($$$;$) { Line 784  sub parse_char_stream ($$$;$) {
784      $onerror->(line => $self->{line}, column => $self->{column}, @_);      $onerror->(line => $self->{line}, column => $self->{column}, @_);
785    };    };
786    
787      my $char_onerror = sub {
788        my (undef, $type, %opt) = @_;
789        !!!parse-error (layer => 'encode',
790                        line => $self->{line}, column => $self->{column} + 1,
791                        %opt, type => $type);
792      }; # $char_onerror
793    
794      if ($_[3]) {
795        $input = $_[3]->($input);
796        $input->onerror ($char_onerror);
797      } else {
798        $input->onerror ($char_onerror) unless defined $input->onerror;
799      }
800    
801    $self->_initialize_tokenizer;    $self->_initialize_tokenizer;
802    $self->_initialize_tree_constructor;    $self->_initialize_tree_constructor;
803    $self->_construct_tree;    $self->_construct_tree;
# Line 741  sub new ($) { Line 817  sub new ($) {
817                info => 'i',                info => 'i',
818                uncertain => 'u'},                uncertain => 'u'},
819    }, $class;    }, $class;
820    $self->{set_next_char} = sub {    $self->{set_nc} = sub {
821      $self->{next_char} = -1;      $self->{nc} = -1;
822    };    };
823    $self->{parse_error} = sub {    $self->{parse_error} = sub {
824      #      #
# Line 769  sub RCDATA_CONTENT_MODEL () { CM_ENTITY Line 845  sub RCDATA_CONTENT_MODEL () { CM_ENTITY
845  sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }  sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
846    
847  sub DATA_STATE () { 0 }  sub DATA_STATE () { 0 }
848  sub ENTITY_DATA_STATE () { 1 }  #sub ENTITY_DATA_STATE () { 1 }
849  sub TAG_OPEN_STATE () { 2 }  sub TAG_OPEN_STATE () { 2 }
850  sub CLOSE_TAG_OPEN_STATE () { 3 }  sub CLOSE_TAG_OPEN_STATE () { 3 }
851  sub TAG_NAME_STATE () { 4 }  sub TAG_NAME_STATE () { 4 }
# Line 780  sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 Line 856  sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8
856  sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }  sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
857  sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }  sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
858  sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }  sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
859  sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }  #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
860  sub MARKUP_DECLARATION_OPEN_STATE () { 13 }  sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
861  sub COMMENT_START_STATE () { 14 }  sub COMMENT_START_STATE () { 14 }
862  sub COMMENT_START_DASH_STATE () { 15 }  sub COMMENT_START_DASH_STATE () { 15 }
# Line 807  sub CDATA_SECTION_STATE () { 35 } Line 883  sub CDATA_SECTION_STATE () { 35 }
883  sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec  sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
884  sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec  sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
885  sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec  sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
886  sub CDATA_PCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec  sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
887  sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec  sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
888  sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec  sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
889  sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec  sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
890  sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec  sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
891    ## NOTE: "Entity data state", "entity in attribute value state", and
892    ## "consume a character reference" algorithm are jointly implemented
893    ## using the following six states:
894    sub ENTITY_STATE () { 44 }
895    sub ENTITY_HASH_STATE () { 45 }
896    sub NCR_NUM_STATE () { 46 }
897    sub HEXREF_X_STATE () { 47 }
898    sub HEXREF_HEX_STATE () { 48 }
899    sub ENTITY_NAME_STATE () { 49 }
900    sub PCDATA_STATE () { 50 } # "data state" in the spec
901    
902  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 }
903  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
# Line 864  sub IN_COLUMN_GROUP_IM () { 0b10 } Line 950  sub IN_COLUMN_GROUP_IM () { 0b10 }
950  sub _initialize_tokenizer ($) {  sub _initialize_tokenizer ($) {
951    my $self = shift;    my $self = shift;
952    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
953    #$self->{state_keyword}; # initialized when used    #$self->{s_kwd}; # state keyword - initialized when used
954      #$self->{entity__value}; # initialized when used
955      #$self->{entity__match}; # initialized when used
956    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
957    undef $self->{current_token};    undef $self->{ct}; # current token
958    undef $self->{current_attribute};    undef $self->{ca}; # current attribute
959    undef $self->{last_emitted_start_tag_name};    undef $self->{last_stag_name}; # last emitted start tag name
960    undef $self->{last_attribute_value_state};    #$self->{prev_state}; # initialized when used
961    delete $self->{self_closing};    delete $self->{self_closing};
962    $self->{char} = [];    $self->{char_buffer} = '';
963    # $self->{next_char}    $self->{char_buffer_pos} = 0;
964      $self->{nc} = -1; # next input character
965      #$self->{next_nc}
966    !!!next-input-character;    !!!next-input-character;
967    $self->{token} = [];    $self->{token} = [];
968    # $self->{escape}    # $self->{escape}
# Line 883  sub _initialize_tokenizer ($) { Line 973  sub _initialize_tokenizer ($) {
973  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN
974  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
975  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
976  ##   ->{public_identifier} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
977  ##   ->{system_identifier} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
978  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
979  ##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
980  ##        ->{name}  ##        ->{name}
# Line 903  sub _initialize_tokenizer ($) { Line 993  sub _initialize_tokenizer ($) {
993  ## has completed loading.  If one has, then it MUST be executed  ## has completed loading.  If one has, then it MUST be executed
994  ## and removed from the list.  ## and removed from the list.
995    
996  ## NOTE: HTML5 "Writing HTML documents" section, applied to  ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
997  ## documents and not to user agents and conformance checkers,  ## (This requirement was dropped from HTML5 spec, unfortunately.)
998  ## contains some requirements that are not detected by the  
999  ## parsing algorithm:  my $is_space = {
1000  ## - Some requirements on character encoding declarations. ## TODO    0x0009 => 1, # CHARACTER TABULATION (HT)
1001  ## - "Elements MUST NOT contain content that their content model disallows."    0x000A => 1, # LINE FEED (LF)
1002  ##   ... Some are parse error, some are not (will be reported by c.c.).    #0x000B => 0, # LINE TABULATION (VT)
1003  ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO    0x000C => 1, # FORM FEED (FF)
1004  ## - Text (in elements, attributes, and comments) SHOULD NOT contain    #0x000D => 1, # CARRIAGE RETURN (CR)
1005  ##   control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL?  Unicode control character?)    0x0020 => 1, # SPACE (SP)
1006    };
 ## TODO: HTML5 poses authors two SHOULD-level requirements that cannot  
 ## be detected by the HTML5 parsing algorithm:  
 ## - Text,  
1007    
1008  sub _get_next_token ($) {  sub _get_next_token ($) {
1009    my $self = shift;    my $self = shift;
1010    
1011    if ($self->{self_closing}) {    if ($self->{self_closing}) {
1012      !!!parse-error (type => 'nestc', token => $self->{current_token});      !!!parse-error (type => 'nestc', token => $self->{ct});
1013      ## NOTE: The |self_closing| flag is only set by start tag token.      ## NOTE: The |self_closing| flag is only set by start tag token.
1014      ## In addition, when a start tag token is emitted, it is always set to      ## In addition, when a start tag token is emitted, it is always set to
1015      ## |current_token|.      ## |ct|.
1016      delete $self->{self_closing};      delete $self->{self_closing};
1017    }    }
1018    
# Line 935  sub _get_next_token ($) { Line 1022  sub _get_next_token ($) {
1022    }    }
1023    
1024    A: {    A: {
1025      if ($self->{state} == DATA_STATE) {      if ($self->{state} == PCDATA_STATE) {
1026        if ($self->{next_char} == 0x0026) { # &        ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
1027    
1028          if ($self->{nc} == 0x0026) { # &
1029            !!!cp (0.1);
1030            ## NOTE: In the spec, the tokenizer is switched to the
1031            ## "entity data state".  In this implementation, the tokenizer
1032            ## is switched to the |ENTITY_STATE|, which is an implementation
1033            ## of the "consume a character reference" algorithm.
1034            $self->{entity_add} = -1;
1035            $self->{prev_state} = DATA_STATE;
1036            $self->{state} = ENTITY_STATE;
1037            !!!next-input-character;
1038            redo A;
1039          } elsif ($self->{nc} == 0x003C) { # <
1040            !!!cp (0.2);
1041            $self->{state} = TAG_OPEN_STATE;
1042            !!!next-input-character;
1043            redo A;
1044          } elsif ($self->{nc} == -1) {
1045            !!!cp (0.3);
1046            !!!emit ({type => END_OF_FILE_TOKEN,
1047                      line => $self->{line}, column => $self->{column}});
1048            last A; ## TODO: ok?
1049          } else {
1050            !!!cp (0.4);
1051            #
1052          }
1053    
1054          # Anything else
1055          my $token = {type => CHARACTER_TOKEN,
1056                       data => chr $self->{nc},
1057                       line => $self->{line}, column => $self->{column},
1058                      };
1059          $self->{read_until}->($token->{data}, q[<&], length $token->{data});
1060    
1061          ## Stay in the state.
1062          !!!next-input-character;
1063          !!!emit ($token);
1064          redo A;
1065        } elsif ($self->{state} == DATA_STATE) {
1066          $self->{s_kwd} = '' unless defined $self->{s_kwd};
1067          if ($self->{nc} == 0x0026) { # &
1068            $self->{s_kwd} = '';
1069          if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA          if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
1070              not $self->{escape}) {              not $self->{escape}) {
1071            !!!cp (1);            !!!cp (1);
1072            $self->{state} = ENTITY_DATA_STATE;            ## NOTE: In the spec, the tokenizer is switched to the
1073              ## "entity data state".  In this implementation, the tokenizer
1074              ## is switched to the |ENTITY_STATE|, which is an implementation
1075              ## of the "consume a character reference" algorithm.
1076              $self->{entity_add} = -1;
1077              $self->{prev_state} = DATA_STATE;
1078              $self->{state} = ENTITY_STATE;
1079            !!!next-input-character;            !!!next-input-character;
1080            redo A;            redo A;
1081          } else {          } else {
1082            !!!cp (2);            !!!cp (2);
1083            #            #
1084          }          }
1085        } elsif ($self->{next_char} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
1086          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1087            unless ($self->{escape}) {            $self->{s_kwd} .= '-';
1088              if ($self->{prev_char}->[0] == 0x002D and # -            
1089                  $self->{prev_char}->[1] == 0x0021 and # !            if ($self->{s_kwd} eq '<!--') {
1090                  $self->{prev_char}->[2] == 0x003C) { # <              !!!cp (3);
1091                !!!cp (3);              $self->{escape} = 1; # unless $self->{escape};
1092                $self->{escape} = 1;              $self->{s_kwd} = '--';
1093              } else {              #
1094                !!!cp (4);            } elsif ($self->{s_kwd} eq '---') {
1095              }              !!!cp (4);
1096                $self->{s_kwd} = '--';
1097                #
1098            } else {            } else {
1099              !!!cp (5);              !!!cp (5);
1100                #
1101            }            }
1102          }          }
1103                    
1104          #          #
1105        } elsif ($self->{next_char} == 0x003C) { # <        } elsif ($self->{nc} == 0x0021) { # !
1106            if (length $self->{s_kwd}) {
1107              !!!cp (5.1);
1108              $self->{s_kwd} .= '!';
1109              #
1110            } else {
1111              !!!cp (5.2);
1112              #$self->{s_kwd} = '';
1113              #
1114            }
1115            #
1116          } elsif ($self->{nc} == 0x003C) { # <
1117          if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA          if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
1118              (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA              (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
1119               not $self->{escape})) {               not $self->{escape})) {
# Line 974  sub _get_next_token ($) { Line 1123  sub _get_next_token ($) {
1123            redo A;            redo A;
1124          } else {          } else {
1125            !!!cp (7);            !!!cp (7);
1126              $self->{s_kwd} = '';
1127            #            #
1128          }          }
1129        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1130          if ($self->{escape} and          if ($self->{escape} and
1131              ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA              ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
1132            if ($self->{prev_char}->[0] == 0x002D and # -            if ($self->{s_kwd} eq '--') {
               $self->{prev_char}->[1] == 0x002D) { # -  
1133              !!!cp (8);              !!!cp (8);
1134              delete $self->{escape};              delete $self->{escape};
1135            } else {            } else {
# Line 990  sub _get_next_token ($) { Line 1139  sub _get_next_token ($) {
1139            !!!cp (10);            !!!cp (10);
1140          }          }
1141                    
1142            $self->{s_kwd} = '';
1143          #          #
1144        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
1145          !!!cp (11);          !!!cp (11);
1146            $self->{s_kwd} = '';
1147          !!!emit ({type => END_OF_FILE_TOKEN,          !!!emit ({type => END_OF_FILE_TOKEN,
1148                    line => $self->{line}, column => $self->{column}});                    line => $self->{line}, column => $self->{column}});
1149          last A; ## TODO: ok?          last A; ## TODO: ok?
1150        } else {        } else {
1151          !!!cp (12);          !!!cp (12);
1152            $self->{s_kwd} = '';
1153            #
1154        }        }
1155    
1156        # Anything else        # Anything else
1157        my $token = {type => CHARACTER_TOKEN,        my $token = {type => CHARACTER_TOKEN,
1158                     data => chr $self->{next_char},                     data => chr $self->{nc},
1159                     line => $self->{line}, column => $self->{column},                     line => $self->{line}, column => $self->{column},
1160                    };                    };
1161        ## Stay in the data state        if ($self->{read_until}->($token->{data}, q[-!<>&],
1162        !!!next-input-character;                                  length $token->{data})) {
1163            $self->{s_kwd} = '';
1164        !!!emit ($token);        }
   
       redo A;  
     } elsif ($self->{state} == ENTITY_DATA_STATE) {  
       ## (cannot happen in CDATA state)  
   
       my ($l, $c) = ($self->{line_prev}, $self->{column_prev});  
         
       my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);  
   
       $self->{state} = DATA_STATE;  
       # next-input-character is already done  
1165    
1166        unless (defined $token) {        ## Stay in the data state.
1167          if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
1168          !!!cp (13);          !!!cp (13);
1169          !!!emit ({type => CHARACTER_TOKEN, data => '&',          $self->{state} = PCDATA_STATE;
                   line => $l, column => $c,  
                  });  
1170        } else {        } else {
1171          !!!cp (14);          !!!cp (14);
1172          !!!emit ($token);          ## Stay in the state.
1173        }        }
1174          !!!next-input-character;
1175          !!!emit ($token);
1176        redo A;        redo A;
1177      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
1178        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1179          if ($self->{next_char} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
1180            !!!cp (15);            !!!cp (15);
1181            !!!next-input-character;            !!!next-input-character;
1182            $self->{state} = CLOSE_TAG_OPEN_STATE;            $self->{state} = CLOSE_TAG_OPEN_STATE;
1183            redo A;            redo A;
1184            } elsif ($self->{nc} == 0x0021) { # !
1185              !!!cp (15.1);
1186              $self->{s_kwd} = '<' unless $self->{escape};
1187              #
1188          } else {          } else {
1189            !!!cp (16);            !!!cp (16);
1190            ## reconsume            #
           $self->{state} = DATA_STATE;  
   
           !!!emit ({type => CHARACTER_TOKEN, data => '<',  
                     line => $self->{line_prev},  
                     column => $self->{column_prev},  
                    });  
   
           redo A;  
1191          }          }
1192    
1193            ## reconsume
1194            $self->{state} = DATA_STATE;
1195            !!!emit ({type => CHARACTER_TOKEN, data => '<',
1196                      line => $self->{line_prev},
1197                      column => $self->{column_prev},
1198                     });
1199            redo A;
1200        } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA        } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
1201          if ($self->{next_char} == 0x0021) { # !          if ($self->{nc} == 0x0021) { # !
1202            !!!cp (17);            !!!cp (17);
1203            $self->{state} = MARKUP_DECLARATION_OPEN_STATE;            $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
1204            !!!next-input-character;            !!!next-input-character;
1205            redo A;            redo A;
1206          } elsif ($self->{next_char} == 0x002F) { # /          } elsif ($self->{nc} == 0x002F) { # /
1207            !!!cp (18);            !!!cp (18);
1208            $self->{state} = CLOSE_TAG_OPEN_STATE;            $self->{state} = CLOSE_TAG_OPEN_STATE;
1209            !!!next-input-character;            !!!next-input-character;
1210            redo A;            redo A;
1211          } elsif (0x0041 <= $self->{next_char} and          } elsif (0x0041 <= $self->{nc} and
1212                   $self->{next_char} <= 0x005A) { # A..Z                   $self->{nc} <= 0x005A) { # A..Z
1213            !!!cp (19);            !!!cp (19);
1214            $self->{current_token}            $self->{ct}
1215              = {type => START_TAG_TOKEN,              = {type => START_TAG_TOKEN,
1216                 tag_name => chr ($self->{next_char} + 0x0020),                 tag_name => chr ($self->{nc} + 0x0020),
1217                 line => $self->{line_prev},                 line => $self->{line_prev},
1218                 column => $self->{column_prev}};                 column => $self->{column_prev}};
1219            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
1220            !!!next-input-character;            !!!next-input-character;
1221            redo A;            redo A;
1222          } elsif (0x0061 <= $self->{next_char} and          } elsif (0x0061 <= $self->{nc} and
1223                   $self->{next_char} <= 0x007A) { # a..z                   $self->{nc} <= 0x007A) { # a..z
1224            !!!cp (20);            !!!cp (20);
1225            $self->{current_token} = {type => START_TAG_TOKEN,            $self->{ct} = {type => START_TAG_TOKEN,
1226                                      tag_name => chr ($self->{next_char}),                                      tag_name => chr ($self->{nc}),
1227                                      line => $self->{line_prev},                                      line => $self->{line_prev},
1228                                      column => $self->{column_prev}};                                      column => $self->{column_prev}};
1229            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
1230            !!!next-input-character;            !!!next-input-character;
1231            redo A;            redo A;
1232          } elsif ($self->{next_char} == 0x003E) { # >          } elsif ($self->{nc} == 0x003E) { # >
1233            !!!cp (21);            !!!cp (21);
1234            !!!parse-error (type => 'empty start tag',            !!!parse-error (type => 'empty start tag',
1235                            line => $self->{line_prev},                            line => $self->{line_prev},
# Line 1096  sub _get_next_token ($) { Line 1243  sub _get_next_token ($) {
1243                     });                     });
1244    
1245            redo A;            redo A;
1246          } elsif ($self->{next_char} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
1247            !!!cp (22);            !!!cp (22);
1248            !!!parse-error (type => 'pio',            !!!parse-error (type => 'pio',
1249                            line => $self->{line_prev},                            line => $self->{line_prev},
1250                            column => $self->{column_prev});                            column => $self->{column_prev});
1251            $self->{state} = BOGUS_COMMENT_STATE;            $self->{state} = BOGUS_COMMENT_STATE;
1252            $self->{current_token} = {type => COMMENT_TOKEN, data => '',            $self->{ct} = {type => COMMENT_TOKEN, data => '',
1253                                      line => $self->{line_prev},                                      line => $self->{line_prev},
1254                                      column => $self->{column_prev},                                      column => $self->{column_prev},
1255                                     };                                     };
1256            ## $self->{next_char} is intentionally left as is            ## $self->{nc} is intentionally left as is
1257            redo A;            redo A;
1258          } else {          } else {
1259            !!!cp (23);            !!!cp (23);
# Line 1128  sub _get_next_token ($) { Line 1275  sub _get_next_token ($) {
1275        }        }
1276      } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {      } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
1277        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
1278        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_PCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
1279    
1280        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
1281        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
1282          if (defined $self->{last_emitted_start_tag_name}) {          if (defined $self->{last_stag_name}) {
1283            $self->{state} = CDATA_PCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
1284            $self->{state_keyword} = '';            $self->{s_kwd} = '';
1285            ## Reconsume.            ## Reconsume.
1286            redo A;            redo A;
1287          } else {          } else {
# Line 1150  sub _get_next_token ($) { Line 1297  sub _get_next_token ($) {
1297          }          }
1298        }        }
1299    
1300        if (0x0041 <= $self->{next_char} and        if (0x0041 <= $self->{nc} and
1301            $self->{next_char} <= 0x005A) { # A..Z            $self->{nc} <= 0x005A) { # A..Z
1302          !!!cp (29);          !!!cp (29);
1303          $self->{current_token}          $self->{ct}
1304              = {type => END_TAG_TOKEN,              = {type => END_TAG_TOKEN,
1305                 tag_name => chr ($self->{next_char} + 0x0020),                 tag_name => chr ($self->{nc} + 0x0020),
1306                 line => $l, column => $c};                 line => $l, column => $c};
1307          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
1308          !!!next-input-character;          !!!next-input-character;
1309          redo A;          redo A;
1310        } elsif (0x0061 <= $self->{next_char} and        } elsif (0x0061 <= $self->{nc} and
1311                 $self->{next_char} <= 0x007A) { # a..z                 $self->{nc} <= 0x007A) { # a..z
1312          !!!cp (30);          !!!cp (30);
1313          $self->{current_token} = {type => END_TAG_TOKEN,          $self->{ct} = {type => END_TAG_TOKEN,
1314                                    tag_name => chr ($self->{next_char}),                                    tag_name => chr ($self->{nc}),
1315                                    line => $l, column => $c};                                    line => $l, column => $c};
1316          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
1317          !!!next-input-character;          !!!next-input-character;
1318          redo A;          redo A;
1319        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1320          !!!cp (31);          !!!cp (31);
1321          !!!parse-error (type => 'empty end tag',          !!!parse-error (type => 'empty end tag',
1322                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
# Line 1177  sub _get_next_token ($) { Line 1324  sub _get_next_token ($) {
1324          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1325          !!!next-input-character;          !!!next-input-character;
1326          redo A;          redo A;
1327        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
1328          !!!cp (32);          !!!cp (32);
1329          !!!parse-error (type => 'bare etago');          !!!parse-error (type => 'bare etago');
1330          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 1192  sub _get_next_token ($) { Line 1339  sub _get_next_token ($) {
1339          !!!cp (33);          !!!cp (33);
1340          !!!parse-error (type => 'bogus end tag');          !!!parse-error (type => 'bogus end tag');
1341          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
1342          $self->{current_token} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
1343                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
1344                                    column => $self->{column_prev} - 1,                                    column => $self->{column_prev} - 1,
1345                                   };                                   };
1346          ## NOTE: $self->{next_char} is intentionally left as is.          ## NOTE: $self->{nc} is intentionally left as is.
1347          ## Although the "anything else" case of the spec not explicitly          ## Although the "anything else" case of the spec not explicitly
1348          ## states that the next input character is to be reconsumed,          ## states that the next input character is to be reconsumed,
1349          ## it will be included to the |data| of the comment token          ## it will be included to the |data| of the comment token
# Line 1204  sub _get_next_token ($) { Line 1351  sub _get_next_token ($) {
1351          ## "bogus comment state" entry.          ## "bogus comment state" entry.
1352          redo A;          redo A;
1353        }        }
1354      } elsif ($self->{state} == CDATA_PCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
1355        my $ch = substr $self->{last_emitted_start_tag_name}, length $self->{state_keyword}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
1356        if (length $ch) {        if (length $ch) {
1357          my $CH = $ch;          my $CH = $ch;
1358          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
1359          my $nch = chr $self->{next_char};          my $nch = chr $self->{nc};
1360          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
1361            !!!cp (24);            !!!cp (24);
1362            ## Stay in the state.            ## Stay in the state.
1363            $self->{state_keyword} .= $nch;            $self->{s_kwd} .= $nch;
1364            !!!next-input-character;            !!!next-input-character;
1365            redo A;            redo A;
1366          } else {          } else {
# Line 1221  sub _get_next_token ($) { Line 1368  sub _get_next_token ($) {
1368            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1369            ## Reconsume.            ## Reconsume.
1370            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
1371                      data => '</' . $self->{state_keyword},                      data => '</' . $self->{s_kwd},
1372                      line => $self->{line_prev},                      line => $self->{line_prev},
1373                      column => $self->{column_prev} - 1 - length $self->{state_keyword},                      column => $self->{column_prev} - 1 - length $self->{s_kwd},
1374                     });                     });
1375            redo A;            redo A;
1376          }          }
1377        } else { # after "<{tag-name}"        } else { # after "<{tag-name}"
1378          unless ({          unless ($is_space->{$self->{nc}} or
1379                   0x0009 => 1, # HT                  {
                  0x000A => 1, # LF  
                  0x000B => 1, # VT  
                  0x000C => 1, # FF  
                  0x0020 => 1, # SP  
1380                   0x003E => 1, # >                   0x003E => 1, # >
1381                   0x002F => 1, # /                   0x002F => 1, # /
1382                   -1 => 1, # EOF                   -1 => 1, # EOF
1383                  }->{$self->{next_char}}) {                  }->{$self->{nc}}) {
1384            !!!cp (26);            !!!cp (26);
1385            ## Reconsume.            ## Reconsume.
1386            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1387            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
1388                      data => '</' . $self->{state_keyword},                      data => '</' . $self->{s_kwd},
1389                      line => $self->{line_prev},                      line => $self->{line_prev},
1390                      column => $self->{column_prev} - 1 - length $self->{state_keyword},                      column => $self->{column_prev} - 1 - length $self->{s_kwd},
1391                     });                     });
1392            redo A;            redo A;
1393          } else {          } else {
1394            !!!cp (27);            !!!cp (27);
1395            $self->{current_token}            $self->{ct}
1396                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
1397                   tag_name => $self->{last_emitted_start_tag_name},                   tag_name => $self->{last_stag_name},
1398                   line => $self->{line_prev},                   line => $self->{line_prev},
1399                   column => $self->{column_prev} - 1 - length $self->{state_keyword}};                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};
1400            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
1401            ## Reconsume.            ## Reconsume.
1402            redo A;            redo A;
1403          }          }
1404        }        }
1405      } elsif ($self->{state} == TAG_NAME_STATE) {      } elsif ($self->{state} == TAG_NAME_STATE) {
1406        if ($self->{next_char} == 0x0009 or # HT        if ($is_space->{$self->{nc}}) {
           $self->{next_char} == 0x000A or # LF  
           $self->{next_char} == 0x000B or # VT  
           $self->{next_char} == 0x000C or # FF  
           $self->{next_char} == 0x0020) { # SP  
1407          !!!cp (34);          !!!cp (34);
1408          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1409          !!!next-input-character;          !!!next-input-character;
1410          redo A;          redo A;
1411        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1412          if ($self->{current_token}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1413            !!!cp (35);            !!!cp (35);
1414            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1415          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1416            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1417            #if ($self->{current_token}->{attributes}) {            #if ($self->{ct}->{attributes}) {
1418            #  ## NOTE: This should never be reached.            #  ## NOTE: This should never be reached.
1419            #  !!! cp (36);            #  !!! cp (36);
1420            #  !!! parse-error (type => 'end tag attribute');            #  !!! parse-error (type => 'end tag attribute');
# Line 1283  sub _get_next_token ($) { Line 1422  sub _get_next_token ($) {
1422              !!!cp (37);              !!!cp (37);
1423            #}            #}
1424          } else {          } else {
1425            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1426          }          }
1427          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1428          !!!next-input-character;          !!!next-input-character;
1429    
1430          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1431    
1432          redo A;          redo A;
1433        } elsif (0x0041 <= $self->{next_char} and        } elsif (0x0041 <= $self->{nc} and
1434                 $self->{next_char} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1435          !!!cp (38);          !!!cp (38);
1436          $self->{current_token}->{tag_name} .= chr ($self->{next_char} + 0x0020);          $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);
1437            # start tag or end tag            # start tag or end tag
1438          ## Stay in this state          ## Stay in this state
1439          !!!next-input-character;          !!!next-input-character;
1440          redo A;          redo A;
1441        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
1442          !!!parse-error (type => 'unclosed tag');          !!!parse-error (type => 'unclosed tag');
1443          if ($self->{current_token}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1444            !!!cp (39);            !!!cp (39);
1445            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1446          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1447            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1448            #if ($self->{current_token}->{attributes}) {            #if ($self->{ct}->{attributes}) {
1449            #  ## NOTE: This state should never be reached.            #  ## NOTE: This state should never be reached.
1450            #  !!! cp (40);            #  !!! cp (40);
1451            #  !!! parse-error (type => 'end tag attribute');            #  !!! parse-error (type => 'end tag attribute');
# Line 1314  sub _get_next_token ($) { Line 1453  sub _get_next_token ($) {
1453              !!!cp (41);              !!!cp (41);
1454            #}            #}
1455          } else {          } else {
1456            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1457          }          }
1458          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1459          # reconsume          # reconsume
1460    
1461          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1462    
1463          redo A;          redo A;
1464        } elsif ($self->{next_char} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1465          !!!cp (42);          !!!cp (42);
1466          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1467          !!!next-input-character;          !!!next-input-character;
1468          redo A;          redo A;
1469        } else {        } else {
1470          !!!cp (44);          !!!cp (44);
1471          $self->{current_token}->{tag_name} .= chr $self->{next_char};          $self->{ct}->{tag_name} .= chr $self->{nc};
1472            # start tag or end tag            # start tag or end tag
1473          ## Stay in the state          ## Stay in the state
1474          !!!next-input-character;          !!!next-input-character;
1475          redo A;          redo A;
1476        }        }
1477      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1478        if ($self->{next_char} == 0x0009 or # HT        if ($is_space->{$self->{nc}}) {
           $self->{next_char} == 0x000A or # LF  
           $self->{next_char} == 0x000B or # VT  
           $self->{next_char} == 0x000C or # FF  
           $self->{next_char} == 0x0020) { # SP  
1479          !!!cp (45);          !!!cp (45);
1480          ## Stay in the state          ## Stay in the state
1481          !!!next-input-character;          !!!next-input-character;
1482          redo A;          redo A;
1483        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1484          if ($self->{current_token}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1485            !!!cp (46);            !!!cp (46);
1486            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1487          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1488            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1489            if ($self->{current_token}->{attributes}) {            if ($self->{ct}->{attributes}) {
1490              !!!cp (47);              !!!cp (47);
1491              !!!parse-error (type => 'end tag attribute');              !!!parse-error (type => 'end tag attribute');
1492            } else {            } else {
1493              !!!cp (48);              !!!cp (48);
1494            }            }
1495          } else {          } else {
1496            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1497          }          }
1498          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1499          !!!next-input-character;          !!!next-input-character;
1500    
1501          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1502    
1503          redo A;          redo A;
1504        } elsif (0x0041 <= $self->{next_char} and        } elsif (0x0041 <= $self->{nc} and
1505                 $self->{next_char} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1506          !!!cp (49);          !!!cp (49);
1507          $self->{current_attribute}          $self->{ca}
1508              = {name => chr ($self->{next_char} + 0x0020),              = {name => chr ($self->{nc} + 0x0020),
1509                 value => '',                 value => '',
1510                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1511          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
1512          !!!next-input-character;          !!!next-input-character;
1513          redo A;          redo A;
1514        } elsif ($self->{next_char} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1515          !!!cp (50);          !!!cp (50);
1516          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1517          !!!next-input-character;          !!!next-input-character;
1518          redo A;          redo A;
1519        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
1520          !!!parse-error (type => 'unclosed tag');          !!!parse-error (type => 'unclosed tag');
1521          if ($self->{current_token}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1522            !!!cp (52);            !!!cp (52);
1523            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1524          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1525            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1526            if ($self->{current_token}->{attributes}) {            if ($self->{ct}->{attributes}) {
1527              !!!cp (53);              !!!cp (53);
1528              !!!parse-error (type => 'end tag attribute');              !!!parse-error (type => 'end tag attribute');
1529            } else {            } else {
1530              !!!cp (54);              !!!cp (54);
1531            }            }
1532          } else {          } else {
1533            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1534          }          }
1535          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1536          # reconsume          # reconsume
1537    
1538          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1539    
1540          redo A;          redo A;
1541        } else {        } else {
# Line 1408  sub _get_next_token ($) { Line 1543  sub _get_next_token ($) {
1543               0x0022 => 1, # "               0x0022 => 1, # "
1544               0x0027 => 1, # '               0x0027 => 1, # '
1545               0x003D => 1, # =               0x003D => 1, # =
1546              }->{$self->{next_char}}) {              }->{$self->{nc}}) {
1547            !!!cp (55);            !!!cp (55);
1548            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1549          } else {          } else {
1550            !!!cp (56);            !!!cp (56);
1551          }          }
1552          $self->{current_attribute}          $self->{ca}
1553              = {name => chr ($self->{next_char}),              = {name => chr ($self->{nc}),
1554                 value => '',                 value => '',
1555                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1556          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 1424  sub _get_next_token ($) { Line 1559  sub _get_next_token ($) {
1559        }        }
1560      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1561        my $before_leave = sub {        my $before_leave = sub {
1562          if (exists $self->{current_token}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
1563              ->{$self->{current_attribute}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
1564            !!!cp (57);            !!!cp (57);
1565            !!!parse-error (type => 'duplicate attribute', text => $self->{current_attribute}->{name}, line => $self->{current_attribute}->{line}, column => $self->{current_attribute}->{column});            !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1566            ## Discard $self->{current_attribute} # MUST            ## Discard $self->{ca} # MUST
1567          } else {          } else {
1568            !!!cp (58);            !!!cp (58);
1569            $self->{current_token}->{attributes}->{$self->{current_attribute}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
1570              = $self->{current_attribute};              = $self->{ca};
1571          }          }
1572        }; # $before_leave        }; # $before_leave
1573    
1574        if ($self->{next_char} == 0x0009 or # HT        if ($is_space->{$self->{nc}}) {
           $self->{next_char} == 0x000A or # LF  
           $self->{next_char} == 0x000B or # VT  
           $self->{next_char} == 0x000C or # FF  
           $self->{next_char} == 0x0020) { # SP  
1575          !!!cp (59);          !!!cp (59);
1576          $before_leave->();          $before_leave->();
1577          $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;          $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1578          !!!next-input-character;          !!!next-input-character;
1579          redo A;          redo A;
1580        } elsif ($self->{next_char} == 0x003D) { # =        } elsif ($self->{nc} == 0x003D) { # =
1581          !!!cp (60);          !!!cp (60);
1582          $before_leave->();          $before_leave->();
1583          $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;          $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1584          !!!next-input-character;          !!!next-input-character;
1585          redo A;          redo A;
1586        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1587          $before_leave->();          $before_leave->();
1588          if ($self->{current_token}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1589            !!!cp (61);            !!!cp (61);
1590            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1591          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1592            !!!cp (62);            !!!cp (62);
1593            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1594            if ($self->{current_token}->{attributes}) {            if ($self->{ct}->{attributes}) {
1595              !!!parse-error (type => 'end tag attribute');              !!!parse-error (type => 'end tag attribute');
1596            }            }
1597          } else {          } else {
1598            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1599          }          }
1600          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1601          !!!next-input-character;          !!!next-input-character;
1602    
1603          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1604    
1605          redo A;          redo A;
1606        } elsif (0x0041 <= $self->{next_char} and        } elsif (0x0041 <= $self->{nc} and
1607                 $self->{next_char} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1608          !!!cp (63);          !!!cp (63);
1609          $self->{current_attribute}->{name} .= chr ($self->{next_char} + 0x0020);          $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);
1610          ## Stay in the state          ## Stay in the state
1611          !!!next-input-character;          !!!next-input-character;
1612          redo A;          redo A;
1613        } elsif ($self->{next_char} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1614          !!!cp (64);          !!!cp (64);
1615          $before_leave->();          $before_leave->();
1616          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1617          !!!next-input-character;          !!!next-input-character;
1618          redo A;          redo A;
1619        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
1620          !!!parse-error (type => 'unclosed tag');          !!!parse-error (type => 'unclosed tag');
1621          $before_leave->();          $before_leave->();
1622          if ($self->{current_token}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1623            !!!cp (66);            !!!cp (66);
1624            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1625          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1626            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1627            if ($self->{current_token}->{attributes}) {            if ($self->{ct}->{attributes}) {
1628              !!!cp (67);              !!!cp (67);
1629              !!!parse-error (type => 'end tag attribute');              !!!parse-error (type => 'end tag attribute');
1630            } else {            } else {
# Line 1501  sub _get_next_token ($) { Line 1632  sub _get_next_token ($) {
1632              !!!cp (68);              !!!cp (68);
1633            }            }
1634          } else {          } else {
1635            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1636          }          }
1637          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1638          # reconsume          # reconsume
1639    
1640          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1641    
1642          redo A;          redo A;
1643        } else {        } else {
1644          if ($self->{next_char} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1645              $self->{next_char} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1646            !!!cp (69);            !!!cp (69);
1647            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1648          } else {          } else {
1649            !!!cp (70);            !!!cp (70);
1650          }          }
1651          $self->{current_attribute}->{name} .= chr ($self->{next_char});          $self->{ca}->{name} .= chr ($self->{nc});
1652          ## Stay in the state          ## Stay in the state
1653          !!!next-input-character;          !!!next-input-character;
1654          redo A;          redo A;
1655        }        }
1656      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1657        if ($self->{next_char} == 0x0009 or # HT        if ($is_space->{$self->{nc}}) {
           $self->{next_char} == 0x000A or # LF  
           $self->{next_char} == 0x000B or # VT  
           $self->{next_char} == 0x000C or # FF  
           $self->{next_char} == 0x0020) { # SP  
1658          !!!cp (71);          !!!cp (71);
1659          ## Stay in the state          ## Stay in the state
1660          !!!next-input-character;          !!!next-input-character;
1661          redo A;          redo A;
1662        } elsif ($self->{next_char} == 0x003D) { # =        } elsif ($self->{nc} == 0x003D) { # =
1663          !!!cp (72);          !!!cp (72);
1664          $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;          $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1665          !!!next-input-character;          !!!next-input-character;
1666          redo A;          redo A;
1667        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1668          if ($self->{current_token}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1669            !!!cp (73);            !!!cp (73);
1670            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1671          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1672            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1673            if ($self->{current_token}->{attributes}) {            if ($self->{ct}->{attributes}) {
1674              !!!cp (74);              !!!cp (74);
1675              !!!parse-error (type => 'end tag attribute');              !!!parse-error (type => 'end tag attribute');
1676            } else {            } else {
# Line 1551  sub _get_next_token ($) { Line 1678  sub _get_next_token ($) {
1678              !!!cp (75);              !!!cp (75);
1679            }            }
1680          } else {          } else {
1681            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1682          }          }
1683          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1684          !!!next-input-character;          !!!next-input-character;
1685    
1686          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1687    
1688          redo A;          redo A;
1689        } elsif (0x0041 <= $self->{next_char} and        } elsif (0x0041 <= $self->{nc} and
1690                 $self->{next_char} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1691          !!!cp (76);          !!!cp (76);
1692          $self->{current_attribute}          $self->{ca}
1693              = {name => chr ($self->{next_char} + 0x0020),              = {name => chr ($self->{nc} + 0x0020),
1694                 value => '',                 value => '',
1695                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1696          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
1697          !!!next-input-character;          !!!next-input-character;
1698          redo A;          redo A;
1699        } elsif ($self->{next_char} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1700          !!!cp (77);          !!!cp (77);
1701          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1702          !!!next-input-character;          !!!next-input-character;
1703          redo A;          redo A;
1704        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
1705          !!!parse-error (type => 'unclosed tag');          !!!parse-error (type => 'unclosed tag');
1706          if ($self->{current_token}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1707            !!!cp (79);            !!!cp (79);
1708            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1709          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1710            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1711            if ($self->{current_token}->{attributes}) {            if ($self->{ct}->{attributes}) {
1712              !!!cp (80);              !!!cp (80);
1713              !!!parse-error (type => 'end tag attribute');              !!!parse-error (type => 'end tag attribute');
1714            } else {            } else {
# Line 1589  sub _get_next_token ($) { Line 1716  sub _get_next_token ($) {
1716              !!!cp (81);              !!!cp (81);
1717            }            }
1718          } else {          } else {
1719            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1720          }          }
1721          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1722          # reconsume          # reconsume
1723    
1724          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1725    
1726          redo A;          redo A;
1727        } else {        } else {
1728          if ($self->{next_char} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1729              $self->{next_char} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1730            !!!cp (78);            !!!cp (78);
1731            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1732          } else {          } else {
1733            !!!cp (82);            !!!cp (82);
1734          }          }
1735          $self->{current_attribute}          $self->{ca}
1736              = {name => chr ($self->{next_char}),              = {name => chr ($self->{nc}),
1737                 value => '',                 value => '',
1738                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1739          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 1614  sub _get_next_token ($) { Line 1741  sub _get_next_token ($) {
1741          redo A;                  redo A;        
1742        }        }
1743      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1744        if ($self->{next_char} == 0x0009 or # HT        if ($is_space->{$self->{nc}}) {
           $self->{next_char} == 0x000A or # LF  
           $self->{next_char} == 0x000B or # VT  
           $self->{next_char} == 0x000C or # FF  
           $self->{next_char} == 0x0020) { # SP        
1745          !!!cp (83);          !!!cp (83);
1746          ## Stay in the state          ## Stay in the state
1747          !!!next-input-character;          !!!next-input-character;
1748          redo A;          redo A;
1749        } elsif ($self->{next_char} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
1750          !!!cp (84);          !!!cp (84);
1751          $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;          $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1752          !!!next-input-character;          !!!next-input-character;
1753          redo A;          redo A;
1754        } elsif ($self->{next_char} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1755          !!!cp (85);          !!!cp (85);
1756          $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;          $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1757          ## reconsume          ## reconsume
1758          redo A;          redo A;
1759        } elsif ($self->{next_char} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
1760          !!!cp (86);          !!!cp (86);
1761          $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;          $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1762          !!!next-input-character;          !!!next-input-character;
1763          redo A;          redo A;
1764        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1765          !!!parse-error (type => 'empty unquoted attribute value');          !!!parse-error (type => 'empty unquoted attribute value');
1766          if ($self->{current_token}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1767            !!!cp (87);            !!!cp (87);
1768            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1769          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1770            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1771            if ($self->{current_token}->{attributes}) {            if ($self->{ct}->{attributes}) {
1772              !!!cp (88);              !!!cp (88);
1773              !!!parse-error (type => 'end tag attribute');              !!!parse-error (type => 'end tag attribute');
1774            } else {            } else {
# Line 1653  sub _get_next_token ($) { Line 1776  sub _get_next_token ($) {
1776              !!!cp (89);              !!!cp (89);
1777            }            }
1778          } else {          } else {
1779            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1780          }          }
1781          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1782          !!!next-input-character;          !!!next-input-character;
1783    
1784          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1785    
1786          redo A;          redo A;
1787        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
1788          !!!parse-error (type => 'unclosed tag');          !!!parse-error (type => 'unclosed tag');
1789          if ($self->{current_token}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1790            !!!cp (90);            !!!cp (90);
1791            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1792          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1793            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1794            if ($self->{current_token}->{attributes}) {            if ($self->{ct}->{attributes}) {
1795              !!!cp (91);              !!!cp (91);
1796              !!!parse-error (type => 'end tag attribute');              !!!parse-error (type => 'end tag attribute');
1797            } else {            } else {
# Line 1676  sub _get_next_token ($) { Line 1799  sub _get_next_token ($) {
1799              !!!cp (92);              !!!cp (92);
1800            }            }
1801          } else {          } else {
1802            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1803          }          }
1804          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1805          ## reconsume          ## reconsume
1806    
1807          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1808    
1809          redo A;          redo A;
1810        } else {        } else {
1811          if ($self->{next_char} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1812            !!!cp (93);            !!!cp (93);
1813            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1814          } else {          } else {
1815            !!!cp (94);            !!!cp (94);
1816          }          }
1817          $self->{current_attribute}->{value} .= chr ($self->{next_char});          $self->{ca}->{value} .= chr ($self->{nc});
1818          $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;          $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1819          !!!next-input-character;          !!!next-input-character;
1820          redo A;          redo A;
1821        }        }
1822      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1823        if ($self->{next_char} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1824          !!!cp (95);          !!!cp (95);
1825          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1826          !!!next-input-character;          !!!next-input-character;
1827          redo A;          redo A;
1828        } elsif ($self->{next_char} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1829          !!!cp (96);          !!!cp (96);
1830          $self->{last_attribute_value_state} = $self->{state};          ## NOTE: In the spec, the tokenizer is switched to the
1831          $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;          ## "entity in attribute value state".  In this implementation, the
1832            ## tokenizer is switched to the |ENTITY_STATE|, which is an
1833            ## implementation of the "consume a character reference" algorithm.
1834            $self->{prev_state} = $self->{state};
1835            $self->{entity_add} = 0x0022; # "
1836            $self->{state} = ENTITY_STATE;
1837          !!!next-input-character;          !!!next-input-character;
1838          redo A;          redo A;
1839        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
1840          !!!parse-error (type => 'unclosed attribute value');          !!!parse-error (type => 'unclosed attribute value');
1841          if ($self->{current_token}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1842            !!!cp (97);            !!!cp (97);
1843            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1844          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1845            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1846            if ($self->{current_token}->{attributes}) {            if ($self->{ct}->{attributes}) {
1847              !!!cp (98);              !!!cp (98);
1848              !!!parse-error (type => 'end tag attribute');              !!!parse-error (type => 'end tag attribute');
1849            } else {            } else {
# Line 1723  sub _get_next_token ($) { Line 1851  sub _get_next_token ($) {
1851              !!!cp (99);              !!!cp (99);
1852            }            }
1853          } else {          } else {
1854            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1855          }          }
1856          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1857          ## reconsume          ## reconsume
1858    
1859          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1860    
1861          redo A;          redo A;
1862        } else {        } else {
1863          !!!cp (100);          !!!cp (100);
1864          $self->{current_attribute}->{value} .= chr ($self->{next_char});          $self->{ca}->{value} .= chr ($self->{nc});
1865            $self->{read_until}->($self->{ca}->{value},
1866                                  q["&],
1867                                  length $self->{ca}->{value});
1868    
1869          ## Stay in the state          ## Stay in the state
1870          !!!next-input-character;          !!!next-input-character;
1871          redo A;          redo A;
1872        }        }
1873      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1874        if ($self->{next_char} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1875          !!!cp (101);          !!!cp (101);
1876          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1877          !!!next-input-character;          !!!next-input-character;
1878          redo A;          redo A;
1879        } elsif ($self->{next_char} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1880          !!!cp (102);          !!!cp (102);
1881          $self->{last_attribute_value_state} = $self->{state};          ## NOTE: In the spec, the tokenizer is switched to the
1882          $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;          ## "entity in attribute value state".  In this implementation, the
1883            ## tokenizer is switched to the |ENTITY_STATE|, which is an
1884            ## implementation of the "consume a character reference" algorithm.
1885            $self->{entity_add} = 0x0027; # '
1886            $self->{prev_state} = $self->{state};
1887            $self->{state} = ENTITY_STATE;
1888          !!!next-input-character;          !!!next-input-character;
1889          redo A;          redo A;
1890        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
1891          !!!parse-error (type => 'unclosed attribute value');          !!!parse-error (type => 'unclosed attribute value');
1892          if ($self->{current_token}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1893            !!!cp (103);            !!!cp (103);
1894            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1895          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1896            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1897            if ($self->{current_token}->{attributes}) {            if ($self->{ct}->{attributes}) {
1898              !!!cp (104);              !!!cp (104);
1899              !!!parse-error (type => 'end tag attribute');              !!!parse-error (type => 'end tag attribute');
1900            } else {            } else {
# Line 1765  sub _get_next_token ($) { Line 1902  sub _get_next_token ($) {
1902              !!!cp (105);              !!!cp (105);
1903            }            }
1904          } else {          } else {
1905            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1906          }          }
1907          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1908          ## reconsume          ## reconsume
1909    
1910          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1911    
1912          redo A;          redo A;
1913        } else {        } else {
1914          !!!cp (106);          !!!cp (106);
1915          $self->{current_attribute}->{value} .= chr ($self->{next_char});          $self->{ca}->{value} .= chr ($self->{nc});
1916            $self->{read_until}->($self->{ca}->{value},
1917                                  q['&],
1918                                  length $self->{ca}->{value});
1919    
1920          ## Stay in the state          ## Stay in the state
1921          !!!next-input-character;          !!!next-input-character;
1922          redo A;          redo A;
1923        }        }
1924      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1925        if ($self->{next_char} == 0x0009 or # HT        if ($is_space->{$self->{nc}}) {
           $self->{next_char} == 0x000A or # LF  
           $self->{next_char} == 0x000B or # VT  
           $self->{next_char} == 0x000C or # FF  
           $self->{next_char} == 0x0020) { # SP  
1926          !!!cp (107);          !!!cp (107);
1927          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1928          !!!next-input-character;          !!!next-input-character;
1929          redo A;          redo A;
1930        } elsif ($self->{next_char} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1931          !!!cp (108);          !!!cp (108);
1932          $self->{last_attribute_value_state} = $self->{state};          ## NOTE: In the spec, the tokenizer is switched to the
1933          $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;          ## "entity in attribute value state".  In this implementation, the
1934            ## tokenizer is switched to the |ENTITY_STATE|, which is an
1935            ## implementation of the "consume a character reference" algorithm.
1936            $self->{entity_add} = -1;
1937            $self->{prev_state} = $self->{state};
1938            $self->{state} = ENTITY_STATE;
1939          !!!next-input-character;          !!!next-input-character;
1940          redo A;          redo A;
1941        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1942          if ($self->{current_token}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1943            !!!cp (109);            !!!cp (109);
1944            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1945          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1946            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1947            if ($self->{current_token}->{attributes}) {            if ($self->{ct}->{attributes}) {
1948              !!!cp (110);              !!!cp (110);
1949              !!!parse-error (type => 'end tag attribute');              !!!parse-error (type => 'end tag attribute');
1950            } else {            } else {
# Line 1810  sub _get_next_token ($) { Line 1952  sub _get_next_token ($) {
1952              !!!cp (111);              !!!cp (111);
1953            }            }
1954          } else {          } else {
1955            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1956          }          }
1957          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1958          !!!next-input-character;          !!!next-input-character;
1959    
1960          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1961    
1962          redo A;          redo A;
1963        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
1964          !!!parse-error (type => 'unclosed tag');          !!!parse-error (type => 'unclosed tag');
1965          if ($self->{current_token}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1966            !!!cp (112);            !!!cp (112);
1967            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1968          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1969            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1970            if ($self->{current_token}->{attributes}) {            if ($self->{ct}->{attributes}) {
1971              !!!cp (113);              !!!cp (113);
1972              !!!parse-error (type => 'end tag attribute');              !!!parse-error (type => 'end tag attribute');
1973            } else {            } else {
# Line 1833  sub _get_next_token ($) { Line 1975  sub _get_next_token ($) {
1975              !!!cp (114);              !!!cp (114);
1976            }            }
1977          } else {          } else {
1978            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1979          }          }
1980          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1981          ## reconsume          ## reconsume
1982    
1983          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1984    
1985          redo A;          redo A;
1986        } else {        } else {
# Line 1846  sub _get_next_token ($) { Line 1988  sub _get_next_token ($) {
1988               0x0022 => 1, # "               0x0022 => 1, # "
1989               0x0027 => 1, # '               0x0027 => 1, # '
1990               0x003D => 1, # =               0x003D => 1, # =
1991              }->{$self->{next_char}}) {              }->{$self->{nc}}) {
1992            !!!cp (115);            !!!cp (115);
1993            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1994          } else {          } else {
1995            !!!cp (116);            !!!cp (116);
1996          }          }
1997          $self->{current_attribute}->{value} .= chr ($self->{next_char});          $self->{ca}->{value} .= chr ($self->{nc});
1998            $self->{read_until}->($self->{ca}->{value},
1999                                  q["'=& >],
2000                                  length $self->{ca}->{value});
2001    
2002          ## Stay in the state          ## Stay in the state
2003          !!!next-input-character;          !!!next-input-character;
2004          redo A;          redo A;
2005        }        }
     } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {  
       my $token = $self->_tokenize_attempt_to_consume_an_entity  
           (1,  
            $self->{last_attribute_value_state}  
              == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE ? 0x0022 : # "  
            $self->{last_attribute_value_state}  
              == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE ? 0x0027 : # '  
            -1);  
   
       unless (defined $token) {  
         !!!cp (117);  
         $self->{current_attribute}->{value} .= '&';  
       } else {  
         !!!cp (118);  
         $self->{current_attribute}->{value} .= $token->{data};  
         $self->{current_attribute}->{has_reference} = $token->{has_reference};  
         ## ISSUE: spec says "append the returned character token to the current attribute's value"  
       }  
   
       $self->{state} = $self->{last_attribute_value_state};  
       # next-input-character is already done  
       redo A;  
2006      } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
2007        if ($self->{next_char} == 0x0009 or # HT        if ($is_space->{$self->{nc}}) {
           $self->{next_char} == 0x000A or # LF  
           $self->{next_char} == 0x000B or # VT  
           $self->{next_char} == 0x000C or # FF  
           $self->{next_char} == 0x0020) { # SP  
2008          !!!cp (118);          !!!cp (118);
2009          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2010          !!!next-input-character;          !!!next-input-character;
2011          redo A;          redo A;
2012        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2013          if ($self->{current_token}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2014            !!!cp (119);            !!!cp (119);
2015            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2016          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2017            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2018            if ($self->{current_token}->{attributes}) {            if ($self->{ct}->{attributes}) {
2019              !!!cp (120);              !!!cp (120);
2020              !!!parse-error (type => 'end tag attribute');              !!!parse-error (type => 'end tag attribute');
2021            } else {            } else {
# Line 1903  sub _get_next_token ($) { Line 2023  sub _get_next_token ($) {
2023              !!!cp (121);              !!!cp (121);
2024            }            }
2025          } else {          } else {
2026            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2027          }          }
2028          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2029          !!!next-input-character;          !!!next-input-character;
2030    
2031          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
2032    
2033          redo A;          redo A;
2034        } elsif ($self->{next_char} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
2035          !!!cp (122);          !!!cp (122);
2036          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
2037          !!!next-input-character;          !!!next-input-character;
2038          redo A;          redo A;
2039        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2040          !!!parse-error (type => 'unclosed tag');          !!!parse-error (type => 'unclosed tag');
2041          if ($self->{current_token}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2042            !!!cp (122.3);            !!!cp (122.3);
2043            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2044          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2045            if ($self->{current_token}->{attributes}) {            if ($self->{ct}->{attributes}) {
2046              !!!cp (122.1);              !!!cp (122.1);
2047              !!!parse-error (type => 'end tag attribute');              !!!parse-error (type => 'end tag attribute');
2048            } else {            } else {
# Line 1930  sub _get_next_token ($) { Line 2050  sub _get_next_token ($) {
2050              !!!cp (122.2);              !!!cp (122.2);
2051            }            }
2052          } else {          } else {
2053            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2054          }          }
2055          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2056          ## Reconsume.          ## Reconsume.
2057          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
2058          redo A;          redo A;
2059        } else {        } else {
2060          !!!cp ('124.1');          !!!cp ('124.1');
# Line 1944  sub _get_next_token ($) { Line 2064  sub _get_next_token ($) {
2064          redo A;          redo A;
2065        }        }
2066      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2067        if ($self->{next_char} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2068          if ($self->{current_token}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
2069            !!!cp ('124.2');            !!!cp ('124.2');
2070            !!!parse-error (type => 'nestc', token => $self->{current_token});            !!!parse-error (type => 'nestc', token => $self->{ct});
2071            ## TODO: Different type than slash in start tag            ## TODO: Different type than slash in start tag
2072            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2073            if ($self->{current_token}->{attributes}) {            if ($self->{ct}->{attributes}) {
2074              !!!cp ('124.4');              !!!cp ('124.4');
2075              !!!parse-error (type => 'end tag attribute');              !!!parse-error (type => 'end tag attribute');
2076            } else {            } else {
# Line 1965  sub _get_next_token ($) { Line 2085  sub _get_next_token ($) {
2085          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2086          !!!next-input-character;          !!!next-input-character;
2087    
2088          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
2089    
2090          redo A;          redo A;
2091        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2092          !!!parse-error (type => 'unclosed tag');          !!!parse-error (type => 'unclosed tag');
2093          if ($self->{current_token}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2094            !!!cp (124.7);            !!!cp (124.7);
2095            $self->{last_emitted_start_tag_name} = $self->{current_token}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2096          } elsif ($self->{current_token}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2097            if ($self->{current_token}->{attributes}) {            if ($self->{ct}->{attributes}) {
2098              !!!cp (124.5);              !!!cp (124.5);
2099              !!!parse-error (type => 'end tag attribute');              !!!parse-error (type => 'end tag attribute');
2100            } else {            } else {
# Line 1982  sub _get_next_token ($) { Line 2102  sub _get_next_token ($) {
2102              !!!cp (124.6);              !!!cp (124.6);
2103            }            }
2104          } else {          } else {
2105            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2106          }          }
2107          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2108          ## Reconsume.          ## Reconsume.
2109          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
2110          redo A;          redo A;
2111        } else {        } else {
2112          !!!cp ('124.4');          !!!cp ('124.4');
# Line 1998  sub _get_next_token ($) { Line 2118  sub _get_next_token ($) {
2118        }        }
2119      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2120        ## (only happen if PCDATA state)        ## (only happen if PCDATA state)
         
       ## NOTE: Set by the previous state  
       #my $token = {type => COMMENT_TOKEN, data => ''};  
2121    
2122        BC: {        ## NOTE: Unlike spec's "bogus comment state", this implementation
2123          if ($self->{next_char} == 0x003E) { # >        ## consumes characters on a one-by-one basis.
2124            !!!cp (124);        
2125            $self->{state} = DATA_STATE;        if ($self->{nc} == 0x003E) { # >
2126            !!!next-input-character;          !!!cp (124);
2127            $self->{state} = DATA_STATE;
2128            !!!emit ($self->{current_token}); # comment          !!!next-input-character;
   
           redo A;  
         } elsif ($self->{next_char} == -1) {  
           !!!cp (125);  
           $self->{state} = DATA_STATE;  
           ## reconsume  
2129    
2130            !!!emit ($self->{current_token}); # comment          !!!emit ($self->{ct}); # comment
2131            redo A;
2132          } elsif ($self->{nc} == -1) {
2133            !!!cp (125);
2134            $self->{state} = DATA_STATE;
2135            ## reconsume
2136    
2137            redo A;          !!!emit ($self->{ct}); # comment
2138          } else {          redo A;
2139            !!!cp (126);        } else {
2140            $self->{current_token}->{data} .= chr ($self->{next_char}); # comment          !!!cp (126);
2141            !!!next-input-character;          $self->{ct}->{data} .= chr ($self->{nc}); # comment
2142            redo BC;          $self->{read_until}->($self->{ct}->{data},
2143          }                                q[>],
2144        } # BC                                length $self->{ct}->{data});
2145    
2146        die "$0: _get_next_token: unexpected case [BC]";          ## Stay in the state.
2147            !!!next-input-character;
2148            redo A;
2149          }
2150      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2151        ## (only happen if PCDATA state)        ## (only happen if PCDATA state)
2152                
2153        if ($self->{next_char} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2154          !!!cp (133);          !!!cp (133);
2155          $self->{state} = MD_HYPHEN_STATE;          $self->{state} = MD_HYPHEN_STATE;
2156          !!!next-input-character;          !!!next-input-character;
2157          redo A;          redo A;
2158        } elsif ($self->{next_char} == 0x0044 or # D        } elsif ($self->{nc} == 0x0044 or # D
2159                 $self->{next_char} == 0x0064) { # d                 $self->{nc} == 0x0064) { # d
2160          ## ASCII case-insensitive.          ## ASCII case-insensitive.
2161          !!!cp (130);          !!!cp (130);
2162          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
2163          $self->{state_keyword} = chr $self->{next_char};          $self->{s_kwd} = chr $self->{nc};
2164          !!!next-input-character;          !!!next-input-character;
2165          redo A;          redo A;
2166        } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2167                 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and                 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
2168                 $self->{next_char} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2169          !!!cp (135.4);                          !!!cp (135.4);                
2170          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
2171          $self->{state_keyword} = '[';          $self->{s_kwd} = '[';
2172          !!!next-input-character;          !!!next-input-character;
2173          redo A;          redo A;
2174        } else {        } else {
# Line 2061  sub _get_next_token ($) { Line 2180  sub _get_next_token ($) {
2180                        column => $self->{column_prev} - 1);                        column => $self->{column_prev} - 1);
2181        ## Reconsume.        ## Reconsume.
2182        $self->{state} = BOGUS_COMMENT_STATE;        $self->{state} = BOGUS_COMMENT_STATE;
2183        $self->{current_token} = {type => COMMENT_TOKEN, data => '',        $self->{ct} = {type => COMMENT_TOKEN, data => '',
2184                                  line => $self->{line_prev},                                  line => $self->{line_prev},
2185                                  column => $self->{column_prev} - 1,                                  column => $self->{column_prev} - 1,
2186                                 };                                 };
2187        redo A;        redo A;
2188      } elsif ($self->{state} == MD_HYPHEN_STATE) {      } elsif ($self->{state} == MD_HYPHEN_STATE) {
2189        if ($self->{next_char} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2190          !!!cp (127);          !!!cp (127);
2191          $self->{current_token} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
2192                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2193                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
2194                                   };                                   };
# Line 2083  sub _get_next_token ($) { Line 2202  sub _get_next_token ($) {
2202                          column => $self->{column_prev} - 2);                          column => $self->{column_prev} - 2);
2203          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2204          ## Reconsume.          ## Reconsume.
2205          $self->{current_token} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2206                                    data => '-',                                    data => '-',
2207                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2208                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
# Line 2092  sub _get_next_token ($) { Line 2211  sub _get_next_token ($) {
2211        }        }
2212      } elsif ($self->{state} == MD_DOCTYPE_STATE) {      } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2213        ## ASCII case-insensitive.        ## ASCII case-insensitive.
2214        if ($self->{next_char} == [        if ($self->{nc} == [
2215              undef,              undef,
2216              0x004F, # O              0x004F, # O
2217              0x0043, # C              0x0043, # C
2218              0x0054, # T              0x0054, # T
2219              0x0059, # Y              0x0059, # Y
2220              0x0050, # P              0x0050, # P
2221            ]->[length $self->{state_keyword}] or            ]->[length $self->{s_kwd}] or
2222            $self->{next_char} == [            $self->{nc} == [
2223              undef,              undef,
2224              0x006F, # o              0x006F, # o
2225              0x0063, # c              0x0063, # c
2226              0x0074, # t              0x0074, # t
2227              0x0079, # y              0x0079, # y
2228              0x0070, # p              0x0070, # p
2229            ]->[length $self->{state_keyword}]) {            ]->[length $self->{s_kwd}]) {
2230          !!!cp (131);          !!!cp (131);
2231          ## Stay in the state.          ## Stay in the state.
2232          $self->{state_keyword} .= chr $self->{next_char};          $self->{s_kwd} .= chr $self->{nc};
2233          !!!next-input-character;          !!!next-input-character;
2234          redo A;          redo A;
2235        } elsif ((length $self->{state_keyword}) == 6 and        } elsif ((length $self->{s_kwd}) == 6 and
2236                 ($self->{next_char} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2237                  $self->{next_char} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2238          !!!cp (129);          !!!cp (129);
2239          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
2240          $self->{current_token} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
2241                                    quirks => 1,                                    quirks => 1,
2242                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2243                                    column => $self->{column_prev} - 7,                                    column => $self->{column_prev} - 7,
# Line 2129  sub _get_next_token ($) { Line 2248  sub _get_next_token ($) {
2248          !!!cp (132);                  !!!cp (132);        
2249          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
2250                          line => $self->{line_prev},                          line => $self->{line_prev},
2251                          column => $self->{column_prev} - 1 - length $self->{state_keyword});                          column => $self->{column_prev} - 1 - length $self->{s_kwd});
2252          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2253          ## Reconsume.          ## Reconsume.
2254          $self->{current_token} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2255                                    data => $self->{state_keyword},                                    data => $self->{s_kwd},
2256                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2257                                    column => $self->{column_prev} - 1 - length $self->{state_keyword},                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},
2258                                   };                                   };
2259          redo A;          redo A;
2260        }        }
2261      } elsif ($self->{state} == MD_CDATA_STATE) {      } elsif ($self->{state} == MD_CDATA_STATE) {
2262        if ($self->{next_char} == {        if ($self->{nc} == {
2263              '[' => 0x0043, # C              '[' => 0x0043, # C
2264              '[C' => 0x0044, # D              '[C' => 0x0044, # D
2265              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
2266              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
2267              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
2268            }->{$self->{state_keyword}}) {            }->{$self->{s_kwd}}) {
2269          !!!cp (135.1);          !!!cp (135.1);
2270          ## Stay in the state.          ## Stay in the state.
2271          $self->{state_keyword} .= chr $self->{next_char};          $self->{s_kwd} .= chr $self->{nc};
2272          !!!next-input-character;          !!!next-input-character;
2273          redo A;          redo A;
2274        } elsif ($self->{state_keyword} eq '[CDATA' and        } elsif ($self->{s_kwd} eq '[CDATA' and
2275                 $self->{next_char} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2276          !!!cp (135.2);          !!!cp (135.2);
2277          $self->{current_token} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
2278                                    data => '',                                    data => '',
2279                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2280                                    column => $self->{column_prev} - 7};                                    column => $self->{column_prev} - 7};
# Line 2166  sub _get_next_token ($) { Line 2285  sub _get_next_token ($) {
2285          !!!cp (135.3);          !!!cp (135.3);
2286          !!!parse-error (type => 'bogus comment',          !!!parse-error (type => 'bogus comment',
2287                          line => $self->{line_prev},                          line => $self->{line_prev},
2288                          column => $self->{column_prev} - 1 - length $self->{state_keyword});                          column => $self->{column_prev} - 1 - length $self->{s_kwd});
2289          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2290          ## Reconsume.          ## Reconsume.
2291          $self->{current_token} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2292                                    data => $self->{state_keyword},                                    data => $self->{s_kwd},
2293                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2294                                    column => $self->{column_prev} - 1 - length $self->{state_keyword},                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},
2295                                   };                                   };
2296          redo A;          redo A;
2297        }        }
2298      } elsif ($self->{state} == COMMENT_START_STATE) {      } elsif ($self->{state} == COMMENT_START_STATE) {
2299        if ($self->{next_char} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2300          !!!cp (137);          !!!cp (137);
2301          $self->{state} = COMMENT_START_DASH_STATE;          $self->{state} = COMMENT_START_DASH_STATE;
2302          !!!next-input-character;          !!!next-input-character;
2303          redo A;          redo A;
2304        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2305          !!!cp (138);          !!!cp (138);
2306          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
2307          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2308          !!!next-input-character;          !!!next-input-character;
2309    
2310          !!!emit ($self->{current_token}); # comment          !!!emit ($self->{ct}); # comment
2311    
2312          redo A;          redo A;
2313        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2314          !!!cp (139);          !!!cp (139);
2315          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2316          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2317          ## reconsume          ## reconsume
2318    
2319          !!!emit ($self->{current_token}); # comment          !!!emit ($self->{ct}); # comment
2320    
2321          redo A;          redo A;
2322        } else {        } else {
2323          !!!cp (140);          !!!cp (140);
2324          $self->{current_token}->{data} # comment          $self->{ct}->{data} # comment
2325              .= chr ($self->{next_char});              .= chr ($self->{nc});
2326          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
2327          !!!next-input-character;          !!!next-input-character;
2328          redo A;          redo A;
2329        }        }
2330      } elsif ($self->{state} == COMMENT_START_DASH_STATE) {      } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2331        if ($self->{next_char} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2332          !!!cp (141);          !!!cp (141);
2333          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
2334          !!!next-input-character;          !!!next-input-character;
2335          redo A;          redo A;
2336        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2337          !!!cp (142);          !!!cp (142);
2338          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
2339          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2340          !!!next-input-character;          !!!next-input-character;
2341    
2342          !!!emit ($self->{current_token}); # comment          !!!emit ($self->{ct}); # comment
2343    
2344          redo A;          redo A;
2345        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2346          !!!cp (143);          !!!cp (143);
2347          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2348          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2349          ## reconsume          ## reconsume
2350    
2351          !!!emit ($self->{current_token}); # comment          !!!emit ($self->{ct}); # comment
2352    
2353          redo A;          redo A;
2354        } else {        } else {
2355          !!!cp (144);          !!!cp (144);
2356          $self->{current_token}->{data} # comment          $self->{ct}->{data} # comment
2357              .= '-' . chr ($self->{next_char});              .= '-' . chr ($self->{nc});
2358          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
2359          !!!next-input-character;          !!!next-input-character;
2360          redo A;          redo A;
2361        }        }
2362      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
2363        if ($self->{next_char} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2364          !!!cp (145);          !!!cp (145);
2365          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
2366          !!!next-input-character;          !!!next-input-character;
2367          redo A;          redo A;
2368        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2369          !!!cp (146);          !!!cp (146);
2370          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2371          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2372          ## reconsume          ## reconsume
2373    
2374          !!!emit ($self->{current_token}); # comment          !!!emit ($self->{ct}); # comment
2375    
2376          redo A;          redo A;
2377        } else {        } else {
2378          !!!cp (147);          !!!cp (147);
2379          $self->{current_token}->{data} .= chr ($self->{next_char}); # comment          $self->{ct}->{data} .= chr ($self->{nc}); # comment
2380            $self->{read_until}->($self->{ct}->{data},
2381                                  q[-],
2382                                  length $self->{ct}->{data});
2383    
2384          ## Stay in the state          ## Stay in the state
2385          !!!next-input-character;          !!!next-input-character;
2386          redo A;          redo A;
2387        }        }
2388      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2389        if ($self->{next_char} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2390          !!!cp (148);          !!!cp (148);
2391          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
2392          !!!next-input-character;          !!!next-input-character;
2393          redo A;          redo A;
2394        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2395          !!!cp (149);          !!!cp (149);
2396          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2397          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2398          ## reconsume          ## reconsume
2399    
2400          !!!emit ($self->{current_token}); # comment          !!!emit ($self->{ct}); # comment
2401    
2402          redo A;          redo A;
2403        } else {        } else {
2404          !!!cp (150);          !!!cp (150);
2405          $self->{current_token}->{data} .= '-' . chr ($self->{next_char}); # comment          $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2406          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
2407          !!!next-input-character;          !!!next-input-character;
2408          redo A;          redo A;
2409        }        }
2410      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
2411        if ($self->{next_char} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2412          !!!cp (151);          !!!cp (151);
2413          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2414          !!!next-input-character;          !!!next-input-character;
2415    
2416          !!!emit ($self->{current_token}); # comment          !!!emit ($self->{ct}); # comment
2417    
2418          redo A;          redo A;
2419        } elsif ($self->{next_char} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2420          !!!cp (152);          !!!cp (152);
2421          !!!parse-error (type => 'dash in comment',          !!!parse-error (type => 'dash in comment',
2422                          line => $self->{line_prev},                          line => $self->{line_prev},
2423                          column => $self->{column_prev});                          column => $self->{column_prev});
2424          $self->{current_token}->{data} .= '-'; # comment          $self->{ct}->{data} .= '-'; # comment
2425          ## Stay in the state          ## Stay in the state
2426          !!!next-input-character;          !!!next-input-character;
2427          redo A;          redo A;
2428        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2429          !!!cp (153);          !!!cp (153);
2430          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
2431          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2432          ## reconsume          ## reconsume
2433    
2434          !!!emit ($self->{current_token}); # comment          !!!emit ($self->{ct}); # comment
2435    
2436          redo A;          redo A;
2437        } else {        } else {
# Line 2316  sub _get_next_token ($) { Line 2439  sub _get_next_token ($) {
2439          !!!parse-error (type => 'dash in comment',          !!!parse-error (type => 'dash in comment',
2440                          line => $self->{line_prev},                          line => $self->{line_prev},
2441                          column => $self->{column_prev});                          column => $self->{column_prev});
2442          $self->{current_token}->{data} .= '--' . chr ($self->{next_char}); # comment          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2443          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
2444          !!!next-input-character;          !!!next-input-character;
2445          redo A;          redo A;
2446        }        }
2447      } elsif ($self->{state} == DOCTYPE_STATE) {      } elsif ($self->{state} == DOCTYPE_STATE) {
2448        if ($self->{next_char} == 0x0009 or # HT        if ($is_space->{$self->{nc}}) {
           $self->{next_char} == 0x000A or # LF  
           $self->{next_char} == 0x000B or # VT  
           $self->{next_char} == 0x000C or # FF  
           $self->{next_char} == 0x0020) { # SP  
2449          !!!cp (155);          !!!cp (155);
2450          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2451          !!!next-input-character;          !!!next-input-character;
# Line 2339  sub _get_next_token ($) { Line 2458  sub _get_next_token ($) {
2458          redo A;          redo A;
2459        }        }
2460      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2461        if ($self->{next_char} == 0x0009 or # HT        if ($is_space->{$self->{nc}}) {
           $self->{next_char} == 0x000A or # LF  
           $self->{next_char} == 0x000B or # VT  
           $self->{next_char} == 0x000C or # FF  
           $self->{next_char} == 0x0020) { # SP  
2462          !!!cp (157);          !!!cp (157);
2463          ## Stay in the state          ## Stay in the state
2464          !!!next-input-character;          !!!next-input-character;
2465          redo A;          redo A;
2466        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2467          !!!cp (158);          !!!cp (158);
2468          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
2469          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2470          !!!next-input-character;          !!!next-input-character;
2471    
2472          !!!emit ($self->{current_token}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
2473    
2474          redo A;          redo A;
2475        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2476          !!!cp (159);          !!!cp (159);
2477          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
2478          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2479          ## reconsume          ## reconsume
2480    
2481          !!!emit ($self->{current_token}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
2482    
2483          redo A;          redo A;
2484        } else {        } else {
2485          !!!cp (160);          !!!cp (160);
2486          $self->{current_token}->{name} = chr $self->{next_char};          $self->{ct}->{name} = chr $self->{nc};
2487          delete $self->{current_token}->{quirks};          delete $self->{ct}->{quirks};
2488  ## ISSUE: "Set the token's name name to the" in the spec  ## ISSUE: "Set the token's name name to the" in the spec
2489          $self->{state} = DOCTYPE_NAME_STATE;          $self->{state} = DOCTYPE_NAME_STATE;
2490          !!!next-input-character;          !!!next-input-character;
# Line 2377  sub _get_next_token ($) { Line 2492  sub _get_next_token ($) {
2492        }        }
2493      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2494  ## ISSUE: Redundant "First," in the spec.  ## ISSUE: Redundant "First," in the spec.
2495        if ($self->{next_char} == 0x0009 or # HT        if ($is_space->{$self->{nc}}) {
           $self->{next_char} == 0x000A or # LF  
           $self->{next_char} == 0x000B or # VT  
           $self->{next_char} == 0x000C or # FF  
           $self->{next_char} == 0x0020) { # SP  
2496          !!!cp (161);          !!!cp (161);
2497          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2498          !!!next-input-character;          !!!next-input-character;
2499          redo A;          redo A;
2500        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2501          !!!cp (162);          !!!cp (162);
2502          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2503          !!!next-input-character;          !!!next-input-character;
2504    
2505          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2506    
2507          redo A;          redo A;
2508        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2509          !!!cp (163);          !!!cp (163);
2510          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2511          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2512          ## reconsume          ## reconsume
2513    
2514          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2515          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2516    
2517          redo A;          redo A;
2518        } else {        } else {
2519          !!!cp (164);          !!!cp (164);
2520          $self->{current_token}->{name}          $self->{ct}->{name}
2521            .= chr ($self->{next_char}); # DOCTYPE            .= chr ($self->{nc}); # DOCTYPE
2522          ## Stay in the state          ## Stay in the state
2523          !!!next-input-character;          !!!next-input-character;
2524          redo A;          redo A;
2525        }        }
2526      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2527        if ($self->{next_char} == 0x0009 or # HT        if ($is_space->{$self->{nc}}) {
           $self->{next_char} == 0x000A or # LF  
           $self->{next_char} == 0x000B or # VT  
           $self->{next_char} == 0x000C or # FF  
           $self->{next_char} == 0x0020) { # SP  
2528          !!!cp (165);          !!!cp (165);
2529          ## Stay in the state          ## Stay in the state
2530          !!!next-input-character;          !!!next-input-character;
2531          redo A;          redo A;
2532        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2533          !!!cp (166);          !!!cp (166);
2534          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2535          !!!next-input-character;          !!!next-input-character;
2536    
2537          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2538    
2539          redo A;          redo A;
2540        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2541          !!!cp (167);          !!!cp (167);
2542          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2543          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2544          ## reconsume          ## reconsume
2545    
2546          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2547          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2548    
2549          redo A;          redo A;
2550        } elsif ($self->{next_char} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
2551                 $self->{next_char} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
2552          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
2553          $self->{state_keyword} = chr $self->{next_char};          $self->{s_kwd} = chr $self->{nc};
2554          !!!next-input-character;          !!!next-input-character;
2555          redo A;          redo A;
2556        } elsif ($self->{next_char} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
2557                 $self->{next_char} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
2558          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
2559          $self->{state_keyword} = chr $self->{next_char};          $self->{s_kwd} = chr $self->{nc};
2560          !!!next-input-character;          !!!next-input-character;
2561          redo A;          redo A;
2562        } else {        } else {
2563          !!!cp (180);          !!!cp (180);
2564          !!!parse-error (type => 'string after DOCTYPE name');          !!!parse-error (type => 'string after DOCTYPE name');
2565          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2566    
2567          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
2568          !!!next-input-character;          !!!next-input-character;
# Line 2463  sub _get_next_token ($) { Line 2570  sub _get_next_token ($) {
2570        }        }
2571      } elsif ($self->{state} == PUBLIC_STATE) {      } elsif ($self->{state} == PUBLIC_STATE) {
2572        ## ASCII case-insensitive        ## ASCII case-insensitive
2573        if ($self->{next_char} == [        if ($self->{nc} == [
2574              undef,              undef,
2575              0x0055, # U              0x0055, # U
2576              0x0042, # B              0x0042, # B
2577              0x004C, # L              0x004C, # L
2578              0x0049, # I              0x0049, # I
2579            ]->[length $self->{state_keyword}] or            ]->[length $self->{s_kwd}] or
2580            $self->{next_char} == [            $self->{nc} == [
2581              undef,              undef,
2582              0x0075, # u              0x0075, # u
2583              0x0062, # b              0x0062, # b
2584              0x006C, # l              0x006C, # l
2585              0x0069, # i              0x0069, # i
2586            ]->[length $self->{state_keyword}]) {            ]->[length $self->{s_kwd}]) {
2587          !!!cp (175);          !!!cp (175);
2588          ## Stay in the state.          ## Stay in the state.
2589          $self->{state_keyword} .= chr $self->{next_char};          $self->{s_kwd} .= chr $self->{nc};
2590          !!!next-input-character;          !!!next-input-character;
2591          redo A;          redo A;
2592        } elsif ((length $self->{state_keyword}) == 5 and        } elsif ((length $self->{s_kwd}) == 5 and
2593                 ($self->{next_char} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
2594                  $self->{next_char} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
2595          !!!cp (168);          !!!cp (168);
2596          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2597          !!!next-input-character;          !!!next-input-character;
# Line 2493  sub _get_next_token ($) { Line 2600  sub _get_next_token ($) {
2600          !!!cp (169);          !!!cp (169);
2601          !!!parse-error (type => 'string after DOCTYPE name',          !!!parse-error (type => 'string after DOCTYPE name',
2602                          line => $self->{line_prev},                          line => $self->{line_prev},
2603                          column => $self->{column_prev} + 1 - length $self->{state_keyword});                          column => $self->{column_prev} + 1 - length $self->{s_kwd});
2604          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2605    
2606          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
2607          ## Reconsume.          ## Reconsume.
# Line 2502  sub _get_next_token ($) { Line 2609  sub _get_next_token ($) {
2609        }        }
2610      } elsif ($self->{state} == SYSTEM_STATE) {      } elsif ($self->{state} == SYSTEM_STATE) {
2611        ## ASCII case-insensitive        ## ASCII case-insensitive
2612        if ($self->{next_char} == [        if ($self->{nc} == [
2613              undef,              undef,
2614              0x0059, # Y              0x0059, # Y
2615              0x0053, # S              0x0053, # S
2616              0x0054, # T              0x0054, # T
2617              0x0045, # E              0x0045, # E
2618            ]->[length $self->{state_keyword}] or            ]->[length $self->{s_kwd}] or
2619            $self->{next_char} == [            $self->{nc} == [
2620              undef,              undef,
2621              0x0079, # y              0x0079, # y
2622              0x0073, # s              0x0073, # s
2623              0x0074, # t              0x0074, # t
2624              0x0065, # e              0x0065, # e
2625            ]->[length $self->{state_keyword}]) {            ]->[length $self->{s_kwd}]) {
2626          !!!cp (170);          !!!cp (170);
2627          ## Stay in the state.          ## Stay in the state.
2628          $self->{state_keyword} .= chr $self->{next_char};          $self->{s_kwd} .= chr $self->{nc};
2629          !!!next-input-character;          !!!next-input-character;
2630          redo A;          redo A;
2631        } elsif ((length $self->{state_keyword}) == 5 and        } elsif ((length $self->{s_kwd}) == 5 and
2632                 ($self->{next_char} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
2633                  $self->{next_char} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
2634          !!!cp (171);          !!!cp (171);
2635          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2636          !!!next-input-character;          !!!next-input-character;
# Line 2532  sub _get_next_token ($) { Line 2639  sub _get_next_token ($) {
2639          !!!cp (172);          !!!cp (172);
2640          !!!parse-error (type => 'string after DOCTYPE name',          !!!parse-error (type => 'string after DOCTYPE name',
2641                          line => $self->{line_prev},                          line => $self->{line_prev},
2642                          column => $self->{column_prev} + 1 - length $self->{state_keyword});                          column => $self->{column_prev} + 1 - length $self->{s_kwd});
2643          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2644    
2645          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
2646          ## Reconsume.          ## Reconsume.
2647          redo A;          redo A;
2648        }        }
2649      } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2650        if ({        if ($is_space->{$self->{nc}}) {
             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,  
             #0x000D => 1, # HT, LF, VT, FF, SP, CR  
           }->{$self->{next_char}}) {  
2651          !!!cp (181);          !!!cp (181);
2652          ## Stay in the state          ## Stay in the state
2653          !!!next-input-character;          !!!next-input-character;
2654          redo A;          redo A;
2655        } elsif ($self->{next_char} eq 0x0022) { # "        } elsif ($self->{nc} eq 0x0022) { # "
2656          !!!cp (182);          !!!cp (182);
2657          $self->{current_token}->{public_identifier} = ''; # DOCTYPE          $self->{ct}->{pubid} = ''; # DOCTYPE
2658          $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2659          !!!next-input-character;          !!!next-input-character;
2660          redo A;          redo A;
2661        } elsif ($self->{next_char} eq 0x0027) { # '        } elsif ($self->{nc} eq 0x0027) { # '
2662          !!!cp (183);          !!!cp (183);
2663          $self->{current_token}->{public_identifier} = ''; # DOCTYPE          $self->{ct}->{pubid} = ''; # DOCTYPE
2664          $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2665          !!!next-input-character;          !!!next-input-character;
2666          redo A;          redo A;
2667        } elsif ($self->{next_char} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
2668          !!!cp (184);          !!!cp (184);
2669          !!!parse-error (type => 'no PUBLIC literal');          !!!parse-error (type => 'no PUBLIC literal');
2670    
2671          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2672          !!!next-input-character;          !!!next-input-character;
2673    
2674          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2675          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2676    
2677          redo A;          redo A;
2678        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2679          !!!cp (185);          !!!cp (185);
2680          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2681    
2682          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2683          ## reconsume          ## reconsume
2684    
2685          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2686          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2687    
2688          redo A;          redo A;
2689        } else {        } else {
2690          !!!cp (186);          !!!cp (186);
2691          !!!parse-error (type => 'string after PUBLIC');          !!!parse-error (type => 'string after PUBLIC');
2692          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2693    
2694          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
2695          !!!next-input-character;          !!!next-input-character;
2696          redo A;          redo A;
2697        }        }
2698      } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2699        if ($self->{next_char} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
2700          !!!cp (187);          !!!cp (187);
2701          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2702          !!!next-input-character;          !!!next-input-character;
2703          redo A;          redo A;
2704        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2705          !!!cp (188);          !!!cp (188);
2706          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2707    
2708          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2709          !!!next-input-character;          !!!next-input-character;
2710    
2711          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2712          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2713    
2714          redo A;          redo A;
2715        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2716          !!!cp (189);          !!!cp (189);
2717          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2718    
2719          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2720          ## reconsume          ## reconsume
2721    
2722          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2723          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2724    
2725          redo A;          redo A;
2726        } else {        } else {
2727          !!!cp (190);          !!!cp (190);
2728          $self->{current_token}->{public_identifier} # DOCTYPE          $self->{ct}->{pubid} # DOCTYPE
2729              .= chr $self->{next_char};              .= chr $self->{nc};
2730            $self->{read_until}->($self->{ct}->{pubid}, q[">],
2731                                  length $self->{ct}->{pubid});
2732    
2733          ## Stay in the state          ## Stay in the state
2734          !!!next-input-character;          !!!next-input-character;
2735          redo A;          redo A;
2736        }        }
2737      } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2738        if ($self->{next_char} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
2739          !!!cp (191);          !!!cp (191);
2740          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2741          !!!next-input-character;          !!!next-input-character;
2742          redo A;          redo A;
2743        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2744          !!!cp (192);          !!!cp (192);
2745          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2746    
2747          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2748          !!!next-input-character;          !!!next-input-character;
2749    
2750          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2751          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2752    
2753          redo A;          redo A;
2754        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2755          !!!cp (193);          !!!cp (193);
2756          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2757    
2758          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2759          ## reconsume          ## reconsume
2760    
2761          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2762          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2763    
2764          redo A;          redo A;
2765        } else {        } else {
2766          !!!cp (194);          !!!cp (194);
2767          $self->{current_token}->{public_identifier} # DOCTYPE          $self->{ct}->{pubid} # DOCTYPE
2768              .= chr $self->{next_char};              .= chr $self->{nc};
2769            $self->{read_until}->($self->{ct}->{pubid}, q['>],
2770                                  length $self->{ct}->{pubid});
2771    
2772          ## Stay in the state          ## Stay in the state
2773          !!!next-input-character;          !!!next-input-character;
2774          redo A;          redo A;
2775        }        }
2776      } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2777        if ({        if ($is_space->{$self->{nc}}) {
             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,  
             #0x000D => 1, # HT, LF, VT, FF, SP, CR  
           }->{$self->{next_char}}) {  
2778          !!!cp (195);          !!!cp (195);
2779          ## Stay in the state          ## Stay in the state
2780          !!!next-input-character;          !!!next-input-character;
2781          redo A;          redo A;
2782        } elsif ($self->{next_char} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
2783          !!!cp (196);          !!!cp (196);
2784          $self->{current_token}->{system_identifier} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE
2785          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2786          !!!next-input-character;          !!!next-input-character;
2787          redo A;          redo A;
2788        } elsif ($self->{next_char} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
2789          !!!cp (197);          !!!cp (197);
2790          $self->{current_token}->{system_identifier} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE
2791          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2792          !!!next-input-character;          !!!next-input-character;
2793          redo A;          redo A;
2794        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2795          !!!cp (198);          !!!cp (198);
2796          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2797          !!!next-input-character;          !!!next-input-character;
2798    
2799          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2800    
2801          redo A;          redo A;
2802        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2803          !!!cp (199);          !!!cp (199);
2804          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2805    
2806          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2807          ## reconsume          ## reconsume
2808    
2809          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2810          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2811    
2812          redo A;          redo A;
2813        } else {        } else {
2814          !!!cp (200);          !!!cp (200);
2815          !!!parse-error (type => 'string after PUBLIC literal');          !!!parse-error (type => 'string after PUBLIC literal');
2816          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2817    
2818          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
2819          !!!next-input-character;          !!!next-input-character;
2820          redo A;          redo A;
2821        }        }
2822      } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2823        if ({        if ($is_space->{$self->{nc}}) {
             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,  
             #0x000D => 1, # HT, LF, VT, FF, SP, CR  
           }->{$self->{next_char}}) {  
2824          !!!cp (201);          !!!cp (201);
2825          ## Stay in the state          ## Stay in the state
2826          !!!next-input-character;          !!!next-input-character;
2827          redo A;          redo A;
2828        } elsif ($self->{next_char} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
2829          !!!cp (202);          !!!cp (202);
2830          $self->{current_token}->{system_identifier} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE
2831          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2832          !!!next-input-character;          !!!next-input-character;
2833          redo A;          redo A;
2834        } elsif ($self->{next_char} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
2835          !!!cp (203);          !!!cp (203);
2836          $self->{current_token}->{system_identifier} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE
2837          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2838          !!!next-input-character;          !!!next-input-character;
2839          redo A;          redo A;
2840        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2841          !!!cp (204);          !!!cp (204);
2842          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
2843          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2844          !!!next-input-character;          !!!next-input-character;
2845    
2846          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2847          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2848    
2849          redo A;          redo A;
2850        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2851          !!!cp (205);          !!!cp (205);
2852          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2853    
2854          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2855          ## reconsume          ## reconsume
2856    
2857          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2858          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2859    
2860          redo A;          redo A;
2861        } else {        } else {
2862          !!!cp (206);          !!!cp (206);
2863          !!!parse-error (type => 'string after SYSTEM');          !!!parse-error (type => 'string after SYSTEM');
2864          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2865    
2866          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
2867          !!!next-input-character;          !!!next-input-character;
2868          redo A;          redo A;
2869        }        }
2870      } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2871        if ($self->{next_char} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
2872          !!!cp (207);          !!!cp (207);
2873          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2874          !!!next-input-character;          !!!next-input-character;
2875          redo A;          redo A;
2876        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2877          !!!cp (208);          !!!cp (208);
2878          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2879    
2880          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2881          !!!next-input-character;          !!!next-input-character;
2882    
2883          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2884          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2885    
2886          redo A;          redo A;
2887        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2888          !!!cp (209);          !!!cp (209);
2889          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2890    
2891          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2892          ## reconsume          ## reconsume
2893    
2894          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2895          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2896    
2897          redo A;          redo A;
2898        } else {        } else {
2899          !!!cp (210);          !!!cp (210);
2900          $self->{current_token}->{system_identifier} # DOCTYPE          $self->{ct}->{sysid} # DOCTYPE
2901              .= chr $self->{next_char};              .= chr $self->{nc};
2902            $self->{read_until}->($self->{ct}->{sysid}, q[">],
2903                                  length $self->{ct}->{sysid});
2904    
2905          ## Stay in the state          ## Stay in the state
2906          !!!next-input-character;          !!!next-input-character;
2907          redo A;          redo A;
2908        }        }
2909      } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2910        if ($self->{next_char} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
2911          !!!cp (211);          !!!cp (211);
2912          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2913          !!!next-input-character;          !!!next-input-character;
2914          redo A;          redo A;
2915        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2916          !!!cp (212);          !!!cp (212);
2917          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2918    
2919          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2920          !!!next-input-character;          !!!next-input-character;
2921    
2922          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2923          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2924    
2925          redo A;          redo A;
2926        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2927          !!!cp (213);          !!!cp (213);
2928          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2929    
2930          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2931          ## reconsume          ## reconsume
2932    
2933          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2934          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2935    
2936          redo A;          redo A;
2937        } else {        } else {
2938          !!!cp (214);          !!!cp (214);
2939          $self->{current_token}->{system_identifier} # DOCTYPE          $self->{ct}->{sysid} # DOCTYPE
2940              .= chr $self->{next_char};              .= chr $self->{nc};
2941            $self->{read_until}->($self->{ct}->{sysid}, q['>],
2942                                  length $self->{ct}->{sysid});
2943    
2944          ## Stay in the state          ## Stay in the state
2945          !!!next-input-character;          !!!next-input-character;
2946          redo A;          redo A;
2947        }        }
2948      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2949        if ({        if ($is_space->{$self->{nc}}) {
             0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,  
             #0x000D => 1, # HT, LF, VT, FF, SP, CR  
           }->{$self->{next_char}}) {  
2950          !!!cp (215);          !!!cp (215);
2951          ## Stay in the state          ## Stay in the state
2952          !!!next-input-character;          !!!next-input-character;
2953          redo A;          redo A;
2954        } elsif ($self->{next_char} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2955          !!!cp (216);          !!!cp (216);
2956          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2957          !!!next-input-character;          !!!next-input-character;
2958    
2959          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2960    
2961          redo A;          redo A;
2962        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2963          !!!cp (217);          !!!cp (217);
2964          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2965          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2966          ## reconsume          ## reconsume
2967    
2968          $self->{current_token}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2969          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2970    
2971          redo A;          redo A;
2972        } else {        } else {
2973          !!!cp (218);          !!!cp (218);
2974          !!!parse-error (type => 'string after SYSTEM literal');          !!!parse-error (type => 'string after SYSTEM literal');
2975          #$self->{current_token}->{quirks} = 1;          #$self->{ct}->{quirks} = 1;
2976    
2977          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
2978          !!!next-input-character;          !!!next-input-character;
2979          redo A;          redo A;
2980        }        }
2981      } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {      } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2982        if ($self->{next_char} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2983          !!!cp (219);          !!!cp (219);
2984          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2985          !!!next-input-character;          !!!next-input-character;
2986    
2987          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2988    
2989          redo A;          redo A;
2990        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
2991          !!!cp (220);          !!!cp (220);
2992          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2993          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2994          ## reconsume          ## reconsume
2995    
2996          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
2997    
2998          redo A;          redo A;
2999        } else {        } else {
3000          !!!cp (221);          !!!cp (221);
3001            my $s = '';
3002            $self->{read_until}->($s, q[>], 0);
3003    
3004          ## Stay in the state          ## Stay in the state
3005          !!!next-input-character;          !!!next-input-character;
3006          redo A;          redo A;
# Line 2900  sub _get_next_token ($) { Line 3010  sub _get_next_token ($) {
3010        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3011        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
3012                
3013        if ($self->{next_char} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
3014          !!!cp (221.1);          !!!cp (221.1);
3015          $self->{state} = CDATA_SECTION_MSE1_STATE;          $self->{state} = CDATA_SECTION_MSE1_STATE;
3016          !!!next-input-character;          !!!next-input-character;
3017          redo A;          redo A;
3018        } elsif ($self->{next_char} == -1) {        } elsif ($self->{nc} == -1) {
3019          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3020          !!!next-input-character;          !!!next-input-character;
3021          if (length $self->{current_token}->{data}) { # character          if (length $self->{ct}->{data}) { # character
3022            !!!cp (221.2);            !!!cp (221.2);
3023            !!!emit ($self->{current_token}); # character            !!!emit ($self->{ct}); # character
3024          } else {          } else {
3025            !!!cp (221.3);            !!!cp (221.3);
3026            ## No token to emit. $self->{current_token} is discarded.            ## No token to emit. $self->{ct} is discarded.
3027          }                  }        
3028          redo A;          redo A;
3029        } else {        } else {
3030          !!!cp (221.4);          !!!cp (221.4);
3031          $self->{current_token}->{data} .= chr $self->{next_char};          $self->{ct}->{data} .= chr $self->{nc};
3032            $self->{read_until}->($self->{ct}->{data},
3033                                  q<]>,
3034                                  length $self->{ct}->{data});
3035    
3036          ## Stay in the state.          ## Stay in the state.
3037          !!!next-input-character;          !!!next-input-character;
3038          redo A;          redo A;
# Line 2926  sub _get_next_token ($) { Line 3040  sub _get_next_token ($) {
3040    
3041        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
3042      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3043        if ($self->{next_char} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
3044          !!!cp (221.5);          !!!cp (221.5);
3045          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
3046          !!!next-input-character;          !!!next-input-character;
3047          redo A;          redo A;
3048        } else {        } else {
3049          !!!cp (221.6);          !!!cp (221.6);
3050          $self->{current_token}->{data} .= ']';          $self->{ct}->{data} .= ']';
3051          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
3052          ## Reconsume.          ## Reconsume.
3053          redo A;          redo A;
3054        }        }
3055      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3056        if ($self->{next_char} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3057          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3058          !!!next-input-character;          !!!next-input-character;
3059          if (length $self->{current_token}->{data}) { # character          if (length $self->{ct}->{data}) { # character
3060            !!!cp (221.7);            !!!cp (221.7);
3061            !!!emit ($self->{current_token}); # character            !!!emit ($self->{ct}); # character
3062          } else {          } else {
3063            !!!cp (221.8);            !!!cp (221.8);
3064            ## No token to emit. $self->{current_token} is discarded.            ## No token to emit. $self->{ct} is discarded.
3065          }          }
3066          redo A;          redo A;
3067        } elsif ($self->{next_char} == 0x005D) { # ]        } elsif ($self->{nc} == 0x005D) { # ]
3068          !!!cp (221.9); # character          !!!cp (221.9); # character
3069          $self->{current_token}->{data} .= ']'; ## Add first "]" of "]]]".          $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3070          ## Stay in the state.          ## Stay in the state.
3071          !!!next-input-character;          !!!next-input-character;
3072          redo A;          redo A;
3073        } else {        } else {
3074          !!!cp (221.11);          !!!cp (221.11);
3075          $self->{current_token}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
3076          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
3077          ## Reconsume.          ## Reconsume.
3078          redo A;          redo A;
3079        }        }
3080      } else {      } elsif ($self->{state} == ENTITY_STATE) {
3081        die "$0: $self->{state}: Unknown state";        if ($is_space->{$self->{nc}} or
3082      }            {
3083    } # A                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3084                $self->{entity_add} => 1,
3085    die "$0: _get_next_token: unexpected case";            }->{$self->{nc}}) {
3086  } # _get_next_token          !!!cp (1001);
3087            ## Don't consume
3088  sub _tokenize_attempt_to_consume_an_entity ($$$) {          ## No error
3089    my ($self, $in_attr, $additional) = @_;          ## Return nothing.
3090            #
3091          } elsif ($self->{nc} == 0x0023) { # #
3092            !!!cp (999);
3093            $self->{state} = ENTITY_HASH_STATE;
3094            $self->{s_kwd} = '#';
3095            !!!next-input-character;
3096            redo A;
3097          } elsif ((0x0041 <= $self->{nc} and
3098                    $self->{nc} <= 0x005A) or # A..Z
3099                   (0x0061 <= $self->{nc} and
3100                    $self->{nc} <= 0x007A)) { # a..z
3101            !!!cp (998);
3102            require Whatpm::_NamedEntityList;
3103            $self->{state} = ENTITY_NAME_STATE;
3104            $self->{s_kwd} = chr $self->{nc};
3105            $self->{entity__value} = $self->{s_kwd};
3106            $self->{entity__match} = 0;
3107            !!!next-input-character;
3108            redo A;
3109          } else {
3110            !!!cp (1027);
3111            !!!parse-error (type => 'bare ero');
3112            ## Return nothing.
3113            #
3114          }
3115    
3116    my ($l, $c) = ($self->{line_prev}, $self->{column_prev});        ## NOTE: No character is consumed by the "consume a character
3117          ## reference" algorithm.  In other words, there is an "&" character
3118          ## that does not introduce a character reference, which would be
3119          ## appended to the parent element or the attribute value in later
3120          ## process of the tokenizer.
3121    
3122          if ($self->{prev_state} == DATA_STATE) {
3123            !!!cp (997);
3124            $self->{state} = $self->{prev_state};
3125            ## Reconsume.
3126            !!!emit ({type => CHARACTER_TOKEN, data => '&',
3127                      line => $self->{line_prev},
3128                      column => $self->{column_prev},
3129                     });
3130            redo A;
3131          } else {
3132            !!!cp (996);
3133            $self->{ca}->{value} .= '&';
3134            $self->{state} = $self->{prev_state};
3135            ## Reconsume.
3136            redo A;
3137          }
3138        } elsif ($self->{state} == ENTITY_HASH_STATE) {
3139          if ($self->{nc} == 0x0078 or # x
3140              $self->{nc} == 0x0058) { # X
3141            !!!cp (995);
3142            $self->{state} = HEXREF_X_STATE;
3143            $self->{s_kwd} .= chr $self->{nc};
3144            !!!next-input-character;
3145            redo A;
3146          } elsif (0x0030 <= $self->{nc} and
3147                   $self->{nc} <= 0x0039) { # 0..9
3148            !!!cp (994);
3149            $self->{state} = NCR_NUM_STATE;
3150            $self->{s_kwd} = $self->{nc} - 0x0030;
3151            !!!next-input-character;
3152            redo A;
3153          } else {
3154            !!!parse-error (type => 'bare nero',
3155                            line => $self->{line_prev},
3156                            column => $self->{column_prev} - 1);
3157    
3158    if ({          ## NOTE: According to the spec algorithm, nothing is returned,
3159         0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,          ## and then "&#" is appended to the parent element or the attribute
3160         0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR          ## value in the later processing.
3161         $additional => 1,  
3162        }->{$self->{next_char}}) {          if ($self->{prev_state} == DATA_STATE) {
3163      !!!cp (1001);            !!!cp (1019);
3164      ## Don't consume            $self->{state} = $self->{prev_state};
3165      ## No error            ## Reconsume.
3166      return undef;            !!!emit ({type => CHARACTER_TOKEN,
3167    } elsif ($self->{next_char} == 0x0023) { # #                      data => '&#',
3168      !!!next-input-character;                      line => $self->{line_prev},
3169      if ($self->{next_char} == 0x0078 or # x                      column => $self->{column_prev} - 1,
3170          $self->{next_char} == 0x0058) { # X                     });
3171        my $code;            redo A;
       X: {  
         my $x_char = $self->{next_char};  
         !!!next-input-character;  
         if (0x0030 <= $self->{next_char} and  
             $self->{next_char} <= 0x0039) { # 0..9  
           !!!cp (1002);  
           $code ||= 0;  
           $code *= 0x10;  
           $code += $self->{next_char} - 0x0030;  
           redo X;  
         } elsif (0x0061 <= $self->{next_char} and  
                  $self->{next_char} <= 0x0066) { # a..f  
           !!!cp (1003);  
           $code ||= 0;  
           $code *= 0x10;  
           $code += $self->{next_char} - 0x0060 + 9;  
           redo X;  
         } elsif (0x0041 <= $self->{next_char} and  
                  $self->{next_char} <= 0x0046) { # A..F  
           !!!cp (1004);  
           $code ||= 0;  
           $code *= 0x10;  
           $code += $self->{next_char} - 0x0040 + 9;  
           redo X;  
         } elsif (not defined $code) { # no hexadecimal digit  
           !!!cp (1005);  
           !!!parse-error (type => 'bare hcro', line => $l, column => $c);  
           !!!back-next-input-character ($x_char, $self->{next_char});  
           $self->{next_char} = 0x0023; # #  
           return undef;  
         } elsif ($self->{next_char} == 0x003B) { # ;  
           !!!cp (1006);  
           !!!next-input-character;  
3172          } else {          } else {
3173            !!!cp (1007);            !!!cp (993);
3174            !!!parse-error (type => 'no refc', line => $l, column => $c);            $self->{ca}->{value} .= '&#';
3175              $self->{state} = $self->{prev_state};
3176              ## Reconsume.
3177              redo A;
3178          }          }
3179          }
3180          if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {      } elsif ($self->{state} == NCR_NUM_STATE) {
3181            !!!cp (1008);        if (0x0030 <= $self->{nc} and
3182            !!!parse-error (type => 'invalid character reference',            $self->{nc} <= 0x0039) { # 0..9
                           text => (sprintf 'U+%04X', $code),  
                           line => $l, column => $c);  
           $code = 0xFFFD;  
         } elsif ($code > 0x10FFFF) {  
           !!!cp (1009);  
           !!!parse-error (type => 'invalid character reference',  
                           text => (sprintf 'U-%08X', $code),  
                           line => $l, column => $c);  
           $code = 0xFFFD;  
         } elsif ($code == 0x000D) {  
           !!!cp (1010);  
           !!!parse-error (type => 'CR character reference', line => $l, column => $c);  
           $code = 0x000A;  
         } elsif (0x80 <= $code and $code <= 0x9F) {  
           !!!cp (1011);  
           !!!parse-error (type => 'C1 character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c);  
           $code = $c1_entity_char->{$code};  
         }  
   
         return {type => CHARACTER_TOKEN, data => chr $code,  
                 has_reference => 1,  
                 line => $l, column => $c,  
                };  
       } # X  
     } elsif (0x0030 <= $self->{next_char} and  
              $self->{next_char} <= 0x0039) { # 0..9  
       my $code = $self->{next_char} - 0x0030;  
       !!!next-input-character;  
         
       while (0x0030 <= $self->{next_char} and  
                 $self->{next_char} <= 0x0039) { # 0..9  
3183          !!!cp (1012);          !!!cp (1012);
3184          $code *= 10;          $self->{s_kwd} *= 10;
3185          $code += $self->{next_char} - 0x0030;          $self->{s_kwd} += $self->{nc} - 0x0030;
3186                    
3187            ## Stay in the state.
3188          !!!next-input-character;          !!!next-input-character;
3189        }          redo A;
3190          } elsif ($self->{nc} == 0x003B) { # ;
       if ($self->{next_char} == 0x003B) { # ;  
3191          !!!cp (1013);          !!!cp (1013);
3192          !!!next-input-character;          !!!next-input-character;
3193            #
3194        } else {        } else {
3195          !!!cp (1014);          !!!cp (1014);
3196          !!!parse-error (type => 'no refc', line => $l, column => $c);          !!!parse-error (type => 'no refc');
3197            ## Reconsume.
3198            #
3199        }        }
3200    
3201        if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {        my $code = $self->{s_kwd};
3202          my $l = $self->{line_prev};
3203          my $c = $self->{column_prev};
3204          if ($charref_map->{$code}) {
3205          !!!cp (1015);          !!!cp (1015);
3206          !!!parse-error (type => 'invalid character reference',          !!!parse-error (type => 'invalid character reference',
3207                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
3208                          line => $l, column => $c);                          line => $l, column => $c);
3209          $code = 0xFFFD;          $code = $charref_map->{$code};
3210        } elsif ($code > 0x10FFFF) {        } elsif ($code > 0x10FFFF) {
3211          !!!cp (1016);          !!!cp (1016);
3212          !!!parse-error (type => 'invalid character reference',          !!!parse-error (type => 'invalid character reference',
3213                          text => (sprintf 'U-%08X', $code),                          text => (sprintf 'U-%08X', $code),
3214                          line => $l, column => $c);                          line => $l, column => $c);
3215          $code = 0xFFFD;          $code = 0xFFFD;
3216        } elsif ($code == 0x000D) {        }
3217          !!!cp (1017);  
3218          !!!parse-error (type => 'CR character reference',        if ($self->{prev_state} == DATA_STATE) {
3219                          line => $l, column => $c);          !!!cp (992);
3220          $code = 0x000A;          $self->{state} = $self->{prev_state};
3221        } elsif (0x80 <= $code and $code <= 0x9F) {          ## Reconsume.
3222          !!!cp (1018);          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3223          !!!parse-error (type => 'C1 character reference',                    line => $l, column => $c,
3224                     });
3225            redo A;
3226          } else {
3227            !!!cp (991);
3228            $self->{ca}->{value} .= chr $code;
3229            $self->{ca}->{has_reference} = 1;
3230            $self->{state} = $self->{prev_state};
3231            ## Reconsume.
3232            redo A;
3233          }
3234        } elsif ($self->{state} == HEXREF_X_STATE) {
3235          if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3236              (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3237              (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3238            # 0..9, A..F, a..f
3239            !!!cp (990);
3240            $self->{state} = HEXREF_HEX_STATE;
3241            $self->{s_kwd} = 0;
3242            ## Reconsume.
3243            redo A;
3244          } else {
3245            !!!parse-error (type => 'bare hcro',
3246                            line => $self->{line_prev},
3247                            column => $self->{column_prev} - 2);
3248    
3249            ## NOTE: According to the spec algorithm, nothing is returned,
3250            ## and then "&#" followed by "X" or "x" is appended to the parent
3251            ## element or the attribute value in the later processing.
3252    
3253            if ($self->{prev_state} == DATA_STATE) {
3254              !!!cp (1005);
3255              $self->{state} = $self->{prev_state};
3256              ## Reconsume.
3257              !!!emit ({type => CHARACTER_TOKEN,
3258                        data => '&' . $self->{s_kwd},
3259                        line => $self->{line_prev},
3260                        column => $self->{column_prev} - length $self->{s_kwd},
3261                       });
3262              redo A;
3263            } else {
3264              !!!cp (989);
3265              $self->{ca}->{value} .= '&' . $self->{s_kwd};
3266              $self->{state} = $self->{prev_state};
3267              ## Reconsume.
3268              redo A;
3269            }
3270          }
3271        } elsif ($self->{state} == HEXREF_HEX_STATE) {
3272          if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3273            # 0..9
3274            !!!cp (1002);
3275            $self->{s_kwd} *= 0x10;
3276            $self->{s_kwd} += $self->{nc} - 0x0030;
3277            ## Stay in the state.
3278            !!!next-input-character;
3279            redo A;
3280          } elsif (0x0061 <= $self->{nc} and
3281                   $self->{nc} <= 0x0066) { # a..f
3282            !!!cp (1003);
3283            $self->{s_kwd} *= 0x10;
3284            $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
3285            ## Stay in the state.
3286            !!!next-input-character;
3287            redo A;
3288          } elsif (0x0041 <= $self->{nc} and
3289                   $self->{nc} <= 0x0046) { # A..F
3290            !!!cp (1004);
3291            $self->{s_kwd} *= 0x10;
3292            $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
3293            ## Stay in the state.
3294            !!!next-input-character;
3295            redo A;
3296          } elsif ($self->{nc} == 0x003B) { # ;
3297            !!!cp (1006);
3298            !!!next-input-character;
3299            #
3300          } else {
3301            !!!cp (1007);
3302            !!!parse-error (type => 'no refc',
3303                            line => $self->{line},
3304                            column => $self->{column});
3305            ## Reconsume.
3306            #
3307          }
3308    
3309          my $code = $self->{s_kwd};
3310          my $l = $self->{line_prev};
3311          my $c = $self->{column_prev};
3312          if ($charref_map->{$code}) {
3313            !!!cp (1008);
3314            !!!parse-error (type => 'invalid character reference',
3315                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
3316                          line => $l, column => $c);                          line => $l, column => $c);
3317          $code = $c1_entity_char->{$code};          $code = $charref_map->{$code};
3318          } elsif ($code > 0x10FFFF) {
3319            !!!cp (1009);
3320            !!!parse-error (type => 'invalid character reference',
3321                            text => (sprintf 'U-%08X', $code),
3322                            line => $l, column => $c);
3323            $code = 0xFFFD;
3324        }        }
3325          
3326        return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1,        if ($self->{prev_state} == DATA_STATE) {
3327                line => $l, column => $c,          !!!cp (988);
3328               };          $self->{state} = $self->{prev_state};
3329      } else {          ## Reconsume.
3330        !!!cp (1019);          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
3331        !!!parse-error (type => 'bare nero', line => $l, column => $c);                    line => $l, column => $c,
3332        !!!back-next-input-character ($self->{next_char});                   });
3333        $self->{next_char} = 0x0023; # #          redo A;
3334        return undef;        } else {
3335      }          !!!cp (987);
3336    } elsif ((0x0041 <= $self->{next_char} and          $self->{ca}->{value} .= chr $code;
3337              $self->{next_char} <= 0x005A) or          $self->{ca}->{has_reference} = 1;
3338             (0x0061 <= $self->{next_char} and          $self->{state} = $self->{prev_state};
3339              $self->{next_char} <= 0x007A)) {          ## Reconsume.
3340      my $entity_name = chr $self->{next_char};          redo A;
3341      !!!next-input-character;        }
3342        } elsif ($self->{state} == ENTITY_NAME_STATE) {
3343      my $value = $entity_name;        if (length $self->{s_kwd} < 30 and
3344      my $match = 0;            ## NOTE: Some number greater than the maximum length of entity name
3345      require Whatpm::_NamedEntityList;            ((0x0041 <= $self->{nc} and # A
3346      our $EntityChar;              $self->{nc} <= 0x005A) or # Z
3347               (0x0061 <= $self->{nc} and # a
3348      while (length $entity_name < 30 and              $self->{nc} <= 0x007A) or # z
3349             ## NOTE: Some number greater than the maximum length of entity name             (0x0030 <= $self->{nc} and # 0
3350             ((0x0041 <= $self->{next_char} and # A              $self->{nc} <= 0x0039) or # 9
3351               $self->{next_char} <= 0x005A) or # Z             $self->{nc} == 0x003B)) { # ;
3352              (0x0061 <= $self->{next_char} and # a          our $EntityChar;
3353               $self->{next_char} <= 0x007A) or # z          $self->{s_kwd} .= chr $self->{nc};
3354              (0x0030 <= $self->{next_char} and # 0          if (defined $EntityChar->{$self->{s_kwd}}) {
3355               $self->{next_char} <= 0x0039) or # 9            if ($self->{nc} == 0x003B) { # ;
3356              $self->{next_char} == 0x003B)) { # ;              !!!cp (1020);
3357        $entity_name .= chr $self->{next_char};              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
3358        if (defined $EntityChar->{$entity_name}) {              $self->{entity__match} = 1;
3359          if ($self->{next_char} == 0x003B) { # ;              !!!next-input-character;
3360            !!!cp (1020);              #
3361            $value = $EntityChar->{$entity_name};            } else {
3362            $match = 1;              !!!cp (1021);
3363            !!!next-input-character;              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
3364            last;              $self->{entity__match} = -1;
3365                ## Stay in the state.
3366                !!!next-input-character;
3367                redo A;
3368              }
3369          } else {          } else {
3370            !!!cp (1021);            !!!cp (1022);
3371            $value = $EntityChar->{$entity_name};            $self->{entity__value} .= chr $self->{nc};
3372            $match = -1;            $self->{entity__match} *= 2;
3373              ## Stay in the state.
3374            !!!next-input-character;            !!!next-input-character;
3375              redo A;
3376            }
3377          }
3378    
3379          my $data;
3380          my $has_ref;
3381          if ($self->{entity__match} > 0) {
3382            !!!cp (1023);
3383            $data = $self->{entity__value};
3384            $has_ref = 1;
3385            #
3386          } elsif ($self->{entity__match} < 0) {
3387            !!!parse-error (type => 'no refc');
3388            if ($self->{prev_state} != DATA_STATE and # in attribute
3389                $self->{entity__match} < -1) {
3390              !!!cp (1024);
3391              $data = '&' . $self->{s_kwd};
3392              #
3393            } else {
3394              !!!cp (1025);
3395              $data = $self->{entity__value};
3396              $has_ref = 1;
3397              #
3398          }          }
3399        } else {        } else {
3400          !!!cp (1022);          !!!cp (1026);
3401          $value .= chr $self->{next_char};          !!!parse-error (type => 'bare ero',
3402          $match *= 2;                          line => $self->{line_prev},
3403          !!!next-input-character;                          column => $self->{column_prev} - length $self->{s_kwd});
3404            $data = '&' . $self->{s_kwd};
3405            #
3406        }        }
3407      }    
3408              ## NOTE: In these cases, when a character reference is found,
3409      if ($match > 0) {        ## it is consumed and a character token is returned, or, otherwise,
3410        !!!cp (1023);        ## nothing is consumed and returned, according to the spec algorithm.
3411        return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,        ## In this implementation, anything that has been examined by the
3412                line => $l, column => $c,        ## tokenizer is appended to the parent element or the attribute value
3413               };        ## as string, either literal string when no character reference or
3414      } elsif ($match < 0) {        ## entity-replaced string otherwise, in this stage, since any characters
3415        !!!parse-error (type => 'no refc', line => $l, column => $c);        ## that would not be consumed are appended in the data state or in an
3416        if ($in_attr and $match < -1) {        ## appropriate attribute value state anyway.
3417          !!!cp (1024);  
3418          return {type => CHARACTER_TOKEN, data => '&'.$entity_name,        if ($self->{prev_state} == DATA_STATE) {
3419                  line => $l, column => $c,          !!!cp (986);
3420                 };          $self->{state} = $self->{prev_state};
3421        } else {          ## Reconsume.
3422          !!!cp (1025);          !!!emit ({type => CHARACTER_TOKEN,
3423          return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,                    data => $data,
3424                  line => $l, column => $c,                    line => $self->{line_prev},
3425                 };                    column => $self->{column_prev} + 1 - length $self->{s_kwd},
3426                     });
3427            redo A;
3428          } else {
3429            !!!cp (985);
3430            $self->{ca}->{value} .= $data;
3431            $self->{ca}->{has_reference} = 1 if $has_ref;
3432            $self->{state} = $self->{prev_state};
3433            ## Reconsume.
3434            redo A;
3435        }        }
3436      } else {      } else {
3437        !!!cp (1026);        die "$0: $self->{state}: Unknown state";
       !!!parse-error (type => 'bare ero', line => $l, column => $c);  
       ## NOTE: "No characters are consumed" in the spec.  
       return {type => CHARACTER_TOKEN, data => '&'.$value,  
               line => $l, column => $c,  
              };  
3438      }      }
3439    } else {    } # A  
3440      !!!cp (1027);  
3441      ## no characters are consumed    die "$0: _get_next_token: unexpected case";
3442      !!!parse-error (type => 'bare ero', line => $l, column => $c);  } # _get_next_token
     return undef;  
   }  
 } # _tokenize_attempt_to_consume_an_entity  
3443    
3444  sub _initialize_tree_constructor ($) {  sub _initialize_tree_constructor ($) {
3445    my $self = shift;    my $self = shift;
# Line 3254  sub _tree_construction_initial ($) { Line 3506  sub _tree_construction_initial ($) {
3506        $doctype_name = '' unless defined $doctype_name;        $doctype_name = '' unless defined $doctype_name;
3507        $doctype_name =~ tr/a-z/A-Z/; # ASCII case-insensitive        $doctype_name =~ tr/a-z/A-Z/; # ASCII case-insensitive
3508        if (not defined $token->{name} or # <!DOCTYPE>        if (not defined $token->{name} or # <!DOCTYPE>
3509            defined $token->{system_identifier}) {            defined $token->{sysid}) {
3510          !!!cp ('t1');          !!!cp ('t1');
3511          !!!parse-error (type => 'not HTML5', token => $token);          !!!parse-error (type => 'not HTML5', token => $token);
3512        } elsif ($doctype_name ne 'HTML') {        } elsif ($doctype_name ne 'HTML') {
3513          !!!cp ('t2');          !!!cp ('t2');
3514          !!!parse-error (type => 'not HTML5', token => $token);          !!!parse-error (type => 'not HTML5', token => $token);
3515        } elsif (defined $token->{public_identifier}) {        } elsif (defined $token->{pubid}) {
3516          if ($token->{public_identifier} eq 'XSLT-compat') {          if ($token->{pubid} eq 'XSLT-compat') {
3517            !!!cp ('t1.2');            !!!cp ('t1.2');
3518            !!!parse-error (type => 'XSLT-compat', token => $token,            !!!parse-error (type => 'XSLT-compat', token => $token,
3519                            level => $self->{level}->{should});                            level => $self->{level}->{should});
# Line 3277  sub _tree_construction_initial ($) { Line 3529  sub _tree_construction_initial ($) {
3529          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
3530        ## NOTE: Default values for both |public_id| and |system_id| attributes        ## NOTE: Default values for both |public_id| and |system_id| attributes
3531        ## are empty strings, so that we don't set any value in missing cases.        ## are empty strings, so that we don't set any value in missing cases.
3532        $doctype->public_id ($token->{public_identifier})        $doctype->public_id ($token->{pubid}) if defined $token->{pubid};
3533            if defined $token->{public_identifier};        $doctype->system_id ($token->{sysid}) if defined $token->{sysid};
       $doctype->system_id ($token->{system_identifier})  
           if defined $token->{system_identifier};  
3534        ## NOTE: Other DocumentType attributes are null or empty lists.        ## NOTE: Other DocumentType attributes are null or empty lists.
3535        ## ISSUE: internalSubset = null??        ## ISSUE: internalSubset = null??
3536        $self->{document}->append_child ($doctype);        $self->{document}->append_child ($doctype);
# Line 3288  sub _tree_construction_initial ($) { Line 3538  sub _tree_construction_initial ($) {
3538        if ($token->{quirks} or $doctype_name ne 'HTML') {        if ($token->{quirks} or $doctype_name ne 'HTML') {
3539          !!!cp ('t4');          !!!cp ('t4');
3540          $self->{document}->manakai_compat_mode ('quirks');          $self->{document}->manakai_compat_mode ('quirks');
3541        } elsif (defined $token->{public_identifier}) {        } elsif (defined $token->{pubid}) {
3542          my $pubid = $token->{public_identifier};          my $pubid = $token->{pubid};
3543          $pubid =~ tr/a-z/A-z/;          $pubid =~ tr/a-z/A-z/;
3544          my $prefix = [          my $prefix = [
3545            "+//SILMARIL//DTD HTML PRO V0R11 19970101//",            "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
# Line 3363  sub _tree_construction_initial ($) { Line 3613  sub _tree_construction_initial ($) {
3613            $self->{document}->manakai_compat_mode ('quirks');            $self->{document}->manakai_compat_mode ('quirks');
3614          } elsif ($pubid =~ m[^-//W3C//DTD HTML 4.01 FRAMESET//] or          } elsif ($pubid =~ m[^-//W3C//DTD HTML 4.01 FRAMESET//] or
3615                   $pubid =~ m[^-//W3C//DTD HTML 4.01 TRANSITIONAL//]) {                   $pubid =~ m[^-//W3C//DTD HTML 4.01 TRANSITIONAL//]) {
3616            if (defined $token->{system_identifier}) {            if (defined $token->{sysid}) {
3617              !!!cp ('t6');              !!!cp ('t6');
3618              $self->{document}->manakai_compat_mode ('quirks');              $self->{document}->manakai_compat_mode ('quirks');
3619            } else {            } else {
# Line 3380  sub _tree_construction_initial ($) { Line 3630  sub _tree_construction_initial ($) {
3630        } else {        } else {
3631          !!!cp ('t10');          !!!cp ('t10');
3632        }        }
3633        if (defined $token->{system_identifier}) {        if (defined $token->{sysid}) {
3634          my $sysid = $token->{system_identifier};          my $sysid = $token->{sysid};
3635          $sysid =~ tr/A-Z/a-z/;          $sysid =~ tr/A-Z/a-z/;
3636          if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {          if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
3637            ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is            ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"| is
# Line 3411  sub _tree_construction_initial ($) { Line 3661  sub _tree_construction_initial ($) {
3661        !!!ack-later;        !!!ack-later;
3662        return;        return;
3663      } elsif ($token->{type} == CHARACTER_TOKEN) {      } elsif ($token->{type} == CHARACTER_TOKEN) {
3664        if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D        if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
3665          ## Ignore the token          ## Ignore the token
3666    
3667          unless (length $token->{data}) {          unless (length $token->{data}) {
# Line 3468  sub _tree_construction_root_element ($) Line 3718  sub _tree_construction_root_element ($)
3718          !!!next-token;          !!!next-token;
3719          redo B;          redo B;
3720        } elsif ($token->{type} == CHARACTER_TOKEN) {        } elsif ($token->{type} == CHARACTER_TOKEN) {
3721          if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D          if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
3722            ## Ignore the token.            ## Ignore the token.
3723    
3724            unless (length $token->{data}) {            unless (length $token->{data}) {
# Line 4282  sub _tree_construction_main ($) { Line 4532  sub _tree_construction_main ($) {
4532    
4533      if ($self->{insertion_mode} & HEAD_IMS) {      if ($self->{insertion_mode} & HEAD_IMS) {
4534        if ($token->{type} == CHARACTER_TOKEN) {        if ($token->{type} == CHARACTER_TOKEN) {
4535          if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {          if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
4536            unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {            unless ($self->{insertion_mode} == BEFORE_HEAD_IM) {
4537              !!!cp ('t88.2');              !!!cp ('t88.2');
4538              $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);              $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
4539                #
4540            } else {            } else {
4541              !!!cp ('t88.1');              !!!cp ('t88.1');
4542              ## Ignore the token.              ## Ignore the token.
4543              !!!next-token;              #
             next B;  
4544            }            }
4545            unless (length $token->{data}) {            unless (length $token->{data}) {
4546              !!!cp ('t88');              !!!cp ('t88');
4547              !!!next-token;              !!!next-token;
4548              next B;              next B;
4549            }            }
4550    ## TODO: set $token->{column} appropriately
4551          }          }
4552    
4553          if ($self->{insertion_mode} == BEFORE_HEAD_IM) {          if ($self->{insertion_mode} == BEFORE_HEAD_IM) {
# Line 4406  sub _tree_construction_main ($) { Line 4657  sub _tree_construction_main ($) {
4657                  !!!cp ('t101');                  !!!cp ('t101');
4658                }                }
4659                !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);                !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4660                pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.                pop @{$self->{open_elements}};
4661                pop @{$self->{open_elements}} # <head>                pop @{$self->{open_elements}} # <head>
4662                    if $self->{insertion_mode} == AFTER_HEAD_IM;                    if $self->{insertion_mode} == AFTER_HEAD_IM;
4663                !!!nack ('t101.1');                !!!nack ('t101.1');
4664                !!!next-token;                !!!next-token;
4665                next B;                next B;
4666              } elsif ($token->{tag_name} eq 'link') {          } elsif ($token->{tag_name} eq 'link') {
4667                ## NOTE: There is an "as if in head" code clone.            ## NOTE: There is an "as if in head" code clone.
4668                if ($self->{insertion_mode} == AFTER_HEAD_IM) {            if ($self->{insertion_mode} == AFTER_HEAD_IM) {
4669                  !!!cp ('t102');              !!!cp ('t102');
4670                  !!!parse-error (type => 'after head',              !!!parse-error (type => 'after head',
4671                                  text => $token->{tag_name}, token => $token);                              text => $token->{tag_name}, token => $token);
4672                  push @{$self->{open_elements}},              push @{$self->{open_elements}},
4673                      [$self->{head_element}, $el_category->{head}];                  [$self->{head_element}, $el_category->{head}];
4674                } else {            } else {
4675                  !!!cp ('t103');              !!!cp ('t103');
4676                }            }
4677                !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);            !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4678                pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.            pop @{$self->{open_elements}};
4679                pop @{$self->{open_elements}} # <head>            pop @{$self->{open_elements}} # <head>
4680                    if $self->{insertion_mode} == AFTER_HEAD_IM;                if $self->{insertion_mode} == AFTER_HEAD_IM;
4681                !!!ack ('t103.1');            !!!ack ('t103.1');
4682                !!!next-token;            !!!next-token;
4683                next B;            next B;
4684            } elsif ($token->{tag_name} eq 'command' or
4685                     $token->{tag_name} eq 'eventsource') {
4686              if ($self->{insertion_mode} == IN_HEAD_IM) {
4687                ## NOTE: If the insertion mode at the time of the emission
4688                ## of the token was "before head", $self->{insertion_mode}
4689                ## is already changed to |IN_HEAD_IM|.
4690    
4691                ## NOTE: There is an "as if in head" code clone.
4692                !!!insert-element ($token->{tag_name}, $token->{attributes}, $token);
4693                pop @{$self->{open_elements}};
4694                pop @{$self->{open_elements}} # <head>
4695                    if $self->{insertion_mode} == AFTER_HEAD_IM;
4696                !!!ack ('t103.2');
4697                !!!next-token;
4698                next B;
4699              } else {
4700                ## NOTE: "in head noscript" or "after head" insertion mode
4701                ## - in these cases, these tags are treated the same as
4702                ## normal in-body tags.
4703                !!!cp ('t103.3');
4704                #
4705              }
4706              } elsif ($token->{tag_name} eq 'meta') {              } elsif ($token->{tag_name} eq 'meta') {
4707                ## NOTE: There is an "as if in head" code clone.                ## NOTE: There is an "as if in head" code clone.
4708                if ($self->{insertion_mode} == AFTER_HEAD_IM) {                if ($self->{insertion_mode} == AFTER_HEAD_IM) {
# Line 4460  sub _tree_construction_main ($) { Line 4733  sub _tree_construction_main ($) {
4733                  } elsif ($token->{attributes}->{content}) {                  } elsif ($token->{attributes}->{content}) {
4734                    if ($token->{attributes}->{content}->{value}                    if ($token->{attributes}->{content}->{value}
4735                        =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]                        =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4736                            [\x09-\x0D\x20]*=                            [\x09\x0A\x0C\x0D\x20]*=
4737                            [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|                            [\x09\x0A\x0C\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4738                            ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {                            ([^"'\x09\x0A\x0C\x0D\x20]
4739                               [^\x09\x0A\x0C\x0D\x20\x3B]*))/x) {
4740                      !!!cp ('t107');                      !!!cp ('t107');
4741                      ## NOTE: Whether the encoding is supported or not is handled                      ## NOTE: Whether the encoding is supported or not is handled
4742                      ## in the {change_encoding} callback.                      ## in the {change_encoding} callback.
# Line 5271  sub _tree_construction_main ($) { Line 5545  sub _tree_construction_main ($) {
5545      } elsif ($self->{insertion_mode} & TABLE_IMS) {      } elsif ($self->{insertion_mode} & TABLE_IMS) {
5546        if ($token->{type} == CHARACTER_TOKEN) {        if ($token->{type} == CHARACTER_TOKEN) {
5547          if (not $open_tables->[-1]->[1] and # tainted          if (not $open_tables->[-1]->[1] and # tainted
5548              $token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {              $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
5549            $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);            $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
5550                                
5551            unless (length $token->{data}) {            unless (length $token->{data}) {
# Line 5955  sub _tree_construction_main ($) { Line 6229  sub _tree_construction_main ($) {
6229        }        }
6230      } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {      } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
6231            if ($token->{type} == CHARACTER_TOKEN) {            if ($token->{type} == CHARACTER_TOKEN) {
6232              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {              if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
6233                $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);                $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6234                unless (length $token->{data}) {                unless (length $token->{data}) {
6235                  !!!cp ('t260');                  !!!cp ('t260');
# Line 6296  sub _tree_construction_main ($) { Line 6570  sub _tree_construction_main ($) {
6570        }        }
6571      } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {      } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
6572        if ($token->{type} == CHARACTER_TOKEN) {        if ($token->{type} == CHARACTER_TOKEN) {
6573          if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {          if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
6574            my $data = $1;            my $data = $1;
6575            ## As if in body            ## As if in body
6576            $reconstruct_active_formatting_elements->($insert_to_current);            $reconstruct_active_formatting_elements->($insert_to_current);
# Line 6313  sub _tree_construction_main ($) { Line 6587  sub _tree_construction_main ($) {
6587          if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {          if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) {
6588            !!!cp ('t301');            !!!cp ('t301');
6589            !!!parse-error (type => 'after html:#text', token => $token);            !!!parse-error (type => 'after html:#text', token => $token);
6590              #
           ## Reprocess in the "after body" insertion mode.  
6591          } else {          } else {
6592            !!!cp ('t302');            !!!cp ('t302');
6593              ## "after body" insertion mode
6594              !!!parse-error (type => 'after body:#text', token => $token);
6595              #
6596          }          }
           
         ## "after body" insertion mode  
         !!!parse-error (type => 'after body:#text', token => $token);  
6597    
6598          $self->{insertion_mode} = IN_BODY_IM;          $self->{insertion_mode} = IN_BODY_IM;
6599          ## reprocess          ## reprocess
# Line 6330  sub _tree_construction_main ($) { Line 6603  sub _tree_construction_main ($) {
6603            !!!cp ('t303');            !!!cp ('t303');
6604            !!!parse-error (type => 'after html',            !!!parse-error (type => 'after html',
6605                            text => $token->{tag_name}, token => $token);                            text => $token->{tag_name}, token => $token);
6606                        #
           ## Reprocess in the "after body" insertion mode.  
6607          } else {          } else {
6608            !!!cp ('t304');            !!!cp ('t304');
6609              ## "after body" insertion mode
6610              !!!parse-error (type => 'after body',
6611                              text => $token->{tag_name}, token => $token);
6612              #
6613          }          }
6614    
         ## "after body" insertion mode  
         !!!parse-error (type => 'after body',  
                         text => $token->{tag_name}, token => $token);  
   
6615          $self->{insertion_mode} = IN_BODY_IM;          $self->{insertion_mode} = IN_BODY_IM;
6616          !!!ack-later;          !!!ack-later;
6617          ## reprocess          ## reprocess
# Line 6350  sub _tree_construction_main ($) { Line 6622  sub _tree_construction_main ($) {
6622            !!!parse-error (type => 'after html:/',            !!!parse-error (type => 'after html:/',
6623                            text => $token->{tag_name}, token => $token);                            text => $token->{tag_name}, token => $token);
6624                        
6625            $self->{insertion_mode} = AFTER_BODY_IM;            $self->{insertion_mode} = IN_BODY_IM;
6626            ## Reprocess in the "after body" insertion mode.            ## Reprocess.
6627              next B;
6628          } else {          } else {
6629            !!!cp ('t306');            !!!cp ('t306');
6630          }          }
# Line 6389  sub _tree_construction_main ($) { Line 6662  sub _tree_construction_main ($) {
6662        }        }
6663      } elsif ($self->{insertion_mode} & FRAME_IMS) {      } elsif ($self->{insertion_mode} & FRAME_IMS) {
6664        if ($token->{type} == CHARACTER_TOKEN) {        if ($token->{type} == CHARACTER_TOKEN) {
6665          if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {          if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
6666            $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);            $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
6667                        
6668            unless (length $token->{data}) {            unless (length $token->{data}) {
# Line 6399  sub _tree_construction_main ($) { Line 6672  sub _tree_construction_main ($) {
6672            }            }
6673          }          }
6674                    
6675          if ($token->{data} =~ s/^[^\x09\x0A\x0B\x0C\x20]+//) {          if ($token->{data} =~ s/^[^\x09\x0A\x0C\x20]+//) {
6676            if ($self->{insertion_mode} == IN_FRAMESET_IM) {            if ($self->{insertion_mode} == IN_FRAMESET_IM) {
6677              !!!cp ('t311');              !!!cp ('t311');
6678              !!!parse-error (type => 'in frameset:#text', token => $token);              !!!parse-error (type => 'in frameset:#text', token => $token);
# Line 6547  sub _tree_construction_main ($) { Line 6820  sub _tree_construction_main ($) {
6820          $parse_rcdata->(CDATA_CONTENT_MODEL);          $parse_rcdata->(CDATA_CONTENT_MODEL);
6821          next B;          next B;
6822        } elsif ({        } elsif ({
6823                  base => 1, link => 1,                  base => 1, command => 1, eventsource => 1, link => 1,
6824                 }->{$token->{tag_name}}) {                 }->{$token->{tag_name}}) {
6825          !!!cp ('t334');          !!!cp ('t334');
6826          ## NOTE: This is an "as if in head" code clone, only "-t" differs          ## NOTE: This is an "as if in head" code clone, only "-t" differs
6827          !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);          !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6828          pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.          pop @{$self->{open_elements}};
6829          !!!ack ('t334.1');          !!!ack ('t334.1');
6830          !!!next-token;          !!!next-token;
6831          next B;          next B;
6832        } elsif ($token->{tag_name} eq 'meta') {        } elsif ($token->{tag_name} eq 'meta') {
6833          ## NOTE: This is an "as if in head" code clone, only "-t" differs          ## NOTE: This is an "as if in head" code clone, only "-t" differs
6834          !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);          !!!insert-element-t ($token->{tag_name}, $token->{attributes}, $token);
6835          my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.          my $meta_el = pop @{$self->{open_elements}};
6836    
6837          unless ($self->{confident}) {          unless ($self->{confident}) {
6838            if ($token->{attributes}->{charset}) {            if ($token->{attributes}->{charset}) {
# Line 6576  sub _tree_construction_main ($) { Line 6849  sub _tree_construction_main ($) {
6849            } elsif ($token->{attributes}->{content}) {            } elsif ($token->{attributes}->{content}) {
6850              if ($token->{attributes}->{content}->{value}              if ($token->{attributes}->{content}->{value}
6851                  =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]                  =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
6852                      [\x09-\x0D\x20]*=                      [\x09\x0A\x0C\x0D\x20]*=
6853                      [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|                      [\x09\x0A\x0C\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
6854                      ([^"'\x09-\x0D\x20][^\x09-\x0D\x20\x3B]*))/x) {                      ([^"'\x09\x0A\x0C\x0D\x20][^\x09\x0A\x0C\x0D\x20\x3B]*))
6855                       /x) {
6856                !!!cp ('t336');                !!!cp ('t336');
6857                ## NOTE: Whether the encoding is supported or not is handled                ## NOTE: Whether the encoding is supported or not is handled
6858                ## in the {change_encoding} callback.                ## in the {change_encoding} callback.
# Line 6723  sub _tree_construction_main ($) { Line 6997  sub _tree_construction_main ($) {
6997              last INSCOPE;              last INSCOPE;
6998            }            }
6999          } # INSCOPE          } # INSCOPE
7000    
7001            ## NOTE: Special, Scope (<li><foo><li> == <li><foo><li/></foo></li>)
7002              ## Interpreted as <li><foo/></li><li/> (non-conforming)
7003              ## blockquote (O9.27), center (O), dd (Fx3, O, S3.1.2, IE7),
7004              ## dt (Fx, O, S, IE), dl (O), fieldset (O, S, IE), form (Fx, O, S),
7005              ## hn (O), pre (O), applet (O, S), button (O, S), marquee (Fx, O, S),
7006              ## object (Fx)
7007              ## Generate non-tree (non-conforming)
7008              ## basefont (IE7 (where basefont is non-void)), center (IE),
7009              ## form (IE), hn (IE)
7010            ## address, div, p (<li><foo><li> == <li><foo/></li><li/>)
7011              ## Interpreted as <li><foo><li/></foo></li> (non-conforming)
7012              ## div (Fx, S)
7013                        
7014          ## Step 1          ## Step 1
7015          my $i = -1;          my $i = -1;
# Line 7103  sub _tree_construction_main ($) { Line 7390  sub _tree_construction_main ($) {
7390            !!!nack ('t380.1');            !!!nack ('t380.1');
7391          } elsif ({          } elsif ({
7392                    b => 1, big => 1, em => 1, font => 1, i => 1,                    b => 1, big => 1, em => 1, font => 1, i => 1,
7393                    s => 1, small => 1, strile => 1,                    s => 1, small => 1, strike => 1,
7394                    strong => 1, tt => 1, u => 1,                    strong => 1, tt => 1, u => 1,
7395                   }->{$token->{tag_name}}) {                   }->{$token->{tag_name}}) {
7396            !!!cp ('t375');            !!!cp ('t375');
# Line 7239  sub _tree_construction_main ($) { Line 7526  sub _tree_construction_main ($) {
7526                    dd => ($token->{tag_name} ne 'dd'),                    dd => ($token->{tag_name} ne 'dd'),
7527                    dt => ($token->{tag_name} ne 'dt'),                    dt => ($token->{tag_name} ne 'dt'),
7528                    li => ($token->{tag_name} ne 'li'),                    li => ($token->{tag_name} ne 'li'),
7529                      option => 1,
7530                      optgroup => 1,
7531                    p => 1,                    p => 1,
7532                    rt => 1,                    rt => 1,
7533                    rp => 1,                    rp => 1,
# Line 7408  sub _tree_construction_main ($) { Line 7697  sub _tree_construction_main ($) {
7697        } elsif ({        } elsif ({
7698                  a => 1,                  a => 1,
7699                  b => 1, big => 1, em => 1, font => 1, i => 1,                  b => 1, big => 1, em => 1, font => 1, i => 1,
7700                  nobr => 1, s => 1, small => 1, strile => 1,                  nobr => 1, s => 1, small => 1, strike => 1,
7701                  strong => 1, tt => 1, u => 1,                  strong => 1, tt => 1, u => 1,
7702                 }->{$token->{tag_name}}) {                 }->{$token->{tag_name}}) {
7703          !!!cp ('t427');          !!!cp ('t427');
# Line 7499  sub _tree_construction_main ($) { Line 7788  sub _tree_construction_main ($) {
7788                ## Ignore the token                ## Ignore the token
7789                !!!next-token;                !!!next-token;
7790                last S2;                last S2;
             }  
7791    
7792                  ## NOTE: |<span><dd></span>a|: In Safari 3.1.2 and Opera
7793                  ## 9.27, "a" is a child of <dd> (conforming).  In
7794                  ## Firefox 3.0.2, "a" is a child of <body>.  In WinIE 7,
7795                  ## "a" is a child of both <body> and <dd>.
7796                }
7797                
7798              !!!cp ('t434');              !!!cp ('t434');
7799            }            }
7800                        
# Line 7541  sub _tree_construction_main ($) { Line 7835  sub _tree_construction_main ($) {
7835    ## TODO: script stuffs    ## TODO: script stuffs
7836  } # _tree_construct_main  } # _tree_construct_main
7837    
7838  sub set_inner_html ($$$;$) {  sub set_inner_html ($$$$;$) {
7839    my $class = shift;    my $class = shift;
7840    my $node = shift;    my $node = shift;
7841    my $s = \$_[0];    #my $s = \$_[0];
7842    my $onerror = $_[1];    my $onerror = $_[1];
7843    my $get_wrapper = $_[2] || sub ($) { return $_[0] };    my $get_wrapper = $_[2] || sub ($) { return $_[0] };
7844    
# Line 7565  sub set_inner_html ($$$;$) { Line 7859  sub set_inner_html ($$$;$) {
7859      }      }
7860    
7861      ## Step 3, 4, 5 # MUST      ## Step 3, 4, 5 # MUST
7862      $class->parse_char_string ($$s => $node, $onerror, $get_wrapper);      $class->parse_char_string ($_[0] => $node, $onerror, $get_wrapper);
7863    } elsif ($nt == 1) {    } elsif ($nt == 1) {
7864      ## TODO: If non-html element      ## TODO: If non-html element
7865    
# Line 7584  sub set_inner_html ($$$;$) { Line 7878  sub set_inner_html ($$$;$) {
7878      my $i = 0;      my $i = 0;
7879      $p->{line_prev} = $p->{line} = 1;      $p->{line_prev} = $p->{line} = 1;
7880      $p->{column_prev} = $p->{column} = 0;      $p->{column_prev} = $p->{column} = 0;
7881      $p->{set_next_char} = sub {      require Whatpm::Charset::DecodeHandle;
7882        my $input = Whatpm::Charset::DecodeHandle::CharString->new (\($_[0]));
7883        $input = $get_wrapper->($input);
7884        $p->{set_nc} = sub {
7885        my $self = shift;        my $self = shift;
7886    
7887        pop @{$self->{prev_char}};        my $char = '';
7888        unshift @{$self->{prev_char}}, $self->{next_char};        if (defined $self->{next_nc}) {
7889            $char = $self->{next_nc};
7890        $self->{next_char} = -1 and return if $i >= length $$s;          delete $self->{next_nc};
7891        $self->{next_char} = ord substr $$s, $i++, 1;          $self->{nc} = ord $char;
7892          } else {
7893            $self->{char_buffer} = '';
7894            $self->{char_buffer_pos} = 0;
7895            
7896            my $count = $input->manakai_read_until
7897                ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/,
7898                 $self->{char_buffer_pos});
7899            if ($count) {
7900              $self->{line_prev} = $self->{line};
7901              $self->{column_prev} = $self->{column};
7902              $self->{column}++;
7903              $self->{nc}
7904                  = ord substr ($self->{char_buffer},
7905                                $self->{char_buffer_pos}++, 1);
7906              return;
7907            }
7908            
7909            if ($input->read ($char, 1)) {
7910              $self->{nc} = ord $char;
7911            } else {
7912              $self->{nc} = -1;
7913              return;
7914            }
7915          }
7916    
7917        ($p->{line_prev}, $p->{column_prev}) = ($p->{line}, $p->{column});        ($p->{line_prev}, $p->{column_prev}) = ($p->{line}, $p->{column});
7918        $p->{column}++;        $p->{column}++;
7919    
7920        if ($self->{next_char} == 0x000A) { # LF        if ($self->{nc} == 0x000A) { # LF
7921          $p->{line}++;          $p->{line}++;
7922          $p->{column} = 0;          $p->{column} = 0;
7923          !!!cp ('i1');          !!!cp ('i1');
7924        } elsif ($self->{next_char} == 0x000D) { # CR        } elsif ($self->{nc} == 0x000D) { # CR
7925          $i++ if substr ($$s, $i, 1) eq "\x0A";  ## TODO: support for abort/streaming
7926          $self->{next_char} = 0x000A; # LF # MUST          my $next = '';
7927            if ($input->read ($next, 1) and $next ne "\x0A") {
7928              $self->{next_nc} = $next;
7929            }
7930            $self->{nc} = 0x000A; # LF # MUST
7931          $p->{line}++;          $p->{line}++;
7932          $p->{column} = 0;          $p->{column} = 0;
7933          !!!cp ('i2');          !!!cp ('i2');
7934        } elsif ($self->{next_char} > 0x10FFFF) {        } elsif ($self->{nc} == 0x0000) { # NULL
         $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST  
         !!!cp ('i3');  
       } elsif ($self->{next_char} == 0x0000) { # NULL  
7935          !!!cp ('i4');          !!!cp ('i4');
7936          !!!parse-error (type => 'NULL');          !!!parse-error (type => 'NULL');
7937          $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST          $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
       } elsif ($self->{next_char} <= 0x0008 or  
                (0x000E <= $self->{next_char} and  
                 $self->{next_char} <= 0x001F) or  
                (0x007F <= $self->{next_char} and  
                 $self->{next_char} <= 0x009F) or  
                (0xD800 <= $self->{next_char} and  
                 $self->{next_char} <= 0xDFFF) or  
                (0xFDD0 <= $self->{next_char} and  
                 $self->{next_char} <= 0xFDDF) or  
                {  
                 0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,  
                 0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,  
                 0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,  
                 0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,  
                 0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,  
                 0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,  
                 0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,  
                 0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,  
                 0x10FFFE => 1, 0x10FFFF => 1,  
                }->{$self->{next_char}}) {  
         !!!cp ('i4.1');  
         if ($self->{next_char} < 0x10000) {  
           !!!parse-error (type => 'control char',  
                           text => (sprintf 'U+%04X', $self->{next_char}));  
         } else {  
           !!!parse-error (type => 'control char',  
                           text => (sprintf 'U-%08X', $self->{next_char}));  
         }  
7938        }        }
7939      };      };
7940      $p->{prev_char} = [-1, -1, -1];  
7941      $p->{next_char} = -1;      $p->{read_until} = sub {
7942              #my ($scalar, $specials_range, $offset) = @_;
7943          return 0 if defined $p->{next_nc};
7944    
7945          my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
7946          my $offset = $_[2] || 0;
7947          
7948          if ($p->{char_buffer_pos} < length $p->{char_buffer}) {
7949            pos ($p->{char_buffer}) = $p->{char_buffer_pos};
7950            if ($p->{char_buffer} =~ /\G(?>$pattern)+/) {
7951              substr ($_[0], $offset)
7952                  = substr ($p->{char_buffer}, $-[0], $+[0] - $-[0]);
7953              my $count = $+[0] - $-[0];
7954              if ($count) {
7955                $p->{column} += $count;
7956                $p->{char_buffer_pos} += $count;
7957                $p->{line_prev} = $p->{line};
7958                $p->{column_prev} = $p->{column} - 1;
7959                $p->{nc} = -1;
7960              }
7961              return $count;
7962            } else {
7963              return 0;
7964            }
7965          } else {
7966            my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
7967            if ($count) {
7968              $p->{column} += $count;
7969              $p->{column_prev} += $count;
7970              $p->{nc} = -1;
7971            }
7972            return $count;
7973          }
7974        }; # $p->{read_until}
7975    
7976      my $ponerror = $onerror || sub {      my $ponerror = $onerror || sub {
7977        my (%opt) = @_;        my (%opt) = @_;
7978        my $line = $opt{line};        my $line = $opt{line};
# Line 7660  sub set_inner_html ($$$;$) { Line 7987  sub set_inner_html ($$$;$) {
7987        $ponerror->(line => $p->{line}, column => $p->{column}, @_);        $ponerror->(line => $p->{line}, column => $p->{column}, @_);
7988      };      };
7989            
7990        my $char_onerror = sub {
7991          my (undef, $type, %opt) = @_;
7992          $ponerror->(layer => 'encode',
7993                      line => $p->{line}, column => $p->{column} + 1,
7994                      %opt, type => $type);
7995        }; # $char_onerror
7996        $input->onerror ($char_onerror);
7997    
7998      $p->_initialize_tokenizer;      $p->_initialize_tokenizer;
7999      $p->_initialize_tree_constructor;      $p->_initialize_tree_constructor;
8000    

Legend:
Removed from v.1.166  
changed lines
  Added in v.1.194

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24