/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.56 by wakaba, Sat Aug 11 07:19:18 2007 UTC revision 1.70 by wakaba, Sat Mar 1 00:42:52 2008 UTC
# Line 1  Line 1 
1  package Whatpm::HTML;  package Whatpm::HTML;
2  use strict;  use strict;
3  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    use Error qw(:try);
5    
6  ## ISSUE:  ## ISSUE:
7  ## var doc = implementation.createDocument (null, null, null);  ## var doc = implementation.createDocument (null, null, null);
# Line 13  our $VERSION=do{my @r=(q$Revision$=~/\d+ Line 14  our $VERSION=do{my @r=(q$Revision$=~/\d+
14  ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?  ## "{U+FEFF}..." in UTF-16BE/UTF-16LE is three or four characters?
15  ## "{U+FEFF}..." in GB18030?  ## "{U+FEFF}..." in GB18030?
16    
17    ## TODO: Control characters and noncharacters are not allowed (HTML5 revision 1263)
18    ## TODO: 1252 parse error (revision 1264)
19    ## TODO: 8859-11 = 874 (revision 1271)
20    
21  my $permitted_slash_tag_name = {  my $permitted_slash_tag_name = {
22    base => 1,    base => 1,
23    link => 1,    link => 1,
# Line 84  my $formatting_category = { Line 89  my $formatting_category = {
89  };  };
90  # $phrasing_category: all other elements  # $phrasing_category: all other elements
91    
92    sub parse_byte_string ($$$$;$) {
        ## Decodes a byte string into characters and hands the result to
        ## |parse_char_string|, restarting the parse once if the
        ## |change_encoding| callback throws |Whatpm::HTML::RestartParser|.
        ##
        ## Arguments (after the invocant):
        ##   1. $charset - encoding label, or |undef| to request detection.
        ##   2. the bytes - either a scalar or a reference to a scalar.
        ##   3... - passed through unchanged to |parse_char_string|.
        ## Returns whatever |parse_char_string| returns.
        ##
        ## The invocant may be an object (used as is) or a class name
        ## (in which case |->new| is called on it).
93      my $self = ref $_[0] ? shift : shift->new;
94      my $charset = shift;
        ## The bytes are deliberately left in @_ (not shifted); only a
        ## reference to them is kept here.
95      my $bytes_s = ref $_[0] ? $_[0] : \($_[0]);
96      my $s;
97      
98      if (defined $charset) {
          ## Caller supplied the encoding: decode with it and mark the
          ## encoding as confident.
99        require Encode; ## TODO: decode(utf8) don't delete BOM
100        $s = \ (Encode::decode ($charset, $$bytes_s));
101        $self->{input_encoding} = lc $charset; ## TODO: normalize name
102        $self->{confident} = 1;
103      } else {
          ## No encoding supplied: sniff the first 1024 bytes and fall back
          ## to windows-1252 when detection yields a false value.  The
          ## result is only tentative (confident = 0).
104        ## TODO: Implement HTML5 detection algorithm
105        require Whatpm::Charset::UniversalCharDet;
106        $charset = Whatpm::Charset::UniversalCharDet->detect_byte_string
107            (substr ($$bytes_s, 0, 1024));
108        $charset ||= 'windows-1252';
          ## NOTE(review): |require Encode| is only executed in the branch
          ## above; this call presumably relies on Encode having been
          ## loaded elsewhere (e.g. by the detector module) - confirm.
109        $s = \ (Encode::decode ($charset, $$bytes_s));
          ## NOTE(review): unlike the branch above, the label is stored
          ## without |lc|, while |change_encoding| below compares it with a
          ## lowercased label - verify the case returned by the detector.
110        $self->{input_encoding} = $charset;
111        $self->{confident} = 0;
112      }
113    
        ## Callback that receives the parser object and a new encoding
        ## label (inner $self/$charset shadow the outer variables).
114      $self->{change_encoding} = sub {
115        my $self = shift;
116        my $charset = lc shift;
117        ## TODO: if $charset is supported
118        ## TODO: normalize charset name
119    
120        ## "Change the encoding" algorithm:
121    
122        ## Step 1    
123        if ($charset eq 'utf-16') { ## ISSUE: UTF-16BE -> UTF-8? UTF-16LE -> UTF-8?
124          $charset = 'utf-8';
125        }
126    
127        ## Step 2
          ## Same encoding as the one already in use: just become confident.
128        if (defined $self->{input_encoding} and
129            $self->{input_encoding} eq $charset) {
130          $self->{confident} = 1;
131          return;
132        }
133    
          ## Report the mismatch at level 'w' before restarting.
          ## NOTE(review): {input_encoding} may be undefined here (Step 2
          ## only guards the comparison), which would make this
          ## concatenation operate on undef - confirm whether that can occur.
134        !!!parse-error (type => 'charset label detected:'.$self->{input_encoding}.
135            ':'.$charset, level => 'w');
136    
137        ## Step 3
138        # if (can) {
139          ## change the encoding on the fly.
140          #$self->{confident} = 1;
141          #return;
142        # }
143    
144        ## Step 4
          ## Abort the current parse; the |catch| clause below re-decodes
          ## the bytes with the new label and parses again.
145        throw Whatpm::HTML::RestartParser (charset => $charset);
146      }; # $self->{change_encoding}
147    
        ## @_ still starts with the bytes argument; drop it so that @args
        ## holds only the pass-through arguments.
148      my @args = @_; shift @args; # $s
149      my $return;
150      try {
151        $return = $self->parse_char_string ($s, @args);  
152      } catch Whatpm::HTML::RestartParser with {
          ## Restart: decode the original bytes with the label carried by
          ## the exception and parse once more, now marked confident.
153        my $charset = shift->{charset};
154        $s = \ (Encode::decode ($charset, $$bytes_s));    
155        $self->{input_encoding} = $charset; ## TODO: normalize
156        $self->{confident} = 1;
157        $return = $self->parse_char_string ($s, @args);
158      };
159      return $return;
160    } # parse_byte_string
161    
162    *parse_char_string = \&parse_string;
163    
164  sub parse_string ($$$;$) {  sub parse_string ($$$;$) {
165    my $self = shift->new;    my $self = ref $_[0] ? shift : shift->new;
166    my $s = \$_[0];    my $s = ref $_[0] ? $_[0] : \($_[0]);
167    $self->{document} = $_[1];    $self->{document} = $_[1];
168      @{$self->{document}->child_nodes} = ();
169    
170    ## NOTE: |set_inner_html| copies most of this method's code    ## NOTE: |set_inner_html| copies most of this method's code
171    
172      $self->{confident} = 1 unless exists $self->{confident};
173      $self->{document}->input_encoding ($self->{input_encoding})
174          if defined $self->{input_encoding};
175    
176    my $i = 0;    my $i = 0;
177    my $line = 1;    my $line = 1;
178    my $column = 0;    my $column = 0;
# Line 147  sub new ($) { Line 229  sub new ($) {
229    $self->{parse_error} = sub {    $self->{parse_error} = sub {
230      #      #
231    };    };
232      $self->{change_encoding} = sub {
233        # if ($_[0] is a supported encoding) {
234        #   run "change the encoding" algorithm;
235        #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
236        # }
237      };
238      $self->{application_cache_selection} = sub {
239        #
240      };
241    return $self;    return $self;
242  } # new  } # new
243    
# Line 159  sub CDATA_CONTENT_MODEL () { CM_LIMITED_ Line 250  sub CDATA_CONTENT_MODEL () { CM_LIMITED_
250  sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }  sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
251  sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }  sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
252    
## Tokenizer state identifiers.  Each constant is a distinct small
## integer (0..32) that is stored in |$self->{state}| and compared
## numerically (|==|) in |_get_next_token|; they replace the string
## state names ('data', 'tag open', ...) used by the older revision.
253    sub DATA_STATE () { 0 }
254    sub ENTITY_DATA_STATE () { 1 }
255    sub TAG_OPEN_STATE () { 2 }
256    sub CLOSE_TAG_OPEN_STATE () { 3 }
257    sub TAG_NAME_STATE () { 4 }
258    sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
259    sub ATTRIBUTE_NAME_STATE () { 6 }
260    sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
261    sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
262    sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
263    sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
264    sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
265    sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
266    sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
267    sub COMMENT_START_STATE () { 14 }
268    sub COMMENT_START_DASH_STATE () { 15 }
269    sub COMMENT_STATE () { 16 }
270    sub COMMENT_END_STATE () { 17 }
271    sub COMMENT_END_DASH_STATE () { 18 }
272    sub BOGUS_COMMENT_STATE () { 19 }
273    sub DOCTYPE_STATE () { 20 }
274    sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
275    sub DOCTYPE_NAME_STATE () { 22 }
276    sub AFTER_DOCTYPE_NAME_STATE () { 23 }
277    sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
278    sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
279    sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
280    sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
281    sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
282    sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
283    sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
284    sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
285    sub BOGUS_DOCTYPE_STATE () { 32 }
286    
287  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 }
288  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
289  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
# Line 197  sub IN_COLUMN_GROUP_IM () { 0b10 } Line 322  sub IN_COLUMN_GROUP_IM () { 0b10 }
322    
323  sub _initialize_tokenizer ($) {  sub _initialize_tokenizer ($) {
324    my $self = shift;    my $self = shift;
325    $self->{state} = 'data'; # MUST    $self->{state} = DATA_STATE; # MUST
326    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
327    undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE    undef $self->{current_token}; # start tag, end tag, comment, or DOCTYPE
328    undef $self->{current_attribute};    undef $self->{current_attribute};
# Line 219  sub _initialize_tokenizer ($) { Line 344  sub _initialize_tokenizer ($) {
344  ##   ->{system_identifier} (DOCTYPE_TOKEN)  ##   ->{system_identifier} (DOCTYPE_TOKEN)
345  ##   ->{correct} == 1 or 0 (DOCTYPE_TOKEN)  ##   ->{correct} == 1 or 0 (DOCTYPE_TOKEN)
346  ##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
347    ##        ->{name}
348    ##        ->{value}
349    ##        ->{has_reference} == 1 or 0
350  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
351    
352  ## Emitted token MUST immediately be handled by the tree construction state.  ## Emitted token MUST immediately be handled by the tree construction state.
# Line 229  sub _initialize_tokenizer ($) { Line 357  sub _initialize_tokenizer ($) {
357  ## has completed loading.  If one has, then it MUST be executed  ## has completed loading.  If one has, then it MUST be executed
358  ## and removed from the list.  ## and removed from the list.
359    
360    ## NOTE: HTML5 "Writing HTML documents" section, applied to
361    ## documents and not to user agents and conformance checkers,
362    ## contains some requirements that are not detected by the
363    ## parsing algorithm:
364    ## - Some requirements on character encoding declarations. ## TODO
365    ## - "Elements MUST NOT contain content that their content model disallows."
366    ##   ... Some are parse error, some are not (will be reported by c.c.).
367    ## - Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## TODO
368    ## - Text (in elements, attributes, and comments) SHOULD NOT contain
369    ##   control characters other than space characters. ## TODO: (what is control character? C0, C1 and DEL?  Unicode control character?)
370    
371    ## TODO: HTML5 imposes on authors two SHOULD-level requirements that cannot
372    ## be detected by the HTML5 parsing algorithm:
373    ## - Text,
374    
375  sub _get_next_token ($) {  sub _get_next_token ($) {
376    my $self = shift;    my $self = shift;
377    if (@{$self->{token}}) {    if (@{$self->{token}}) {
# Line 236  sub _get_next_token ($) { Line 379  sub _get_next_token ($) {
379    }    }
380    
381    A: {    A: {
382      if ($self->{state} eq 'data') {      if ($self->{state} == DATA_STATE) {
383        if ($self->{next_input_character} == 0x0026) { # &        if ($self->{next_input_character} == 0x0026) { # &
384          if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA          if ($self->{content_model} & CM_ENTITY) { # PCDATA | RCDATA
385            $self->{state} = 'entity data';            $self->{state} = ENTITY_DATA_STATE;
386            !!!next-input-character;            !!!next-input-character;
387            redo A;            redo A;
388          } else {          } else {
# Line 261  sub _get_next_token ($) { Line 404  sub _get_next_token ($) {
404          if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA          if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
405              (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA              (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
406               not $self->{escape})) {               not $self->{escape})) {
407            $self->{state} = 'tag open';            $self->{state} = TAG_OPEN_STATE;
408            !!!next-input-character;            !!!next-input-character;
409            redo A;            redo A;
410          } else {          } else {
# Line 290  sub _get_next_token ($) { Line 433  sub _get_next_token ($) {
433        !!!emit ($token);        !!!emit ($token);
434    
435        redo A;        redo A;
436      } elsif ($self->{state} eq 'entity data') {      } elsif ($self->{state} == ENTITY_DATA_STATE) {
437        ## (cannot happen in CDATA state)        ## (cannot happen in CDATA state)
438                
439        my $token = $self->_tokenize_attempt_to_consume_an_entity (0);        my $token = $self->_tokenize_attempt_to_consume_an_entity (0);
440    
441        $self->{state} = 'data';        $self->{state} = DATA_STATE;
442        # next-input-character is already done        # next-input-character is already done
443    
444        unless (defined $token) {        unless (defined $token) {
# Line 305  sub _get_next_token ($) { Line 448  sub _get_next_token ($) {
448        }        }
449    
450        redo A;        redo A;
451      } elsif ($self->{state} eq 'tag open') {      } elsif ($self->{state} == TAG_OPEN_STATE) {
452        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453          if ($self->{next_input_character} == 0x002F) { # /          if ($self->{next_input_character} == 0x002F) { # /
454            !!!next-input-character;            !!!next-input-character;
455            $self->{state} = 'close tag open';            $self->{state} = CLOSE_TAG_OPEN_STATE;
456            redo A;            redo A;
457          } else {          } else {
458            ## reconsume            ## reconsume
459            $self->{state} = 'data';            $self->{state} = DATA_STATE;
460    
461            !!!emit ({type => CHARACTER_TOKEN, data => '<'});            !!!emit ({type => CHARACTER_TOKEN, data => '<'});
462    
# Line 321  sub _get_next_token ($) { Line 464  sub _get_next_token ($) {
464          }          }
465        } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA        } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
466          if ($self->{next_input_character} == 0x0021) { # !          if ($self->{next_input_character} == 0x0021) { # !
467            $self->{state} = 'markup declaration open';            $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
468            !!!next-input-character;            !!!next-input-character;
469            redo A;            redo A;
470          } elsif ($self->{next_input_character} == 0x002F) { # /          } elsif ($self->{next_input_character} == 0x002F) { # /
471            $self->{state} = 'close tag open';            $self->{state} = CLOSE_TAG_OPEN_STATE;
472            !!!next-input-character;            !!!next-input-character;
473            redo A;            redo A;
474          } elsif (0x0041 <= $self->{next_input_character} and          } elsif (0x0041 <= $self->{next_input_character} and
# Line 333  sub _get_next_token ($) { Line 476  sub _get_next_token ($) {
476            $self->{current_token}            $self->{current_token}
477              = {type => START_TAG_TOKEN,              = {type => START_TAG_TOKEN,
478                 tag_name => chr ($self->{next_input_character} + 0x0020)};                 tag_name => chr ($self->{next_input_character} + 0x0020)};
479            $self->{state} = 'tag name';            $self->{state} = TAG_NAME_STATE;
480            !!!next-input-character;            !!!next-input-character;
481            redo A;            redo A;
482          } elsif (0x0061 <= $self->{next_input_character} and          } elsif (0x0061 <= $self->{next_input_character} and
483                   $self->{next_input_character} <= 0x007A) { # a..z                   $self->{next_input_character} <= 0x007A) { # a..z
484            $self->{current_token} = {type => START_TAG_TOKEN,            $self->{current_token} = {type => START_TAG_TOKEN,
485                              tag_name => chr ($self->{next_input_character})};                              tag_name => chr ($self->{next_input_character})};
486            $self->{state} = 'tag name';            $self->{state} = TAG_NAME_STATE;
487            !!!next-input-character;            !!!next-input-character;
488            redo A;            redo A;
489          } elsif ($self->{next_input_character} == 0x003E) { # >          } elsif ($self->{next_input_character} == 0x003E) { # >
490            !!!parse-error (type => 'empty start tag');            !!!parse-error (type => 'empty start tag');
491            $self->{state} = 'data';            $self->{state} = DATA_STATE;
492            !!!next-input-character;            !!!next-input-character;
493    
494            !!!emit ({type => CHARACTER_TOKEN, data => '<>'});            !!!emit ({type => CHARACTER_TOKEN, data => '<>'});
# Line 353  sub _get_next_token ($) { Line 496  sub _get_next_token ($) {
496            redo A;            redo A;
497          } elsif ($self->{next_input_character} == 0x003F) { # ?          } elsif ($self->{next_input_character} == 0x003F) { # ?
498            !!!parse-error (type => 'pio');            !!!parse-error (type => 'pio');
499            $self->{state} = 'bogus comment';            $self->{state} = BOGUS_COMMENT_STATE;
500            ## $self->{next_input_character} is intentionally left as is            ## $self->{next_input_character} is intentionally left as is
501            redo A;            redo A;
502          } else {          } else {
503            !!!parse-error (type => 'bare stago');            !!!parse-error (type => 'bare stago');
504            $self->{state} = 'data';            $self->{state} = DATA_STATE;
505            ## reconsume            ## reconsume
506    
507            !!!emit ({type => CHARACTER_TOKEN, data => '<'});            !!!emit ({type => CHARACTER_TOKEN, data => '<'});
# Line 368  sub _get_next_token ($) { Line 511  sub _get_next_token ($) {
511        } else {        } else {
512          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
513        }        }
514      } elsif ($self->{state} eq 'close tag open') {      } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
515        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
516          if (defined $self->{last_emitted_start_tag_name}) {          if (defined $self->{last_emitted_start_tag_name}) {
517            ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>            ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
# Line 383  sub _get_next_token ($) { Line 526  sub _get_next_token ($) {
526              } else {              } else {
527                $self->{next_input_character} = shift @next_char; # reconsume                $self->{next_input_character} = shift @next_char; # reconsume
528                !!!back-next-input-character (@next_char);                !!!back-next-input-character (@next_char);
529                $self->{state} = 'data';                $self->{state} = DATA_STATE;
530    
531                !!!emit ({type => CHARACTER_TOKEN, data => '</'});                !!!emit ({type => CHARACTER_TOKEN, data => '</'});
532        
# Line 402  sub _get_next_token ($) { Line 545  sub _get_next_token ($) {
545                    $self->{next_input_character} == -1) {                    $self->{next_input_character} == -1) {
546              $self->{next_input_character} = shift @next_char; # reconsume              $self->{next_input_character} = shift @next_char; # reconsume
547              !!!back-next-input-character (@next_char);              !!!back-next-input-character (@next_char);
548              $self->{state} = 'data';              $self->{state} = DATA_STATE;
549              !!!emit ({type => CHARACTER_TOKEN, data => '</'});              !!!emit ({type => CHARACTER_TOKEN, data => '</'});
550              redo A;              redo A;
551            } else {            } else {
# Line 413  sub _get_next_token ($) { Line 556  sub _get_next_token ($) {
556          } else {          } else {
557            ## No start tag token has ever been emitted            ## No start tag token has ever been emitted
558            # next-input-character is already done            # next-input-character is already done
559            $self->{state} = 'data';            $self->{state} = DATA_STATE;
560            !!!emit ({type => CHARACTER_TOKEN, data => '</'});            !!!emit ({type => CHARACTER_TOKEN, data => '</'});
561            redo A;            redo A;
562          }          }
# Line 423  sub _get_next_token ($) { Line 566  sub _get_next_token ($) {
566            $self->{next_input_character} <= 0x005A) { # A..Z            $self->{next_input_character} <= 0x005A) { # A..Z
567          $self->{current_token} = {type => END_TAG_TOKEN,          $self->{current_token} = {type => END_TAG_TOKEN,
568                            tag_name => chr ($self->{next_input_character} + 0x0020)};                            tag_name => chr ($self->{next_input_character} + 0x0020)};
569          $self->{state} = 'tag name';          $self->{state} = TAG_NAME_STATE;
570          !!!next-input-character;          !!!next-input-character;
571          redo A;          redo A;
572        } elsif (0x0061 <= $self->{next_input_character} and        } elsif (0x0061 <= $self->{next_input_character} and
573                 $self->{next_input_character} <= 0x007A) { # a..z                 $self->{next_input_character} <= 0x007A) { # a..z
574          $self->{current_token} = {type => END_TAG_TOKEN,          $self->{current_token} = {type => END_TAG_TOKEN,
575                            tag_name => chr ($self->{next_input_character})};                            tag_name => chr ($self->{next_input_character})};
576          $self->{state} = 'tag name';          $self->{state} = TAG_NAME_STATE;
577          !!!next-input-character;          !!!next-input-character;
578          redo A;          redo A;
579        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
580          !!!parse-error (type => 'empty end tag');          !!!parse-error (type => 'empty end tag');
581          $self->{state} = 'data';          $self->{state} = DATA_STATE;
582          !!!next-input-character;          !!!next-input-character;
583          redo A;          redo A;
584        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
585          !!!parse-error (type => 'bare etago');          !!!parse-error (type => 'bare etago');
586          $self->{state} = 'data';          $self->{state} = DATA_STATE;
587          # reconsume          # reconsume
588    
589          !!!emit ({type => CHARACTER_TOKEN, data => '</'});          !!!emit ({type => CHARACTER_TOKEN, data => '</'});
# Line 448  sub _get_next_token ($) { Line 591  sub _get_next_token ($) {
591          redo A;          redo A;
592        } else {        } else {
593          !!!parse-error (type => 'bogus end tag');          !!!parse-error (type => 'bogus end tag');
594          $self->{state} = 'bogus comment';          $self->{state} = BOGUS_COMMENT_STATE;
595          ## $self->{next_input_character} is intentionally left as is          ## $self->{next_input_character} is intentionally left as is
596          redo A;          redo A;
597        }        }
598      } elsif ($self->{state} eq 'tag name') {      } elsif ($self->{state} == TAG_NAME_STATE) {
599        if ($self->{next_input_character} == 0x0009 or # HT        if ($self->{next_input_character} == 0x0009 or # HT
600            $self->{next_input_character} == 0x000A or # LF            $self->{next_input_character} == 0x000A or # LF
601            $self->{next_input_character} == 0x000B or # VT            $self->{next_input_character} == 0x000B or # VT
602            $self->{next_input_character} == 0x000C or # FF            $self->{next_input_character} == 0x000C or # FF
603            $self->{next_input_character} == 0x0020) { # SP            $self->{next_input_character} == 0x0020) { # SP
604          $self->{state} = 'before attribute name';          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
605          !!!next-input-character;          !!!next-input-character;
606          redo A;          redo A;
607        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
# Line 474  sub _get_next_token ($) { Line 617  sub _get_next_token ($) {
617          } else {          } else {
618            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{current_token}->{type}: Unknown token type";
619          }          }
620          $self->{state} = 'data';          $self->{state} = DATA_STATE;
621          !!!next-input-character;          !!!next-input-character;
622    
623          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{current_token}); # start tag or end tag
# Line 501  sub _get_next_token ($) { Line 644  sub _get_next_token ($) {
644          } else {          } else {
645            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{current_token}->{type}: Unknown token type";
646          }          }
647          $self->{state} = 'data';          $self->{state} = DATA_STATE;
648          # reconsume          # reconsume
649    
650          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{current_token}); # start tag or end tag
# Line 517  sub _get_next_token ($) { Line 660  sub _get_next_token ($) {
660          } else {          } else {
661            !!!parse-error (type => 'nestc');            !!!parse-error (type => 'nestc');
662          }          }
663          $self->{state} = 'before attribute name';          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
664          # next-input-character is already done          # next-input-character is already done
665          redo A;          redo A;
666        } else {        } else {
# Line 527  sub _get_next_token ($) { Line 670  sub _get_next_token ($) {
670          !!!next-input-character;          !!!next-input-character;
671          redo A;          redo A;
672        }        }
673      } elsif ($self->{state} eq 'before attribute name') {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
674        if ($self->{next_input_character} == 0x0009 or # HT        if ($self->{next_input_character} == 0x0009 or # HT
675            $self->{next_input_character} == 0x000A or # LF            $self->{next_input_character} == 0x000A or # LF
676            $self->{next_input_character} == 0x000B or # VT            $self->{next_input_character} == 0x000B or # VT
# Line 549  sub _get_next_token ($) { Line 692  sub _get_next_token ($) {
692          } else {          } else {
693            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{current_token}->{type}: Unknown token type";
694          }          }
695          $self->{state} = 'data';          $self->{state} = DATA_STATE;
696          !!!next-input-character;          !!!next-input-character;
697    
698          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{current_token}); # start tag or end tag
# Line 559  sub _get_next_token ($) { Line 702  sub _get_next_token ($) {
702                 $self->{next_input_character} <= 0x005A) { # A..Z                 $self->{next_input_character} <= 0x005A) { # A..Z
703          $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),          $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
704                                value => ''};                                value => ''};
705          $self->{state} = 'attribute name';          $self->{state} = ATTRIBUTE_NAME_STATE;
706          !!!next-input-character;          !!!next-input-character;
707          redo A;          redo A;
708        } elsif ($self->{next_input_character} == 0x002F) { # /        } elsif ($self->{next_input_character} == 0x002F) { # /
# Line 589  sub _get_next_token ($) { Line 732  sub _get_next_token ($) {
732          } else {          } else {
733            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{current_token}->{type}: Unknown token type";
734          }          }
735          $self->{state} = 'data';          $self->{state} = DATA_STATE;
736          # reconsume          # reconsume
737    
738          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{current_token}); # start tag or end tag
# Line 598  sub _get_next_token ($) { Line 741  sub _get_next_token ($) {
741        } else {        } else {
742          $self->{current_attribute} = {name => chr ($self->{next_input_character}),          $self->{current_attribute} = {name => chr ($self->{next_input_character}),
743                                value => ''};                                value => ''};
744          $self->{state} = 'attribute name';          $self->{state} = ATTRIBUTE_NAME_STATE;
745          !!!next-input-character;          !!!next-input-character;
746          redo A;          redo A;
747        }        }
748      } elsif ($self->{state} eq 'attribute name') {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
749        my $before_leave = sub {        my $before_leave = sub {
750          if (exists $self->{current_token}->{attributes} # start tag or end tag          if (exists $self->{current_token}->{attributes} # start tag or end tag
751              ->{$self->{current_attribute}->{name}}) { # MUST              ->{$self->{current_attribute}->{name}}) { # MUST
# Line 620  sub _get_next_token ($) { Line 763  sub _get_next_token ($) {
763            $self->{next_input_character} == 0x000C or # FF            $self->{next_input_character} == 0x000C or # FF
764            $self->{next_input_character} == 0x0020) { # SP            $self->{next_input_character} == 0x0020) { # SP
765          $before_leave->();          $before_leave->();
766          $self->{state} = 'after attribute name';          $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
767          !!!next-input-character;          !!!next-input-character;
768          redo A;          redo A;
769        } elsif ($self->{next_input_character} == 0x003D) { # =        } elsif ($self->{next_input_character} == 0x003D) { # =
770          $before_leave->();          $before_leave->();
771          $self->{state} = 'before attribute value';          $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
772          !!!next-input-character;          !!!next-input-character;
773          redo A;          redo A;
774        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
# Line 642  sub _get_next_token ($) { Line 785  sub _get_next_token ($) {
785          } else {          } else {
786            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{current_token}->{type}: Unknown token type";
787          }          }
788          $self->{state} = 'data';          $self->{state} = DATA_STATE;
789          !!!next-input-character;          !!!next-input-character;
790    
791          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{current_token}); # start tag or end tag
# Line 665  sub _get_next_token ($) { Line 808  sub _get_next_token ($) {
808          } else {          } else {
809            !!!parse-error (type => 'nestc');            !!!parse-error (type => 'nestc');
810          }          }
811          $self->{state} = 'before attribute name';          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
812          # next-input-character is already done          # next-input-character is already done
813          redo A;          redo A;
814        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
# Line 683  sub _get_next_token ($) { Line 826  sub _get_next_token ($) {
826          } else {          } else {
827            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{current_token}->{type}: Unknown token type";
828          }          }
829          $self->{state} = 'data';          $self->{state} = DATA_STATE;
830          # reconsume          # reconsume
831    
832          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{current_token}); # start tag or end tag
# Line 695  sub _get_next_token ($) { Line 838  sub _get_next_token ($) {
838          !!!next-input-character;          !!!next-input-character;
839          redo A;          redo A;
840        }        }
841      } elsif ($self->{state} eq 'after attribute name') {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
842        if ($self->{next_input_character} == 0x0009 or # HT        if ($self->{next_input_character} == 0x0009 or # HT
843            $self->{next_input_character} == 0x000A or # LF            $self->{next_input_character} == 0x000A or # LF
844            $self->{next_input_character} == 0x000B or # VT            $self->{next_input_character} == 0x000B or # VT
# Line 705  sub _get_next_token ($) { Line 848  sub _get_next_token ($) {
848          !!!next-input-character;          !!!next-input-character;
849          redo A;          redo A;
850        } elsif ($self->{next_input_character} == 0x003D) { # =        } elsif ($self->{next_input_character} == 0x003D) { # =
851          $self->{state} = 'before attribute value';          $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
852          !!!next-input-character;          !!!next-input-character;
853          redo A;          redo A;
854        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
# Line 721  sub _get_next_token ($) { Line 864  sub _get_next_token ($) {
864          } else {          } else {
865            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{current_token}->{type}: Unknown token type";
866          }          }
867          $self->{state} = 'data';          $self->{state} = DATA_STATE;
868          !!!next-input-character;          !!!next-input-character;
869    
870          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{current_token}); # start tag or end tag
# Line 731  sub _get_next_token ($) { Line 874  sub _get_next_token ($) {
874                 $self->{next_input_character} <= 0x005A) { # A..Z                 $self->{next_input_character} <= 0x005A) { # A..Z
875          $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),          $self->{current_attribute} = {name => chr ($self->{next_input_character} + 0x0020),
876                                value => ''};                                value => ''};
877          $self->{state} = 'attribute name';          $self->{state} = ATTRIBUTE_NAME_STATE;
878          !!!next-input-character;          !!!next-input-character;
879          redo A;          redo A;
880        } elsif ($self->{next_input_character} == 0x002F) { # /        } elsif ($self->{next_input_character} == 0x002F) { # /
# Line 745  sub _get_next_token ($) { Line 888  sub _get_next_token ($) {
888            !!!parse-error (type => 'nestc');            !!!parse-error (type => 'nestc');
889            ## TODO: Different error type for <aa / bb> than <aa/>            ## TODO: Different error type for <aa / bb> than <aa/>
890          }          }
891          $self->{state} = 'before attribute name';          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
892          # next-input-character is already done          # next-input-character is already done
893          redo A;          redo A;
894        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
# Line 762  sub _get_next_token ($) { Line 905  sub _get_next_token ($) {
905          } else {          } else {
906            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{current_token}->{type}: Unknown token type";
907          }          }
908          $self->{state} = 'data';          $self->{state} = DATA_STATE;
909          # reconsume          # reconsume
910    
911          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{current_token}); # start tag or end tag
# Line 771  sub _get_next_token ($) { Line 914  sub _get_next_token ($) {
914        } else {        } else {
915          $self->{current_attribute} = {name => chr ($self->{next_input_character}),          $self->{current_attribute} = {name => chr ($self->{next_input_character}),
916                                value => ''};                                value => ''};
917          $self->{state} = 'attribute name';          $self->{state} = ATTRIBUTE_NAME_STATE;
918          !!!next-input-character;          !!!next-input-character;
919          redo A;                  redo A;        
920        }        }
921      } elsif ($self->{state} eq 'before attribute value') {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
922        if ($self->{next_input_character} == 0x0009 or # HT        if ($self->{next_input_character} == 0x0009 or # HT
923            $self->{next_input_character} == 0x000A or # LF            $self->{next_input_character} == 0x000A or # LF
924            $self->{next_input_character} == 0x000B or # VT            $self->{next_input_character} == 0x000B or # VT
# Line 785  sub _get_next_token ($) { Line 928  sub _get_next_token ($) {
928          !!!next-input-character;          !!!next-input-character;
929          redo A;          redo A;
930        } elsif ($self->{next_input_character} == 0x0022) { # "        } elsif ($self->{next_input_character} == 0x0022) { # "
931          $self->{state} = 'attribute value (double-quoted)';          $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
932          !!!next-input-character;          !!!next-input-character;
933          redo A;          redo A;
934        } elsif ($self->{next_input_character} == 0x0026) { # &        } elsif ($self->{next_input_character} == 0x0026) { # &
935          $self->{state} = 'attribute value (unquoted)';          $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
936          ## reconsume          ## reconsume
937          redo A;          redo A;
938        } elsif ($self->{next_input_character} == 0x0027) { # '        } elsif ($self->{next_input_character} == 0x0027) { # '
939          $self->{state} = 'attribute value (single-quoted)';          $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
940          !!!next-input-character;          !!!next-input-character;
941          redo A;          redo A;
942        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
# Line 809  sub _get_next_token ($) { Line 952  sub _get_next_token ($) {
952          } else {          } else {
953            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{current_token}->{type}: Unknown token type";
954          }          }
955          $self->{state} = 'data';          $self->{state} = DATA_STATE;
956          !!!next-input-character;          !!!next-input-character;
957    
958          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{current_token}); # start tag or end tag
# Line 829  sub _get_next_token ($) { Line 972  sub _get_next_token ($) {
972          } else {          } else {
973            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{current_token}->{type}: Unknown token type";
974          }          }
975          $self->{state} = 'data';          $self->{state} = DATA_STATE;
976          ## reconsume          ## reconsume
977    
978          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{current_token}); # start tag or end tag
# Line 837  sub _get_next_token ($) { Line 980  sub _get_next_token ($) {
980          redo A;          redo A;
981        } else {        } else {
982          $self->{current_attribute}->{value} .= chr ($self->{next_input_character});          $self->{current_attribute}->{value} .= chr ($self->{next_input_character});
983          $self->{state} = 'attribute value (unquoted)';          $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
984          !!!next-input-character;          !!!next-input-character;
985          redo A;          redo A;
986        }        }
987      } elsif ($self->{state} eq 'attribute value (double-quoted)') {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
988        if ($self->{next_input_character} == 0x0022) { # "        if ($self->{next_input_character} == 0x0022) { # "
989          $self->{state} = 'before attribute name';          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
990          !!!next-input-character;          !!!next-input-character;
991          redo A;          redo A;
992        } elsif ($self->{next_input_character} == 0x0026) { # &        } elsif ($self->{next_input_character} == 0x0026) { # &
993          $self->{last_attribute_value_state} = 'attribute value (double-quoted)';          $self->{last_attribute_value_state} = $self->{state};
994          $self->{state} = 'entity in attribute value';          $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
995          !!!next-input-character;          !!!next-input-character;
996          redo A;          redo A;
997        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
# Line 865  sub _get_next_token ($) { Line 1008  sub _get_next_token ($) {
1008          } else {          } else {
1009            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{current_token}->{type}: Unknown token type";
1010          }          }
1011          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1012          ## reconsume          ## reconsume
1013    
1014          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{current_token}); # start tag or end tag
# Line 877  sub _get_next_token ($) { Line 1020  sub _get_next_token ($) {
1020          !!!next-input-character;          !!!next-input-character;
1021          redo A;          redo A;
1022        }        }
1023      } elsif ($self->{state} eq 'attribute value (single-quoted)') {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1024        if ($self->{next_input_character} == 0x0027) { # '        if ($self->{next_input_character} == 0x0027) { # '
1025          $self->{state} = 'before attribute name';          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1026          !!!next-input-character;          !!!next-input-character;
1027          redo A;          redo A;
1028        } elsif ($self->{next_input_character} == 0x0026) { # &        } elsif ($self->{next_input_character} == 0x0026) { # &
1029          $self->{last_attribute_value_state} = 'attribute value (single-quoted)';          $self->{last_attribute_value_state} = $self->{state};
1030          $self->{state} = 'entity in attribute value';          $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1031          !!!next-input-character;          !!!next-input-character;
1032          redo A;          redo A;
1033        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
# Line 901  sub _get_next_token ($) { Line 1044  sub _get_next_token ($) {
1044          } else {          } else {
1045            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{current_token}->{type}: Unknown token type";
1046          }          }
1047          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1048          ## reconsume          ## reconsume
1049    
1050          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{current_token}); # start tag or end tag
# Line 913  sub _get_next_token ($) { Line 1056  sub _get_next_token ($) {
1056          !!!next-input-character;          !!!next-input-character;
1057          redo A;          redo A;
1058        }        }
1059      } elsif ($self->{state} eq 'attribute value (unquoted)') {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1060        if ($self->{next_input_character} == 0x0009 or # HT        if ($self->{next_input_character} == 0x0009 or # HT
1061            $self->{next_input_character} == 0x000A or # LF            $self->{next_input_character} == 0x000A or # LF
1062            $self->{next_input_character} == 0x000B or # VT            $self->{next_input_character} == 0x000B or # VT
1063            $self->{next_input_character} == 0x000C or # FF            $self->{next_input_character} == 0x000C or # FF
1064            $self->{next_input_character} == 0x0020) { # SP            $self->{next_input_character} == 0x0020) { # SP
1065          $self->{state} = 'before attribute name';          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1066          !!!next-input-character;          !!!next-input-character;
1067          redo A;          redo A;
1068        } elsif ($self->{next_input_character} == 0x0026) { # &        } elsif ($self->{next_input_character} == 0x0026) { # &
1069          $self->{last_attribute_value_state} = 'attribute value (unquoted)';          $self->{last_attribute_value_state} = $self->{state};
1070          $self->{state} = 'entity in attribute value';          $self->{state} = ENTITY_IN_ATTRIBUTE_VALUE_STATE;
1071          !!!next-input-character;          !!!next-input-character;
1072          redo A;          redo A;
1073        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
# Line 940  sub _get_next_token ($) { Line 1083  sub _get_next_token ($) {
1083          } else {          } else {
1084            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{current_token}->{type}: Unknown token type";
1085          }          }
1086          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1087          !!!next-input-character;          !!!next-input-character;
1088    
1089          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{current_token}); # start tag or end tag
# Line 960  sub _get_next_token ($) { Line 1103  sub _get_next_token ($) {
1103          } else {          } else {
1104            die "$0: $self->{current_token}->{type}: Unknown token type";            die "$0: $self->{current_token}->{type}: Unknown token type";
1105          }          }
1106          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1107          ## reconsume          ## reconsume
1108    
1109          !!!emit ($self->{current_token}); # start tag or end tag          !!!emit ($self->{current_token}); # start tag or end tag
# Line 972  sub _get_next_token ($) { Line 1115  sub _get_next_token ($) {
1115          !!!next-input-character;          !!!next-input-character;
1116          redo A;          redo A;
1117        }        }
1118      } elsif ($self->{state} eq 'entity in attribute value') {      } elsif ($self->{state} == ENTITY_IN_ATTRIBUTE_VALUE_STATE) {
1119        my $token = $self->_tokenize_attempt_to_consume_an_entity (1);        my $token = $self->_tokenize_attempt_to_consume_an_entity (1);
1120    
1121        unless (defined $token) {        unless (defined $token) {
1122          $self->{current_attribute}->{value} .= '&';          $self->{current_attribute}->{value} .= '&';
1123        } else {        } else {
1124          $self->{current_attribute}->{value} .= $token->{data};          $self->{current_attribute}->{value} .= $token->{data};
1125            $self->{current_attribute}->{has_reference} = $token->{has_reference};
1126          ## ISSUE: spec says "append the returned character token to the current attribute's value"          ## ISSUE: spec says "append the returned character token to the current attribute's value"
1127        }        }
1128    
1129        $self->{state} = $self->{last_attribute_value_state};        $self->{state} = $self->{last_attribute_value_state};
1130        # next-input-character is already done        # next-input-character is already done
1131        redo A;        redo A;
1132      } elsif ($self->{state} eq 'bogus comment') {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1133        ## (only happens in the PCDATA state)        ## (only happens in the PCDATA state)
1134                
1135        my $token = {type => COMMENT_TOKEN, data => ''};        my $token = {type => COMMENT_TOKEN, data => ''};
1136    
1137        BC: {        BC: {
1138          if ($self->{next_input_character} == 0x003E) { # >          if ($self->{next_input_character} == 0x003E) { # >
1139            $self->{state} = 'data';            $self->{state} = DATA_STATE;
1140            !!!next-input-character;            !!!next-input-character;
1141    
1142            !!!emit ($token);            !!!emit ($token);
1143    
1144            redo A;            redo A;
1145          } elsif ($self->{next_input_character} == -1) {          } elsif ($self->{next_input_character} == -1) {
1146            $self->{state} = 'data';            $self->{state} = DATA_STATE;
1147            ## reconsume            ## reconsume
1148    
1149            !!!emit ($token);            !!!emit ($token);
# Line 1011  sub _get_next_token ($) { Line 1155  sub _get_next_token ($) {
1155            redo BC;            redo BC;
1156          }          }
1157        } # BC        } # BC
1158      } elsif ($self->{state} eq 'markup declaration open') {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1159        ## (only happens in the PCDATA state)        ## (only happens in the PCDATA state)
1160    
1161        my @next_char;        my @next_char;
# Line 1022  sub _get_next_token ($) { Line 1166  sub _get_next_token ($) {
1166          push @next_char, $self->{next_input_character};          push @next_char, $self->{next_input_character};
1167          if ($self->{next_input_character} == 0x002D) { # -          if ($self->{next_input_character} == 0x002D) { # -
1168            $self->{current_token} = {type => COMMENT_TOKEN, data => ''};            $self->{current_token} = {type => COMMENT_TOKEN, data => ''};
1169            $self->{state} = 'comment start';            $self->{state} = COMMENT_START_STATE;
1170            !!!next-input-character;            !!!next-input-character;
1171            redo A;            redo A;
1172          }          }
# Line 1053  sub _get_next_token ($) { Line 1197  sub _get_next_token ($) {
1197                    if ($self->{next_input_character} == 0x0045 or # E                    if ($self->{next_input_character} == 0x0045 or # E
1198                        $self->{next_input_character} == 0x0065) { # e                        $self->{next_input_character} == 0x0065) { # e
1199                      ## ISSUE: What a stupid code this is!                      ## ISSUE: What a stupid code this is!
1200                      $self->{state} = 'DOCTYPE';                      $self->{state} = DOCTYPE_STATE;
1201                      !!!next-input-character;                      !!!next-input-character;
1202                      redo A;                      redo A;
1203                    }                    }
# Line 1067  sub _get_next_token ($) { Line 1211  sub _get_next_token ($) {
1211        !!!parse-error (type => 'bogus comment');        !!!parse-error (type => 'bogus comment');
1212        $self->{next_input_character} = shift @next_char;        $self->{next_input_character} = shift @next_char;
1213        !!!back-next-input-character (@next_char);        !!!back-next-input-character (@next_char);
1214        $self->{state} = 'bogus comment';        $self->{state} = BOGUS_COMMENT_STATE;
1215        redo A;        redo A;
1216                
1217        ## ISSUE: typos in spec: chacacters, is is a parse error        ## ISSUE: typos in spec: chacacters, is is a parse error
1218        ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?        ## ISSUE: spec is somewhat unclear on "is the first character that will be in the comment"; what is "that will be in the comment" is what the algorithm defines, isn't it?
1219      } elsif ($self->{state} eq 'comment start') {      } elsif ($self->{state} == COMMENT_START_STATE) {
1220        if ($self->{next_input_character} == 0x002D) { # -        if ($self->{next_input_character} == 0x002D) { # -
1221          $self->{state} = 'comment start dash';          $self->{state} = COMMENT_START_DASH_STATE;
1222          !!!next-input-character;          !!!next-input-character;
1223          redo A;          redo A;
1224        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
1225          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1226          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1227          !!!next-input-character;          !!!next-input-character;
1228    
1229          !!!emit ($self->{current_token}); # comment          !!!emit ($self->{current_token}); # comment
# Line 1087  sub _get_next_token ($) { Line 1231  sub _get_next_token ($) {
1231          redo A;          redo A;
1232        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1233          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1234          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1235          ## reconsume          ## reconsume
1236    
1237          !!!emit ($self->{current_token}); # comment          !!!emit ($self->{current_token}); # comment
# Line 1096  sub _get_next_token ($) { Line 1240  sub _get_next_token ($) {
1240        } else {        } else {
1241          $self->{current_token}->{data} # comment          $self->{current_token}->{data} # comment
1242              .= chr ($self->{next_input_character});              .= chr ($self->{next_input_character});
1243          $self->{state} = 'comment';          $self->{state} = COMMENT_STATE;
1244          !!!next-input-character;          !!!next-input-character;
1245          redo A;          redo A;
1246        }        }
1247      } elsif ($self->{state} eq 'comment start dash') {      } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1248        if ($self->{next_input_character} == 0x002D) { # -        if ($self->{next_input_character} == 0x002D) { # -
1249          $self->{state} = 'comment end';          $self->{state} = COMMENT_END_STATE;
1250          !!!next-input-character;          !!!next-input-character;
1251          redo A;          redo A;
1252        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
1253          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1254          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1255          !!!next-input-character;          !!!next-input-character;
1256    
1257          !!!emit ($self->{current_token}); # comment          !!!emit ($self->{current_token}); # comment
# Line 1115  sub _get_next_token ($) { Line 1259  sub _get_next_token ($) {
1259          redo A;          redo A;
1260        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1261          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1262          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1263          ## reconsume          ## reconsume
1264    
1265          !!!emit ($self->{current_token}); # comment          !!!emit ($self->{current_token}); # comment
# Line 1124  sub _get_next_token ($) { Line 1268  sub _get_next_token ($) {
1268        } else {        } else {
1269          $self->{current_token}->{data} # comment          $self->{current_token}->{data} # comment
1270              .= '-' . chr ($self->{next_input_character});              .= '-' . chr ($self->{next_input_character});
1271          $self->{state} = 'comment';          $self->{state} = COMMENT_STATE;
1272          !!!next-input-character;          !!!next-input-character;
1273          redo A;          redo A;
1274        }        }
1275      } elsif ($self->{state} eq 'comment') {      } elsif ($self->{state} == COMMENT_STATE) {
1276        if ($self->{next_input_character} == 0x002D) { # -        if ($self->{next_input_character} == 0x002D) { # -
1277          $self->{state} = 'comment end dash';          $self->{state} = COMMENT_END_DASH_STATE;
1278          !!!next-input-character;          !!!next-input-character;
1279          redo A;          redo A;
1280        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1281          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1282          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1283          ## reconsume          ## reconsume
1284    
1285          !!!emit ($self->{current_token}); # comment          !!!emit ($self->{current_token}); # comment
# Line 1147  sub _get_next_token ($) { Line 1291  sub _get_next_token ($) {
1291          !!!next-input-character;          !!!next-input-character;
1292          redo A;          redo A;
1293        }        }
1294      } elsif ($self->{state} eq 'comment end dash') {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1295        if ($self->{next_input_character} == 0x002D) { # -        if ($self->{next_input_character} == 0x002D) { # -
1296          $self->{state} = 'comment end';          $self->{state} = COMMENT_END_STATE;
1297          !!!next-input-character;          !!!next-input-character;
1298          redo A;          redo A;
1299        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1300          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1301          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1302          ## reconsume          ## reconsume
1303    
1304          !!!emit ($self->{current_token}); # comment          !!!emit ($self->{current_token}); # comment
# Line 1162  sub _get_next_token ($) { Line 1306  sub _get_next_token ($) {
1306          redo A;          redo A;
1307        } else {        } else {
1308          $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment          $self->{current_token}->{data} .= '-' . chr ($self->{next_input_character}); # comment
1309          $self->{state} = 'comment';          $self->{state} = COMMENT_STATE;
1310          !!!next-input-character;          !!!next-input-character;
1311          redo A;          redo A;
1312        }        }
1313      } elsif ($self->{state} eq 'comment end') {      } elsif ($self->{state} == COMMENT_END_STATE) {
1314        if ($self->{next_input_character} == 0x003E) { # >        if ($self->{next_input_character} == 0x003E) { # >
1315          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1316          !!!next-input-character;          !!!next-input-character;
1317    
1318          !!!emit ($self->{current_token}); # comment          !!!emit ($self->{current_token}); # comment
# Line 1182  sub _get_next_token ($) { Line 1326  sub _get_next_token ($) {
1326          redo A;          redo A;
1327        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1328          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1329          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1330          ## reconsume          ## reconsume
1331    
1332          !!!emit ($self->{current_token}); # comment          !!!emit ($self->{current_token}); # comment
# Line 1191  sub _get_next_token ($) { Line 1335  sub _get_next_token ($) {
1335        } else {        } else {
1336          !!!parse-error (type => 'dash in comment');          !!!parse-error (type => 'dash in comment');
1337          $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment          $self->{current_token}->{data} .= '--' . chr ($self->{next_input_character}); # comment
1338          $self->{state} = 'comment';          $self->{state} = COMMENT_STATE;
1339          !!!next-input-character;          !!!next-input-character;
1340          redo A;          redo A;
1341        }        }
1342      } elsif ($self->{state} eq 'DOCTYPE') {      } elsif ($self->{state} == DOCTYPE_STATE) {
1343        if ($self->{next_input_character} == 0x0009 or # HT        if ($self->{next_input_character} == 0x0009 or # HT
1344            $self->{next_input_character} == 0x000A or # LF            $self->{next_input_character} == 0x000A or # LF
1345            $self->{next_input_character} == 0x000B or # VT            $self->{next_input_character} == 0x000B or # VT
1346            $self->{next_input_character} == 0x000C or # FF            $self->{next_input_character} == 0x000C or # FF
1347            $self->{next_input_character} == 0x0020) { # SP            $self->{next_input_character} == 0x0020) { # SP
1348          $self->{state} = 'before DOCTYPE name';          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1349          !!!next-input-character;          !!!next-input-character;
1350          redo A;          redo A;
1351        } else {        } else {
1352          !!!parse-error (type => 'no space before DOCTYPE name');          !!!parse-error (type => 'no space before DOCTYPE name');
1353          $self->{state} = 'before DOCTYPE name';          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1354          ## reconsume          ## reconsume
1355          redo A;          redo A;
1356        }        }
1357      } elsif ($self->{state} eq 'before DOCTYPE name') {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1358        if ($self->{next_input_character} == 0x0009 or # HT        if ($self->{next_input_character} == 0x0009 or # HT
1359            $self->{next_input_character} == 0x000A or # LF            $self->{next_input_character} == 0x000A or # LF
1360            $self->{next_input_character} == 0x000B or # VT            $self->{next_input_character} == 0x000B or # VT
# Line 1221  sub _get_next_token ($) { Line 1365  sub _get_next_token ($) {
1365          redo A;          redo A;
1366        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
1367          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
1368          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1369          !!!next-input-character;          !!!next-input-character;
1370    
1371          !!!emit ({type => DOCTYPE_TOKEN}); # incorrect          !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
# Line 1229  sub _get_next_token ($) { Line 1373  sub _get_next_token ($) {
1373          redo A;          redo A;
1374        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1375          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
1376          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1377          ## reconsume          ## reconsume
1378    
1379          !!!emit ({type => DOCTYPE_TOKEN}); # incorrect          !!!emit ({type => DOCTYPE_TOKEN}); # incorrect
# Line 1241  sub _get_next_token ($) { Line 1385  sub _get_next_token ($) {
1385                 name => chr ($self->{next_input_character}),                 name => chr ($self->{next_input_character}),
1386                 correct => 1};                 correct => 1};
1387  ## ISSUE: "Set the token's name name to the" in the spec  ## ISSUE: "Set the token's name name to the" in the spec
1388          $self->{state} = 'DOCTYPE name';          $self->{state} = DOCTYPE_NAME_STATE;
1389          !!!next-input-character;          !!!next-input-character;
1390          redo A;          redo A;
1391        }        }
1392      } elsif ($self->{state} eq 'DOCTYPE name') {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1393  ## ISSUE: Redundant "First," in the spec.  ## ISSUE: Redundant "First," in the spec.
1394        if ($self->{next_input_character} == 0x0009 or # HT        if ($self->{next_input_character} == 0x0009 or # HT
1395            $self->{next_input_character} == 0x000A or # LF            $self->{next_input_character} == 0x000A or # LF
1396            $self->{next_input_character} == 0x000B or # VT            $self->{next_input_character} == 0x000B or # VT
1397            $self->{next_input_character} == 0x000C or # FF            $self->{next_input_character} == 0x000C or # FF
1398            $self->{next_input_character} == 0x0020) { # SP            $self->{next_input_character} == 0x0020) { # SP
1399          $self->{state} = 'after DOCTYPE name';          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1400          !!!next-input-character;          !!!next-input-character;
1401          redo A;          redo A;
1402        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
1403          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1404          !!!next-input-character;          !!!next-input-character;
1405    
1406          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{current_token}); # DOCTYPE
# Line 1264  sub _get_next_token ($) { Line 1408  sub _get_next_token ($) {
1408          redo A;          redo A;
1409        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1410          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1411          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1412          ## reconsume          ## reconsume
1413    
1414          delete $self->{current_token}->{correct};          delete $self->{current_token}->{correct};
# Line 1278  sub _get_next_token ($) { Line 1422  sub _get_next_token ($) {
1422          !!!next-input-character;          !!!next-input-character;
1423          redo A;          redo A;
1424        }        }
1425      } elsif ($self->{state} eq 'after DOCTYPE name') {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1426        if ($self->{next_input_character} == 0x0009 or # HT        if ($self->{next_input_character} == 0x0009 or # HT
1427            $self->{next_input_character} == 0x000A or # LF            $self->{next_input_character} == 0x000A or # LF
1428            $self->{next_input_character} == 0x000B or # VT            $self->{next_input_character} == 0x000B or # VT
# Line 1288  sub _get_next_token ($) { Line 1432  sub _get_next_token ($) {
1432          !!!next-input-character;          !!!next-input-character;
1433          redo A;          redo A;
1434        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
1435          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1436          !!!next-input-character;          !!!next-input-character;
1437    
1438          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{current_token}); # DOCTYPE
# Line 1296  sub _get_next_token ($) { Line 1440  sub _get_next_token ($) {
1440          redo A;          redo A;
1441        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1442          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1443          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1444          ## reconsume          ## reconsume
1445    
1446          delete $self->{current_token}->{correct};          delete $self->{current_token}->{correct};
# Line 1320  sub _get_next_token ($) { Line 1464  sub _get_next_token ($) {
1464                  !!!next-input-character;                  !!!next-input-character;
1465                  if ($self->{next_input_character} == 0x0043 or # C                  if ($self->{next_input_character} == 0x0043 or # C
1466                      $self->{next_input_character} == 0x0063) { # c                      $self->{next_input_character} == 0x0063) { # c
1467                    $self->{state} = 'before DOCTYPE public identifier';                    $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1468                    !!!next-input-character;                    !!!next-input-character;
1469                    redo A;                    redo A;
1470                  }                  }
# Line 1347  sub _get_next_token ($) { Line 1491  sub _get_next_token ($) {
1491                  !!!next-input-character;                  !!!next-input-character;
1492                  if ($self->{next_input_character} == 0x004D or # M                  if ($self->{next_input_character} == 0x004D or # M
1493                      $self->{next_input_character} == 0x006D) { # m                      $self->{next_input_character} == 0x006D) { # m
1494                    $self->{state} = 'before DOCTYPE system identifier';                    $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1495                    !!!next-input-character;                    !!!next-input-character;
1496                    redo A;                    redo A;
1497                  }                  }
# Line 1363  sub _get_next_token ($) { Line 1507  sub _get_next_token ($) {
1507        }        }
1508    
1509        !!!parse-error (type => 'string after DOCTYPE name');        !!!parse-error (type => 'string after DOCTYPE name');
1510        $self->{state} = 'bogus DOCTYPE';        $self->{state} = BOGUS_DOCTYPE_STATE;
1511        # next-input-character is already done        # next-input-character is already done
1512        redo A;        redo A;
1513      } elsif ($self->{state} eq 'before DOCTYPE public identifier') {      } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1514        if ({        if ({
1515              0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,              0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1516              #0x000D => 1, # HT, LF, VT, FF, SP, CR              #0x000D => 1, # HT, LF, VT, FF, SP, CR
# Line 1376  sub _get_next_token ($) { Line 1520  sub _get_next_token ($) {
1520          redo A;          redo A;
1521        } elsif ($self->{next_input_character} eq 0x0022) { # "        } elsif ($self->{next_input_character} eq 0x0022) { # "
1522          $self->{current_token}->{public_identifier} = ''; # DOCTYPE          $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1523          $self->{state} = 'DOCTYPE public identifier (double-quoted)';          $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
1524          !!!next-input-character;          !!!next-input-character;
1525          redo A;          redo A;
1526        } elsif ($self->{next_input_character} eq 0x0027) { # '        } elsif ($self->{next_input_character} eq 0x0027) { # '
1527          $self->{current_token}->{public_identifier} = ''; # DOCTYPE          $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1528          $self->{state} = 'DOCTYPE public identifier (single-quoted)';          $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
1529          !!!next-input-character;          !!!next-input-character;
1530          redo A;          redo A;
1531        } elsif ($self->{next_input_character} eq 0x003E) { # >        } elsif ($self->{next_input_character} eq 0x003E) { # >
1532          !!!parse-error (type => 'no PUBLIC literal');          !!!parse-error (type => 'no PUBLIC literal');
1533    
1534          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1535          !!!next-input-character;          !!!next-input-character;
1536    
1537          delete $self->{current_token}->{correct};          delete $self->{current_token}->{correct};
# Line 1397  sub _get_next_token ($) { Line 1541  sub _get_next_token ($) {
1541        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1542          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1543    
1544          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1545          ## reconsume          ## reconsume
1546    
1547          delete $self->{current_token}->{correct};          delete $self->{current_token}->{correct};
# Line 1406  sub _get_next_token ($) { Line 1550  sub _get_next_token ($) {
1550          redo A;          redo A;
1551        } else {        } else {
1552          !!!parse-error (type => 'string after PUBLIC');          !!!parse-error (type => 'string after PUBLIC');
1553          $self->{state} = 'bogus DOCTYPE';          $self->{state} = BOGUS_DOCTYPE_STATE;
1554          !!!next-input-character;          !!!next-input-character;
1555          redo A;          redo A;
1556        }        }
1557      } elsif ($self->{state} eq 'DOCTYPE public identifier (double-quoted)') {      } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1558        if ($self->{next_input_character} == 0x0022) { # "        if ($self->{next_input_character} == 0x0022) { # "
1559          $self->{state} = 'after DOCTYPE public identifier';          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1560            !!!next-input-character;
1561            redo A;
1562          } elsif ($self->{next_input_character} == 0x003E) { # >
1563            !!!parse-error (type => 'unclosed PUBLIC literal');
1564    
1565            $self->{state} = DATA_STATE;
1566          !!!next-input-character;          !!!next-input-character;
1567    
1568            delete $self->{current_token}->{correct};
1569            !!!emit ($self->{current_token}); # DOCTYPE
1570    
1571          redo A;          redo A;
1572        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1573          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
1574    
1575          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1576          ## reconsume          ## reconsume
1577    
1578          delete $self->{current_token}->{correct};          delete $self->{current_token}->{correct};
# Line 1432  sub _get_next_token ($) { Line 1586  sub _get_next_token ($) {
1586          !!!next-input-character;          !!!next-input-character;
1587          redo A;          redo A;
1588        }        }
1589      } elsif ($self->{state} eq 'DOCTYPE public identifier (single-quoted)') {      } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
1590        if ($self->{next_input_character} == 0x0027) { # '        if ($self->{next_input_character} == 0x0027) { # '
1591          $self->{state} = 'after DOCTYPE public identifier';          $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1592          !!!next-input-character;          !!!next-input-character;
1593          redo A;          redo A;
1594          } elsif ($self->{next_input_character} == 0x003E) { # >
1595            !!!parse-error (type => 'unclosed PUBLIC literal');
1596    
1597            $self->{state} = DATA_STATE;
1598            !!!next-input-character;
1599    
1600            delete $self->{current_token}->{correct};
1601            !!!emit ($self->{current_token}); # DOCTYPE
1602    
1603            redo A;
1604        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1605          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
1606    
1607          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1608          ## reconsume          ## reconsume
1609    
1610          delete $self->{current_token}->{correct};          delete $self->{current_token}->{correct};
# Line 1454  sub _get_next_token ($) { Line 1618  sub _get_next_token ($) {
1618          !!!next-input-character;          !!!next-input-character;
1619          redo A;          redo A;
1620        }        }
1621      } elsif ($self->{state} eq 'after DOCTYPE public identifier') {      } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
1622        if ({        if ({
1623              0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,              0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1624              #0x000D => 1, # HT, LF, VT, FF, SP, CR              #0x000D => 1, # HT, LF, VT, FF, SP, CR
# Line 1464  sub _get_next_token ($) { Line 1628  sub _get_next_token ($) {
1628          redo A;          redo A;
1629        } elsif ($self->{next_input_character} == 0x0022) { # "        } elsif ($self->{next_input_character} == 0x0022) { # "
1630          $self->{current_token}->{system_identifier} = ''; # DOCTYPE          $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1631          $self->{state} = 'DOCTYPE system identifier (double-quoted)';          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1632          !!!next-input-character;          !!!next-input-character;
1633          redo A;          redo A;
1634        } elsif ($self->{next_input_character} == 0x0027) { # '        } elsif ($self->{next_input_character} == 0x0027) { # '
1635          $self->{current_token}->{system_identifier} = ''; # DOCTYPE          $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1636          $self->{state} = 'DOCTYPE system identifier (single-quoted)';          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1637          !!!next-input-character;          !!!next-input-character;
1638          redo A;          redo A;
1639        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
1640          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1641          !!!next-input-character;          !!!next-input-character;
1642    
1643          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{current_token}); # DOCTYPE
# Line 1482  sub _get_next_token ($) { Line 1646  sub _get_next_token ($) {
1646        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1647          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1648    
1649          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1650          ## reconsume          ## reconsume
1651    
1652          delete $self->{current_token}->{correct};          delete $self->{current_token}->{correct};
# Line 1491  sub _get_next_token ($) { Line 1655  sub _get_next_token ($) {
1655          redo A;          redo A;
1656        } else {        } else {
1657          !!!parse-error (type => 'string after PUBLIC literal');          !!!parse-error (type => 'string after PUBLIC literal');
1658          $self->{state} = 'bogus DOCTYPE';          $self->{state} = BOGUS_DOCTYPE_STATE;
1659          !!!next-input-character;          !!!next-input-character;
1660          redo A;          redo A;
1661        }        }
1662      } elsif ($self->{state} eq 'before DOCTYPE system identifier') {      } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1663        if ({        if ({
1664              0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,              0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1665              #0x000D => 1, # HT, LF, VT, FF, SP, CR              #0x000D => 1, # HT, LF, VT, FF, SP, CR
# Line 1505  sub _get_next_token ($) { Line 1669  sub _get_next_token ($) {
1669          redo A;          redo A;
1670        } elsif ($self->{next_input_character} == 0x0022) { # "        } elsif ($self->{next_input_character} == 0x0022) { # "
1671          $self->{current_token}->{system_identifier} = ''; # DOCTYPE          $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1672          $self->{state} = 'DOCTYPE system identifier (double-quoted)';          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
1673          !!!next-input-character;          !!!next-input-character;
1674          redo A;          redo A;
1675        } elsif ($self->{next_input_character} == 0x0027) { # '        } elsif ($self->{next_input_character} == 0x0027) { # '
1676          $self->{current_token}->{system_identifier} = ''; # DOCTYPE          $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1677          $self->{state} = 'DOCTYPE system identifier (single-quoted)';          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
1678          !!!next-input-character;          !!!next-input-character;
1679          redo A;          redo A;
1680        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
1681          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
1682          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1683          !!!next-input-character;          !!!next-input-character;
1684    
1685          delete $self->{current_token}->{correct};          delete $self->{current_token}->{correct};
# Line 1525  sub _get_next_token ($) { Line 1689  sub _get_next_token ($) {
1689        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1690          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1691    
1692          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1693          ## reconsume          ## reconsume
1694    
1695          delete $self->{current_token}->{correct};          delete $self->{current_token}->{correct};
# Line 1534  sub _get_next_token ($) { Line 1698  sub _get_next_token ($) {
1698          redo A;          redo A;
1699        } else {        } else {
1700          !!!parse-error (type => 'string after SYSTEM');          !!!parse-error (type => 'string after SYSTEM');
1701          $self->{state} = 'bogus DOCTYPE';          $self->{state} = BOGUS_DOCTYPE_STATE;
1702          !!!next-input-character;          !!!next-input-character;
1703          redo A;          redo A;
1704        }        }
1705      } elsif ($self->{state} eq 'DOCTYPE system identifier (double-quoted)') {      } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
1706        if ($self->{next_input_character} == 0x0022) { # "        if ($self->{next_input_character} == 0x0022) { # "
1707          $self->{state} = 'after DOCTYPE system identifier';          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1708            !!!next-input-character;
1709            redo A;
1710          } elsif ($self->{next_input_character} == 0x003E) { # >
1711            !!!parse-error (type => 'unclosed PUBLIC literal');
1712    
1713            $self->{state} = DATA_STATE;
1714          !!!next-input-character;          !!!next-input-character;
1715    
1716            delete $self->{current_token}->{correct};
1717            !!!emit ($self->{current_token}); # DOCTYPE
1718    
1719          redo A;          redo A;
1720        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1721          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
1722    
1723          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1724          ## reconsume          ## reconsume
1725    
1726          delete $self->{current_token}->{correct};          delete $self->{current_token}->{correct};
# Line 1560  sub _get_next_token ($) { Line 1734  sub _get_next_token ($) {
1734          !!!next-input-character;          !!!next-input-character;
1735          redo A;          redo A;
1736        }        }
1737      } elsif ($self->{state} eq 'DOCTYPE system identifier (single-quoted)') {      } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
1738        if ($self->{next_input_character} == 0x0027) { # '        if ($self->{next_input_character} == 0x0027) { # '
1739          $self->{state} = 'after DOCTYPE system identifier';          $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
1740            !!!next-input-character;
1741            redo A;
1742          } elsif ($self->{next_input_character} == 0x003E) { # >
1743            !!!parse-error (type => 'unclosed PUBLIC literal');
1744    
1745            $self->{state} = DATA_STATE;
1746          !!!next-input-character;          !!!next-input-character;
1747    
1748            delete $self->{current_token}->{correct};
1749            !!!emit ($self->{current_token}); # DOCTYPE
1750    
1751          redo A;          redo A;
1752        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1753          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
1754    
1755          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1756          ## reconsume          ## reconsume
1757    
1758          delete $self->{current_token}->{correct};          delete $self->{current_token}->{correct};
# Line 1582  sub _get_next_token ($) { Line 1766  sub _get_next_token ($) {
1766          !!!next-input-character;          !!!next-input-character;
1767          redo A;          redo A;
1768        }        }
1769      } elsif ($self->{state} eq 'after DOCTYPE system identifier') {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
1770        if ({        if ({
1771              0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,              0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1772              #0x000D => 1, # HT, LF, VT, FF, SP, CR              #0x000D => 1, # HT, LF, VT, FF, SP, CR
# Line 1591  sub _get_next_token ($) { Line 1775  sub _get_next_token ($) {
1775          !!!next-input-character;          !!!next-input-character;
1776          redo A;          redo A;
1777        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
1778          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1779          !!!next-input-character;          !!!next-input-character;
1780    
1781          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{current_token}); # DOCTYPE
# Line 1600  sub _get_next_token ($) { Line 1784  sub _get_next_token ($) {
1784        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1785          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1786    
1787          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1788          ## reconsume          ## reconsume
1789    
1790          delete $self->{current_token}->{correct};          delete $self->{current_token}->{correct};
# Line 1609  sub _get_next_token ($) { Line 1793  sub _get_next_token ($) {
1793          redo A;          redo A;
1794        } else {        } else {
1795          !!!parse-error (type => 'string after SYSTEM literal');          !!!parse-error (type => 'string after SYSTEM literal');
1796          $self->{state} = 'bogus DOCTYPE';          $self->{state} = BOGUS_DOCTYPE_STATE;
1797          !!!next-input-character;          !!!next-input-character;
1798          redo A;          redo A;
1799        }        }
1800      } elsif ($self->{state} eq 'bogus DOCTYPE') {      } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
1801        if ($self->{next_input_character} == 0x003E) { # >        if ($self->{next_input_character} == 0x003E) { # >
1802          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1803          !!!next-input-character;          !!!next-input-character;
1804    
1805          delete $self->{current_token}->{correct};          delete $self->{current_token}->{correct};
# Line 1624  sub _get_next_token ($) { Line 1808  sub _get_next_token ($) {
1808          redo A;          redo A;
1809        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1810          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1811          $self->{state} = 'data';          $self->{state} = DATA_STATE;
1812          ## reconsume          ## reconsume
1813    
1814          delete $self->{current_token}->{correct};          delete $self->{current_token}->{correct};
# Line 1705  sub _tokenize_attempt_to_consume_an_enti Line 1889  sub _tokenize_attempt_to_consume_an_enti
1889            $code = $c1_entity_char->{$code};            $code = $c1_entity_char->{$code};
1890          }          }
1891    
1892          return {type => CHARACTER_TOKEN, data => chr $code};          return {type => CHARACTER_TOKEN, data => chr $code,
1893                    has_reference => 1};
1894        } # X        } # X
1895      } elsif (0x0030 <= $self->{next_input_character} and      } elsif (0x0030 <= $self->{next_input_character} and
1896               $self->{next_input_character} <= 0x0039) { # 0..9               $self->{next_input_character} <= 0x0039) { # 0..9
# Line 1740  sub _tokenize_attempt_to_consume_an_enti Line 1925  sub _tokenize_attempt_to_consume_an_enti
1925          $code = $c1_entity_char->{$code};          $code = $c1_entity_char->{$code};
1926        }        }
1927                
1928        return {type => CHARACTER_TOKEN, data => chr $code};        return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1};
1929      } else {      } else {
1930        !!!parse-error (type => 'bare nero');        !!!parse-error (type => 'bare nero');
1931        !!!back-next-input-character ($self->{next_input_character});        !!!back-next-input-character ($self->{next_input_character});
# Line 1788  sub _tokenize_attempt_to_consume_an_enti Line 1973  sub _tokenize_attempt_to_consume_an_enti
1973      }      }
1974            
1975      if ($match > 0) {      if ($match > 0) {
1976        return {type => CHARACTER_TOKEN, data => $value};        return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
1977      } elsif ($match < 0) {      } elsif ($match < 0) {
1978        !!!parse-error (type => 'no refc');        !!!parse-error (type => 'no refc');
1979        if ($in_attr and $match < -1) {        if ($in_attr and $match < -1) {
1980          return {type => CHARACTER_TOKEN, data => '&'.$entity_name};          return {type => CHARACTER_TOKEN, data => '&'.$entity_name};
1981        } else {        } else {
1982          return {type => CHARACTER_TOKEN, data => $value};          return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};
1983        }        }
1984      } else {      } else {
1985        !!!parse-error (type => 'bare ero');        !!!parse-error (type => 'bare ero');
1986        ## NOTE: No characters are consumed in the spec.        ## NOTE: "No characters are consumed" in the spec.
1987        return {type => CHARACTER_TOKEN, data => '&'.$value};        return {type => CHARACTER_TOKEN, data => '&'.$value};
1988      }      }
1989    } else {    } else {
# Line 2046  sub _tree_construction_root_element ($) Line 2231  sub _tree_construction_root_element ($)
2231              redo B;              redo B;
2232            }            }
2233          }          }
2234    
2235            $self->{application_cache_selection}->(undef);
2236    
2237            #
2238          } elsif ($token->{type} == START_TAG_TOKEN) {
2239            if ($token->{tag_name} eq 'html' and
2240                $token->{attributes}->{manifest}) {
2241              $self->{application_cache_selection}
2242                   ->($token->{attributes}->{manifest}->{value});
2243              ## ISSUE: No relative reference resolution?
2244            } else {
2245              $self->{application_cache_selection}->(undef);
2246            }
2247    
2248            ## ISSUE: There is an issue in the spec
2249          #          #
2250        } elsif ({        } elsif ({
                 START_TAG_TOKEN, 1,  
2251                  END_TAG_TOKEN, 1,                  END_TAG_TOKEN, 1,
2252                  END_OF_FILE_TOKEN, 1,                  END_OF_FILE_TOKEN, 1,
2253                 }->{$token->{type}}) {                 }->{$token->{type}}) {
2254            $self->{application_cache_selection}->(undef);
2255    
2256          ## ISSUE: There is an issue in the spec          ## ISSUE: There is an issue in the spec
2257          #          #
2258        } else {        } else {
2259          die "$0: $token->{type}: Unknown token type";          die "$0: $token->{type}: Unknown token type";
2260        }        }
2261    
2262        my $root_element; !!!create-element ($root_element, 'html');        my $root_element; !!!create-element ($root_element, 'html');
2263        $self->{document}->append_child ($root_element);        $self->{document}->append_child ($root_element);
2264        push @{$self->{open_elements}}, [$root_element, 'html'];        push @{$self->{open_elements}}, [$root_element, 'html'];
# Line 2713  sub _tree_construction_main ($) { Line 2915  sub _tree_construction_main ($) {
2915                  push @{$self->{open_elements}}, [$self->{head_element}, 'head'];                  push @{$self->{open_elements}}, [$self->{head_element}, 'head'];
2916                }                }
2917                !!!insert-element ($token->{tag_name}, $token->{attributes});                !!!insert-element ($token->{tag_name}, $token->{attributes});
2918                pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.                my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
2919    
2920                unless ($self->{confident}) {                unless ($self->{confident}) {
                 my $charset;  
2921                  if ($token->{attributes}->{charset}) { ## TODO: And if supported                  if ($token->{attributes}->{charset}) { ## TODO: And if supported
2922                    $charset = $token->{attributes}->{charset}->{value};                    $self->{change_encoding}
2923                  }                        ->($self, $token->{attributes}->{charset}->{value});
2924                  if ($token->{attributes}->{'http-equiv'}) {                    
2925                      $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
2926                          ->set_user_data (manakai_has_reference =>
2927                                               $token->{attributes}->{charset}
2928                                                   ->{has_reference});
2929                    } elsif ($token->{attributes}->{content}) {
2930                    ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.                    ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
2931                    if ($token->{attributes}->{'http-equiv'}->{value}                    if ($token->{attributes}->{content}->{value}
2932                        =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=                        =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
2933                              [\x09-\x0D\x20]*=
2934                            [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|                            [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
2935                            ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {                            ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
2936                      $charset = defined $1 ? $1 : defined $2 ? $2 : $3;                      $self->{change_encoding}
2937                    } ## TODO: And if supported                          ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
2938                        $meta_el->[0]->get_attribute_node_ns (undef, 'content')
2939                            ->set_user_data (manakai_has_reference =>
2940                                                 $token->{attributes}->{content}
2941                                                       ->{has_reference});
2942                      }
2943                    }
2944                  } else {
2945                    if ($token->{attributes}->{charset}) {
2946                      $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
2947                          ->set_user_data (manakai_has_reference =>
2948                                               $token->{attributes}->{charset}
2949                                                   ->{has_reference});
2950                    }
2951                    if ($token->{attributes}->{content}) {
2952                      $meta_el->[0]->get_attribute_node_ns (undef, 'content')
2953                          ->set_user_data (manakai_has_reference =>
2954                                               $token->{attributes}->{content}
2955                                                   ->{has_reference});
2956                  }                  }
                 ## TODO: Change the encoding  
2957                }                }
2958    
               ## TODO: Extracting |charset| from |meta|.  
2959                pop @{$self->{open_elements}}                pop @{$self->{open_elements}}
2960                    if $self->{insertion_mode} == AFTER_HEAD_IM;                    if $self->{insertion_mode} == AFTER_HEAD_IM;
2961                !!!next-token;                !!!next-token;
# Line 3302  sub _tree_construction_main ($) { Line 3525  sub _tree_construction_main ($) {
3525        $insert = $insert_to_current;        $insert = $insert_to_current;
3526        #        #
3527      } elsif ($self->{insertion_mode} & TABLE_IMS) {      } elsif ($self->{insertion_mode} & TABLE_IMS) {
3528            if ($token->{type} == CHARACTER_TOKEN) {        if ($token->{type} == CHARACTER_TOKEN) {
             ## NOTE: There are "character in table" code clones.  
3529              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
3530                $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);                $self->{open_elements}->[-1]->[0]->manakai_append_text ($1);
3531                                
# Line 3360  sub _tree_construction_main ($) { Line 3582  sub _tree_construction_main ($) {
3582                            
3583              !!!next-token;              !!!next-token;
3584              redo B;              redo B;
3585            } elsif ($token->{type} == START_TAG_TOKEN) {        } elsif ($token->{type} == START_TAG_TOKEN) {
3586              if ({              if ({
3587                   tr => ($self->{insertion_mode} != IN_ROW_IM),                   tr => ($self->{insertion_mode} != IN_ROW_IM),
3588                   th => 1, td => 1,                   th => 1, td => 1,
# Line 3546  sub _tree_construction_main ($) { Line 3768  sub _tree_construction_main ($) {
3768                  die "$0: in table: <>: $token->{tag_name}";                  die "$0: in table: <>: $token->{tag_name}";
3769                }                }
3770              } elsif ($token->{tag_name} eq 'table') {              } elsif ($token->{tag_name} eq 'table') {
               ## NOTE: There are code clones for this "table in table"  
3771                !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);                !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
3772    
3773                ## As if </table>                ## As if </table>
# Line 3594  sub _tree_construction_main ($) { Line 3815  sub _tree_construction_main ($) {
3815    
3816                ## reprocess                ## reprocess
3817                redo B;                redo B;
3818              } else {          } else {
3819                #            !!!parse-error (type => 'in table:'.$token->{tag_name});
3820              }  
3821            } elsif ($token->{type} == END_TAG_TOKEN) {            $insert = $insert_to_foster;
3822              #
3823            }
3824          } elsif ($token->{type} == END_TAG_TOKEN) {
3825              if ($token->{tag_name} eq 'tr' and              if ($token->{tag_name} eq 'tr' and
3826                  $self->{insertion_mode} == IN_ROW_IM) {                  $self->{insertion_mode} == IN_ROW_IM) {
3827                ## have an element in table scope                ## have an element in table scope
# Line 3854  sub _tree_construction_main ($) { Line 4078  sub _tree_construction_main ($) {
4078                ## Ignore the token                ## Ignore the token
4079                !!!next-token;                !!!next-token;
4080                redo B;                redo B;
4081              } else {          } else {
4082                #            !!!parse-error (type => 'in table:/'.$token->{tag_name});
             }  
           } else {  
             die "$0: $token->{type}: Unknown token type";  
           }  
   
       !!!parse-error (type => 'in table:'.$token->{tag_name});  
4083    
4084        $insert = $insert_to_foster;            $insert = $insert_to_foster;
4085        #            #
4086            }
4087          } else {
4088            die "$0: $token->{type}: Unknown token type";
4089          }
4090      } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {      } elsif ($self->{insertion_mode} == IN_COLUMN_GROUP_IM) {
4091            if ($token->{type} == CHARACTER_TOKEN) {            if ($token->{type} == CHARACTER_TOKEN) {
4092              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {              if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
# Line 3923  sub _tree_construction_main ($) { Line 4145  sub _tree_construction_main ($) {
4145              redo B;              redo B;
4146            }            }
4147      } elsif ($self->{insertion_mode} == IN_SELECT_IM) {      } elsif ($self->{insertion_mode} == IN_SELECT_IM) {
4148            if ($token->{type} == CHARACTER_TOKEN) {        if ($token->{type} == CHARACTER_TOKEN) {
4149              $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});          $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
4150              !!!next-token;          !!!next-token;
4151              redo B;          redo B;
4152            } elsif ($token->{type} == START_TAG_TOKEN) {        } elsif ($token->{type} == START_TAG_TOKEN) {
4153              if ($token->{tag_name} eq 'option') {              if ($token->{tag_name} eq 'option') {
4154                if ($self->{open_elements}->[-1]->[1] eq 'option') {                if ($self->{open_elements}->[-1]->[1] eq 'option') {
4155                  ## As if </option>                  ## As if </option>
# Line 3980  sub _tree_construction_main ($) { Line 4202  sub _tree_construction_main ($) {
4202    
4203                !!!next-token;                !!!next-token;
4204                redo B;                redo B;
4205              } else {          } else {
4206                #            !!!parse-error (type => 'in select:'.$token->{tag_name});
4207              }            ## Ignore the token
4208            } elsif ($token->{type} == END_TAG_TOKEN) {            !!!next-token;
4209              redo B;
4210            }
4211          } elsif ($token->{type} == END_TAG_TOKEN) {
4212              if ($token->{tag_name} eq 'optgroup') {              if ($token->{tag_name} eq 'optgroup') {
4213                if ($self->{open_elements}->[-1]->[1] eq 'option' and                if ($self->{open_elements}->[-1]->[1] eq 'option' and
4214                    $self->{open_elements}->[-2]->[1] eq 'optgroup') {                    $self->{open_elements}->[-2]->[1] eq 'optgroup') {
# Line 4085  sub _tree_construction_main ($) { Line 4310  sub _tree_construction_main ($) {
4310    
4311                ## reprocess                ## reprocess
4312                redo B;                redo B;
4313              } else {          } else {
4314                #            !!!parse-error (type => 'in select:/'.$token->{tag_name});
             }  
           } else {  
             #  
           }  
   
           !!!parse-error (type => 'in select:'.$token->{tag_name});  
4315            ## Ignore the token            ## Ignore the token
4316            !!!next-token;            !!!next-token;
4317            redo B;            redo B;
4318            }
4319          } else {
4320            die "$0: $token->{type}: Unknown token type";
4321          }
4322      } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {      } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) {
4323        if ($token->{type} == CHARACTER_TOKEN) {        if ($token->{type} == CHARACTER_TOKEN) {
4324          if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {          if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {
# Line 4303  sub _tree_construction_main ($) { Line 4526  sub _tree_construction_main ($) {
4526        } elsif ($token->{tag_name} eq 'meta') {        } elsif ($token->{tag_name} eq 'meta') {
4527          ## NOTE: This is an "as if in head" code clone, only "-t" differs          ## NOTE: This is an "as if in head" code clone, only "-t" differs
4528          !!!insert-element-t ($token->{tag_name}, $token->{attributes});          !!!insert-element-t ($token->{tag_name}, $token->{attributes});
4529          pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.          my $meta_el = pop @{$self->{open_elements}}; ## ISSUE: This step is missing in the spec.
4530    
4531          unless ($self->{confident}) {          unless ($self->{confident}) {
           my $charset;  
4532            if ($token->{attributes}->{charset}) { ## TODO: And if supported            if ($token->{attributes}->{charset}) { ## TODO: And if supported
4533              $charset = $token->{attributes}->{charset}->{value};              $self->{change_encoding}
4534            }                  ->($self, $token->{attributes}->{charset}->{value});
4535            if ($token->{attributes}->{'http-equiv'}) {              
4536                $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4537                    ->set_user_data (manakai_has_reference =>
4538                                         $token->{attributes}->{charset}
4539                                             ->{has_reference});
4540              } elsif ($token->{attributes}->{content}) {
4541              ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.              ## ISSUE: Algorithm name in the spec was incorrect so that not linked to the definition.
4542              if ($token->{attributes}->{'http-equiv'}->{value}              if ($token->{attributes}->{content}->{value}
4543                  =~ /\A[^;]*;[\x09-\x0D\x20]*charset[\x09-\x0D\x20]*=                  =~ /\A[^;]*;[\x09-\x0D\x20]*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]
4544                        [\x09-\x0D\x20]*=
4545                      [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|                      [\x09-\x0D\x20]*(?>"([^"]*)"|'([^']*)'|
4546                      ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {                      ([^"'\x09-\x0D\x20][^\x09-\x0D\x20]*))/x) {
4547                $charset = defined $1 ? $1 : defined $2 ? $2 : $3;                $self->{change_encoding}
4548              } ## TODO: And if supported                    ->($self, defined $1 ? $1 : defined $2 ? $2 : $3);
4549                  $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4550                      ->set_user_data (manakai_has_reference =>
4551                                           $token->{attributes}->{content}
4552                                                 ->{has_reference});
4553                }
4554              }
4555            } else {
4556              if ($token->{attributes}->{charset}) {
4557                $meta_el->[0]->get_attribute_node_ns (undef, 'charset')
4558                    ->set_user_data (manakai_has_reference =>
4559                                         $token->{attributes}->{charset}
4560                                             ->{has_reference});
4561              }
4562              if ($token->{attributes}->{content}) {
4563                $meta_el->[0]->get_attribute_node_ns (undef, 'content')
4564                    ->set_user_data (manakai_has_reference =>
4565                                         $token->{attributes}->{content}
4566                                             ->{has_reference});
4567            }            }
           ## TODO: Change the encoding  
4568          }          }
4569    
4570          !!!next-token;          !!!next-token;
# Line 4626  sub _tree_construction_main ($) { Line 4871  sub _tree_construction_main ($) {
4871          INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {          INSCOPE: for (reverse 0..$#{$self->{open_elements}}) {
4872            my $node = $self->{open_elements}->[$_];            my $node = $self->{open_elements}->[$_];
4873            if ($node->[1] eq 'nobr') {            if ($node->[1] eq 'nobr') {
4874              !!!parse-error (type => 'not closed:nobr');              !!!parse-error (type => 'in nobr:nobr');
4875              !!!back-token;              !!!back-token;
4876              $token = {type => END_TAG_TOKEN, tag_name => 'nobr'};              $token = {type => END_TAG_TOKEN, tag_name => 'nobr'};
4877              redo B;              redo B;
# Line 4831  sub _tree_construction_main ($) { Line 5076  sub _tree_construction_main ($) {
5076                  noframes => 1,                  noframes => 1,
5077                  noscript => 0, ## TODO: 1 if scripting is enabled                  noscript => 0, ## TODO: 1 if scripting is enabled
5078                 }->{$token->{tag_name}}) {                 }->{$token->{tag_name}}) {
5079          ## NOTE: There are two "as if in body" code clones.          ## NOTE: There is an "as if in body" code clone.
5080          $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);          $parse_rcdata->(CDATA_CONTENT_MODEL, $insert);
5081          redo B;          redo B;
5082        } elsif ($token->{tag_name} eq 'select') {        } elsif ($token->{tag_name} eq 'select') {
# Line 4987  sub _tree_construction_main ($) { Line 5232  sub _tree_construction_main ($) {
5232          if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {          if ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
5233            pop @{$self->{open_elements}};            pop @{$self->{open_elements}};
5234          } else {          } else {
5235            !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);            !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5236          }          }
5237    
5238          undef $self->{form_element};          undef $self->{form_element};
# Line 5025  sub _tree_construction_main ($) { Line 5270  sub _tree_construction_main ($) {
5270          } # INSCOPE          } # INSCOPE
5271                    
5272          if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {          if ($self->{open_elements}->[-1]->[1] ne $token->{tag_name}) {
5273            !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);            !!!parse-error (type => 'unmatched end tag:'.$token->{tag_name});
5274          }          }
5275                    
5276          splice @{$self->{open_elements}}, $i if defined $i;          splice @{$self->{open_elements}}, $i if defined $i;
# Line 5094  sub _tree_construction_main ($) { Line 5339  sub _tree_construction_main ($) {
5339                    
5340              ## Step 2              ## Step 2
5341              if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {              if ($token->{tag_name} ne $self->{open_elements}->[-1]->[1]) {
5342                  ## NOTE: <x><y></x>
5343                !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);                !!!parse-error (type => 'not closed:'.$self->{open_elements}->[-1]->[1]);
5344              }              }
5345                            
# Line 5144  sub set_inner_html ($$$) { Line 5390  sub set_inner_html ($$$) {
5390    my $s = \$_[0];    my $s = \$_[0];
5391    my $onerror = $_[1];    my $onerror = $_[1];
5392    
5393      ## ISSUE: Should {confident} be true?
5394    
5395    my $nt = $node->node_type;    my $nt = $node->node_type;
5396    if ($nt == 9) {    if ($nt == 9) {
5397      # MUST      # MUST
# Line 5296  sub set_inner_html ($$$) { Line 5544  sub set_inner_html ($$$) {
5544    
5545  } # tree construction stage  } # tree construction stage
5546    
5547  sub get_inner_html ($$$) {  package Whatpm::HTML::RestartParser;
5548    my (undef, $node, $on_error) = @_;  push our @ISA, 'Error';
   
   ## Step 1  
   my $s = '';  
   
   my $in_cdata;  
   my $parent = $node;  
   while (defined $parent) {  
     if ($parent->node_type == 1 and  
         $parent->namespace_uri eq 'http://www.w3.org/1999/xhtml' and  
         {  
           style => 1, script => 1, xmp => 1, iframe => 1,  
           noembed => 1, noframes => 1, noscript => 1,  
         }->{$parent->local_name}) { ## TODO: case thingy  
       $in_cdata = 1;  
     }  
     $parent = $parent->parent_node;  
   }  
   
   ## Step 2  
   my @node = @{$node->child_nodes};  
   C: while (@node) {  
     my $child = shift @node;  
     unless (ref $child) {  
       if ($child eq 'cdata-out') {  
         $in_cdata = 0;  
       } else {  
         $s .= $child; # end tag  
       }  
       next C;  
     }  
       
     my $nt = $child->node_type;  
     if ($nt == 1) { # Element  
       my $tag_name = $child->tag_name; ## TODO: manakai_tag_name  
       $s .= '<' . $tag_name;  
       ## NOTE: Non-HTML case:  
       ## <http://permalink.gmane.org/gmane.org.w3c.whatwg.discuss/11191>  
   
       my @attrs = @{$child->attributes}; # sort order MUST be stable  
       for my $attr (@attrs) { # order is implementation dependent  
         my $attr_name = $attr->name; ## TODO: manakai_name  
         $s .= ' ' . $attr_name . '="';  
         my $attr_value = $attr->value;  
         ## escape  
         $attr_value =~ s/&/&amp;/g;  
         $attr_value =~ s/</&lt;/g;  
         $attr_value =~ s/>/&gt;/g;  
         $attr_value =~ s/"/&quot;/g;  
         $s .= $attr_value . '"';  
       }  
       $s .= '>';  
         
       next C if {  
         area => 1, base => 1, basefont => 1, bgsound => 1,  
         br => 1, col => 1, embed => 1, frame => 1, hr => 1,  
         img => 1, input => 1, link => 1, meta => 1, param => 1,  
         spacer => 1, wbr => 1,  
       }->{$tag_name};  
   
       $s .= "\x0A" if $tag_name eq 'pre' or $tag_name eq 'textarea';  
   
       if (not $in_cdata and {  
         style => 1, script => 1, xmp => 1, iframe => 1,  
         noembed => 1, noframes => 1, noscript => 1,  
         plaintext => 1,  
       }->{$tag_name}) {  
         unshift @node, 'cdata-out';  
         $in_cdata = 1;  
       }  
   
       unshift @node, @{$child->child_nodes}, '</' . $tag_name . '>';  
     } elsif ($nt == 3 or $nt == 4) {  
       if ($in_cdata) {  
         $s .= $child->data;  
       } else {  
         my $value = $child->data;  
         $value =~ s/&/&amp;/g;  
         $value =~ s/</&lt;/g;  
         $value =~ s/>/&gt;/g;  
         $value =~ s/"/&quot;/g;  
         $s .= $value;  
       }  
     } elsif ($nt == 8) {  
       $s .= '<!--' . $child->data . '-->';  
     } elsif ($nt == 10) {  
       $s .= '<!DOCTYPE ' . $child->name . '>';  
     } elsif ($nt == 5) { # entrefs  
       push @node, @{$child->child_nodes};  
     } else {  
       $on_error->($child) if defined $on_error;  
     }  
     ## ISSUE: This code does not support PIs.  
   } # C  
     
   ## Step 3  
   return \$s;  
 } # get_inner_html  
5549    
5550  1;  1;
5551  # $Date$  # $Date$

Legend:
Removed from v.1.56  
changed lines
  Added in v.1.70

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24