/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.2 by wakaba, Tue Oct 14 04:32:49 2008 UTC revision 1.11 by wakaba, Wed Oct 15 10:50:38 2008 UTC
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 114  sub HEXREF_HEX_STATE () { 48 }
114  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
115  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117    ## XML states
118    sub PI_STATE () { 51 }
119    sub PI_TARGET_STATE () { 52 }
120    sub PI_TARGET_AFTER_STATE () { 53 }
121    sub PI_DATA_STATE () { 54 }
122    sub PI_AFTER_STATE () { 55 }
123    sub PI_DATA_AFTER_STATE () { 56 }
124    
125  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
126  ## list and descriptions)  ## list and descriptions)
127    
# Line 175  sub _initialize_tokenizer ($) { Line 183  sub _initialize_tokenizer ($) {
183    #$self->{level}    #$self->{level}
184    #$self->{set_nc}    #$self->{set_nc}
185    #$self->{parse_error}    #$self->{parse_error}
186      #$self->{is_xml} (if XML)
187    
188    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
189    #$self->{s_kwd}; # state keyword - initialized when used    $self->{s_kwd} = ''; # state keyword
190    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
191    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
192    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 197  sub _initialize_tokenizer ($) { Line 206  sub _initialize_tokenizer ($) {
206    
207  ## A token has:  ## A token has:
208  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
209  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
210  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
211  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
212    ##   ->{target} (PI_TOKEN)
213  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
214  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
215  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 207  sub _initialize_tokenizer ($) { Line 217  sub _initialize_tokenizer ($) {
217  ##        ->{name}  ##        ->{name}
218  ##        ->{value}  ##        ->{value}
219  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
220  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
221    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
222    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
223    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
224  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
225  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
226  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 311  sub _get_next_token ($) { Line 324  sub _get_next_token ($) {
324          }          }
325        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
326          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
327            $self->{s_kwd} .= '-';            if ($self->{s_kwd} eq '<!-') {
             
           if ($self->{s_kwd} eq '<!--') {  
328              !!!cp (3);              !!!cp (3);
329              $self->{escape} = 1; # unless $self->{escape};              $self->{escape} = 1; # unless $self->{escape};
330              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
331              #              #
332            } elsif ($self->{s_kwd} eq '---') {            } elsif ($self->{s_kwd} eq '-') {
333              !!!cp (4);              !!!cp (4);
334              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
335              #              #
336              } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
337                !!!cp (4.1);
338                $self->{s_kwd} .= '-';
339                #
340            } else {            } else {
341              !!!cp (5);              !!!cp (5);
342                $self->{s_kwd} = '-';
343              #              #
344            }            }
345          }          }
# Line 359  sub _get_next_token ($) { Line 375  sub _get_next_token ($) {
375            if ($self->{s_kwd} eq '--') {            if ($self->{s_kwd} eq '--') {
376              !!!cp (8);              !!!cp (8);
377              delete $self->{escape};              delete $self->{escape};
378                #
379            } else {            } else {
380              !!!cp (9);              !!!cp (9);
381                #
382            }            }
383            } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
384              !!!cp (9.1);
385              !!!parse-error (type => 'unmatched mse', ## TODO: type
386                              line => $self->{line_prev},
387                              column => $self->{column_prev} - 1);
388              #
389          } else {          } else {
390            !!!cp (10);            !!!cp (10);
391              #
392          }          }
393                    
394          $self->{s_kwd} = '';          $self->{s_kwd} = '';
395          #          #
396          } elsif ($self->{nc} == 0x005D) { # ]
397            if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
398              !!!cp (10.1);
399              $self->{s_kwd} .= ']';
400            } elsif ($self->{s_kwd} eq ']]') {
401              !!!cp (10.2);
402              #
403            } else {
404              !!!cp (10.3);
405              $self->{s_kwd} = '';
406            }
407            #
408        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
409          !!!cp (11);          !!!cp (11);
410          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 385  sub _get_next_token ($) { Line 422  sub _get_next_token ($) {
422                     data => chr $self->{nc},                     data => chr $self->{nc},
423                     line => $self->{line}, column => $self->{column},                     line => $self->{line}, column => $self->{column},
424                    };                    };
425        if ($self->{read_until}->($token->{data}, q[-!<>&],        if ($self->{read_until}->($token->{data}, q{-!<>&\]},
426                                  length $token->{data})) {                                  length $token->{data})) {
427          $self->{s_kwd} = '';          $self->{s_kwd} = '';
428        }        }
429    
430        ## Stay in the data state.        ## Stay in the data state.
431        if ($self->{content_model} == PCDATA_CONTENT_MODEL) {        if (not $self->{is_xml} and
432              $self->{content_model} == PCDATA_CONTENT_MODEL) {
433          !!!cp (13);          !!!cp (13);
434          $self->{state} = PCDATA_STATE;          $self->{state} = PCDATA_STATE;
435        } else {        } else {
# Line 402  sub _get_next_token ($) { Line 440  sub _get_next_token ($) {
440        !!!emit ($token);        !!!emit ($token);
441        redo A;        redo A;
442      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
443          ## XML5: "tag state".
444    
445        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
446          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
447            !!!cp (15);            !!!cp (15);
# Line 419  sub _get_next_token ($) { Line 459  sub _get_next_token ($) {
459    
460          ## reconsume          ## reconsume
461          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
462            $self->{s_kwd} = '';
463          !!!emit ({type => CHARACTER_TOKEN, data => '<',          !!!emit ({type => CHARACTER_TOKEN, data => '<',
464                    line => $self->{line_prev},                    line => $self->{line_prev},
465                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 440  sub _get_next_token ($) { Line 481  sub _get_next_token ($) {
481            !!!cp (19);            !!!cp (19);
482            $self->{ct}            $self->{ct}
483              = {type => START_TAG_TOKEN,              = {type => START_TAG_TOKEN,
484                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
485                 line => $self->{line_prev},                 line => $self->{line_prev},
486                 column => $self->{column_prev}};                 column => $self->{column_prev}};
487            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
# Line 462  sub _get_next_token ($) { Line 503  sub _get_next_token ($) {
503                            line => $self->{line_prev},                            line => $self->{line_prev},
504                            column => $self->{column_prev});                            column => $self->{column_prev});
505            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
506              $self->{s_kwd} = '';
507            !!!next-input-character;            !!!next-input-character;
508    
509            !!!emit ({type => CHARACTER_TOKEN, data => '<>',            !!!emit ({type => CHARACTER_TOKEN, data => '<>',
# Line 471  sub _get_next_token ($) { Line 513  sub _get_next_token ($) {
513    
514            redo A;            redo A;
515          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
516            !!!cp (22);            if ($self->{is_xml}) {
517            !!!parse-error (type => 'pio',              !!!cp (22.1);
518                            line => $self->{line_prev},              $self->{state} = PI_STATE;
519                            column => $self->{column_prev});              !!!next-input-character;
520            $self->{state} = BOGUS_COMMENT_STATE;              redo A;
521            $self->{ct} = {type => COMMENT_TOKEN, data => '',            } else {
522                                      line => $self->{line_prev},              !!!cp (22);
523                                      column => $self->{column_prev},              !!!parse-error (type => 'pio',
524                                     };                              line => $self->{line_prev},
525            ## $self->{nc} is intentionally left as is                              column => $self->{column_prev});
526            redo A;              $self->{state} = BOGUS_COMMENT_STATE;
527          } else {              $self->{ct} = {type => COMMENT_TOKEN, data => '',
528                               line => $self->{line_prev},
529                               column => $self->{column_prev},
530                              };
531                ## $self->{nc} is intentionally left as is
532                redo A;
533              }
534            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
535            !!!cp (23);            !!!cp (23);
536            !!!parse-error (type => 'bare stago',            !!!parse-error (type => 'bare stago',
537                            line => $self->{line_prev},                            line => $self->{line_prev},
538                            column => $self->{column_prev});                            column => $self->{column_prev});
539            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
540              $self->{s_kwd} = '';
541            ## reconsume            ## reconsume
542    
543            !!!emit ({type => CHARACTER_TOKEN, data => '<',            !!!emit ({type => CHARACTER_TOKEN, data => '<',
# Line 496  sub _get_next_token ($) { Line 546  sub _get_next_token ($) {
546                     });                     });
547    
548            redo A;            redo A;
549            } else {
550              ## XML5: "<:" is a parse error.
551              !!!cp (23.1);
552              $self->{ct} = {type => START_TAG_TOKEN,
553                                        tag_name => chr ($self->{nc}),
554                                        line => $self->{line_prev},
555                                        column => $self->{column_prev}};
556              $self->{state} = TAG_NAME_STATE;
557              !!!next-input-character;
558              redo A;
559          }          }
560        } else {        } else {
561          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 504  sub _get_next_token ($) { Line 564  sub _get_next_token ($) {
564        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
565        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
566    
567          ## XML5: "end tag state".
568    
569        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
570        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
571          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
# Line 516  sub _get_next_token ($) { Line 578  sub _get_next_token ($) {
578            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
579            !!!cp (28);            !!!cp (28);
580            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
581              $self->{s_kwd} = '';
582            ## Reconsume.            ## Reconsume.
583            !!!emit ({type => CHARACTER_TOKEN, data => '</',            !!!emit ({type => CHARACTER_TOKEN, data => '</',
584                      line => $l, column => $c,                      line => $l, column => $c,
# Line 529  sub _get_next_token ($) { Line 592  sub _get_next_token ($) {
592          !!!cp (29);          !!!cp (29);
593          $self->{ct}          $self->{ct}
594              = {type => END_TAG_TOKEN,              = {type => END_TAG_TOKEN,
595                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
596                 line => $l, column => $c};                 line => $l, column => $c};
597          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
598          !!!next-input-character;          !!!next-input-character;
# Line 544  sub _get_next_token ($) { Line 607  sub _get_next_token ($) {
607          !!!next-input-character;          !!!next-input-character;
608          redo A;          redo A;
609        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
         !!!cp (31);  
610          !!!parse-error (type => 'empty end tag',          !!!parse-error (type => 'empty end tag',
611                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
612                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
613          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
614          !!!next-input-character;          $self->{s_kwd} = '';
615            if ($self->{is_xml}) {
616              !!!cp (31);
617              ## XML5: No parse error.
618              
619              ## NOTE: This parser raises a parse error, since it supports
620              ## XML1, not XML5.
621    
622              ## NOTE: A short end tag token.
623              my $ct = {type => END_TAG_TOKEN,
624                        tag_name => '',
625                        line => $self->{line_prev},
626                        column => $self->{column_prev} - 1,
627                       };
628              !!!next-input-character;
629              !!!emit ($ct);
630            } else {
631              !!!cp (31.1);
632              !!!next-input-character;
633            }
634          redo A;          redo A;
635        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
636          !!!cp (32);          !!!cp (32);
637          !!!parse-error (type => 'bare etago');          !!!parse-error (type => 'bare etago');
638            $self->{s_kwd} = '';
639          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
640          # reconsume          # reconsume
641    
# Line 562  sub _get_next_token ($) { Line 644  sub _get_next_token ($) {
644                   });                   });
645    
646          redo A;          redo A;
647        } else {        } elsif (not $self->{is_xml} or
648                   $is_space->{$self->{nc}}) {
649          !!!cp (33);          !!!cp (33);
650          !!!parse-error (type => 'bogus end tag');          !!!parse-error (type => 'bogus end tag',
651                            line => $self->{line_prev}, # "<" of "</"
652                            column => $self->{column_prev} - 1);
653          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
654          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
655                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 577  sub _get_next_token ($) { Line 662  sub _get_next_token ($) {
662          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
663          ## "bogus comment state" entry.          ## "bogus comment state" entry.
664          redo A;          redo A;
665          } else {
666            ## XML5: "</:" is a parse error.
667            !!!cp (30.1);
668            $self->{ct} = {type => END_TAG_TOKEN,
669                           tag_name => chr ($self->{nc}),
670                           line => $l, column => $c};
671            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
672            !!!next-input-character;
673            redo A;
674        }        }
675      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
676        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
# Line 593  sub _get_next_token ($) { Line 687  sub _get_next_token ($) {
687          } else {          } else {
688            !!!cp (25);            !!!cp (25);
689            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
690              $self->{s_kwd} = '';
691            ## Reconsume.            ## Reconsume.
692            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
693                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{s_kwd},
# Line 611  sub _get_next_token ($) { Line 706  sub _get_next_token ($) {
706            !!!cp (26);            !!!cp (26);
707            ## Reconsume.            ## Reconsume.
708            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
709              $self->{s_kwd} = '';
710            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
711                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{s_kwd},
712                      line => $self->{line_prev},                      line => $self->{line_prev},
# Line 652  sub _get_next_token ($) { Line 748  sub _get_next_token ($) {
748            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
749          }          }
750          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
751            $self->{s_kwd} = '';
752          !!!next-input-character;          !!!next-input-character;
753    
754          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 660  sub _get_next_token ($) { Line 757  sub _get_next_token ($) {
757        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
758                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
759          !!!cp (38);          !!!cp (38);
760          $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);          $self->{ct}->{tag_name}
761                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
762            # start tag or end tag            # start tag or end tag
763          ## Stay in this state          ## Stay in this state
764          !!!next-input-character;          !!!next-input-character;
# Line 683  sub _get_next_token ($) { Line 781  sub _get_next_token ($) {
781            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
782          }          }
783          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
784            $self->{s_kwd} = '';
785          # reconsume          # reconsume
786    
787          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 702  sub _get_next_token ($) { Line 801  sub _get_next_token ($) {
801          redo A;          redo A;
802        }        }
803      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
804          ## XML5: "Tag attribute name before state".
805    
806        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
807          !!!cp (45);          !!!cp (45);
808          ## Stay in the state          ## Stay in the state
# Line 723  sub _get_next_token ($) { Line 824  sub _get_next_token ($) {
824            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
825          }          }
826          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
827            $self->{s_kwd} = '';
828          !!!next-input-character;          !!!next-input-character;
829    
830          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 732  sub _get_next_token ($) { Line 834  sub _get_next_token ($) {
834                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
835          !!!cp (49);          !!!cp (49);
836          $self->{ca}          $self->{ca}
837              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
838                 value => '',                 value => '',
839                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
840          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 760  sub _get_next_token ($) { Line 862  sub _get_next_token ($) {
862            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
863          }          }
864          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
865            $self->{s_kwd} = '';
866          # reconsume          # reconsume
867    
868          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 772  sub _get_next_token ($) { Line 875  sub _get_next_token ($) {
875               0x003D => 1, # =               0x003D => 1, # =
876              }->{$self->{nc}}) {              }->{$self->{nc}}) {
877            !!!cp (55);            !!!cp (55);
878              ## XML5: Not a parse error.
879            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
880          } else {          } else {
881            !!!cp (56);            !!!cp (56);
882              ## XML5: ":" raises a parse error and is ignored.
883          }          }
884          $self->{ca}          $self->{ca}
885              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 785  sub _get_next_token ($) { Line 890  sub _get_next_token ($) {
890          redo A;          redo A;
891        }        }
892      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
893          ## XML5: "Tag attribute name state".
894    
895        my $before_leave = sub {        my $before_leave = sub {
896          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
897              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 795  sub _get_next_token ($) { Line 902  sub _get_next_token ($) {
902            !!!cp (58);            !!!cp (58);
903            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
904              = $self->{ca};              = $self->{ca};
905              $self->{ca}->{index} = ++$self->{ct}->{last_index};
906          }          }
907        }; # $before_leave        }; # $before_leave
908    
# Line 811  sub _get_next_token ($) { Line 919  sub _get_next_token ($) {
919          !!!next-input-character;          !!!next-input-character;
920          redo A;          redo A;
921        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
922            if ($self->{is_xml}) {
923              !!!cp (60.1);
924              ## XML5: Not a parse error.
925              !!!parse-error (type => 'no attr value'); ## TODO: type
926            } else {
927              !!!cp (60.2);
928            }
929    
930          $before_leave->();          $before_leave->();
931          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
932            !!!cp (61);            !!!cp (61);
# Line 825  sub _get_next_token ($) { Line 941  sub _get_next_token ($) {
941            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
942          }          }
943          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
944            $self->{s_kwd} = '';
945          !!!next-input-character;          !!!next-input-character;
946    
947          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 833  sub _get_next_token ($) { Line 950  sub _get_next_token ($) {
950        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
951                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
952          !!!cp (63);          !!!cp (63);
953          $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);          $self->{ca}->{name}
954                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
955          ## Stay in the state          ## Stay in the state
956          !!!next-input-character;          !!!next-input-character;
957          redo A;          redo A;
958        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
959          !!!cp (64);          if ($self->{is_xml}) {
960              !!!cp (64);
961              ## XML5: Not a parse error.
962              !!!parse-error (type => 'no attr value'); ## TODO: type
963            } else {
964              !!!cp (64.1);
965            }
966            
967          $before_leave->();          $before_leave->();
968          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
969          !!!next-input-character;          !!!next-input-character;
# Line 862  sub _get_next_token ($) { Line 987  sub _get_next_token ($) {
987            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
988          }          }
989          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
990            $self->{s_kwd} = '';
991          # reconsume          # reconsume
992    
993          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 871  sub _get_next_token ($) { Line 997  sub _get_next_token ($) {
997          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
998              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
999            !!!cp (69);            !!!cp (69);
1000              ## XML5: Not a parse error.
1001            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1002          } else {          } else {
1003            !!!cp (70);            !!!cp (70);
# Line 881  sub _get_next_token ($) { Line 1008  sub _get_next_token ($) {
1008          redo A;          redo A;
1009        }        }
1010      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1011          ## XML5: "Tag attribute name after state".
1012          
1013        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1014          !!!cp (71);          !!!cp (71);
1015          ## Stay in the state          ## Stay in the state
# Line 892  sub _get_next_token ($) { Line 1021  sub _get_next_token ($) {
1021          !!!next-input-character;          !!!next-input-character;
1022          redo A;          redo A;
1023        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1024            if ($self->{is_xml}) {
1025              !!!cp (72.1);
1026              ## XML5: Not a parse error.
1027              !!!parse-error (type => 'no attr value'); ## TODO: type
1028            } else {
1029              !!!cp (72.2);
1030            }
1031    
1032          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1033            !!!cp (73);            !!!cp (73);
1034            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 908  sub _get_next_token ($) { Line 1045  sub _get_next_token ($) {
1045            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1046          }          }
1047          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1048            $self->{s_kwd} = '';
1049          !!!next-input-character;          !!!next-input-character;
1050    
1051          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 917  sub _get_next_token ($) { Line 1055  sub _get_next_token ($) {
1055                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1056          !!!cp (76);          !!!cp (76);
1057          $self->{ca}          $self->{ca}
1058              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1059                 value => '',                 value => '',
1060                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1061          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
1062          !!!next-input-character;          !!!next-input-character;
1063          redo A;          redo A;
1064        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1065          !!!cp (77);          if ($self->{is_xml}) {
1066              !!!cp (77);
1067              ## XML5: Not a parse error.
1068              !!!parse-error (type => 'no attr value'); ## TODO: type
1069            } else {
1070              !!!cp (77.1);
1071            }
1072            
1073          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1074          !!!next-input-character;          !!!next-input-character;
1075          redo A;          redo A;
# Line 945  sub _get_next_token ($) { Line 1090  sub _get_next_token ($) {
1090          } else {          } else {
1091            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1092          }          }
1093            $self->{s_kwd} = '';
1094          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1095          # reconsume          # reconsume
1096    
# Line 952  sub _get_next_token ($) { Line 1098  sub _get_next_token ($) {
1098    
1099          redo A;          redo A;
1100        } else {        } else {
1101            if ($self->{is_xml}) {
1102              !!!cp (78.1);
1103              ## XML5: Not a parse error.
1104              !!!parse-error (type => 'no attr value'); ## TODO: type
1105            } else {
1106              !!!cp (78.2);
1107            }
1108    
1109          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1110              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1111            !!!cp (78);            !!!cp (78);
1112              ## XML5: Not a parse error.
1113            !!!parse-error (type => 'bad attribute name');            !!!parse-error (type => 'bad attribute name');
1114          } else {          } else {
1115            !!!cp (82);            !!!cp (82);
# Line 968  sub _get_next_token ($) { Line 1123  sub _get_next_token ($) {
1123          redo A;                  redo A;        
1124        }        }
1125      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1126          ## XML5: "Tag attribute value before state".
1127    
1128        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1129          !!!cp (83);          !!!cp (83);
1130          ## Stay in the state          ## Stay in the state
# Line 1006  sub _get_next_token ($) { Line 1163  sub _get_next_token ($) {
1163            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1164          }          }
1165          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1166            $self->{s_kwd} = '';
1167          !!!next-input-character;          !!!next-input-character;
1168    
1169          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1029  sub _get_next_token ($) { Line 1187  sub _get_next_token ($) {
1187            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1188          }          }
1189          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1190            $self->{s_kwd} = '';
1191          ## reconsume          ## reconsume
1192    
1193          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1037  sub _get_next_token ($) { Line 1196  sub _get_next_token ($) {
1196        } else {        } else {
1197          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1198            !!!cp (93);            !!!cp (93);
1199              ## XML5: Not a parse error.
1200            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1201            } elsif ($self->{is_xml}) {
1202              !!!cp (93.1);
1203              ## XML5: No parse error.
1204              !!!parse-error (type => 'unquoted attr value'); ## TODO
1205          } else {          } else {
1206            !!!cp (94);            !!!cp (94);
1207          }          }
# Line 1047  sub _get_next_token ($) { Line 1211  sub _get_next_token ($) {
1211          redo A;          redo A;
1212        }        }
1213      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1214          ## XML5: "Tag attribute value double quoted state".
1215          
1216        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1217          !!!cp (95);          !!!cp (95);
1218            ## XML5: "Tag attribute name before state".
1219          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1220          !!!next-input-character;          !!!next-input-character;
1221          redo A;          redo A;
1222        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1223          !!!cp (96);          !!!cp (96);
1224            ## XML5: Not defined yet.
1225    
1226          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1227          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1228          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1081  sub _get_next_token ($) { Line 1250  sub _get_next_token ($) {
1250            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1251          }          }
1252          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1253            $self->{s_kwd} = '';
1254          ## reconsume          ## reconsume
1255    
1256          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1257    
1258          redo A;          redo A;
1259        } else {        } else {
1260          !!!cp (100);          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1261              !!!cp (100);
1262              ## XML5: Not a parse error.
1263              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1264            } else {
1265              !!!cp (100.1);
1266            }
1267          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1268          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1269                                q["&],                                q["&<],
1270                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1271    
1272          ## Stay in the state          ## Stay in the state
# Line 1098  sub _get_next_token ($) { Line 1274  sub _get_next_token ($) {
1274          redo A;          redo A;
1275        }        }
1276      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1277          ## XML5: "Tag attribute value single quoted state".
1278    
1279        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1280          !!!cp (101);          !!!cp (101);
1281            ## XML5: "Before attribute name state" (sic).
1282          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1283          !!!next-input-character;          !!!next-input-character;
1284          redo A;          redo A;
1285        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1286          !!!cp (102);          !!!cp (102);
1287            ## XML5: Not defined yet.
1288    
1289          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1290          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1291          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1132  sub _get_next_token ($) { Line 1313  sub _get_next_token ($) {
1313            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1314          }          }
1315          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1316            $self->{s_kwd} = '';
1317          ## reconsume          ## reconsume
1318    
1319          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1320    
1321          redo A;          redo A;
1322        } else {        } else {
1323          !!!cp (106);          if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1324              !!!cp (106);
1325              ## XML5: Not a parse error.
1326              !!!parse-error (type => 'lt in attr value'); ## TODO: type
1327            } else {
1328              !!!cp (106.1);
1329            }
1330          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1331          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1332                                q['&],                                q['&<],
1333                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1334    
1335          ## Stay in the state          ## Stay in the state
# Line 1149  sub _get_next_token ($) { Line 1337  sub _get_next_token ($) {
1337          redo A;          redo A;
1338        }        }
1339      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1340          ## XML5: "Tag attribute value unquoted state".
1341    
1342        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1343          !!!cp (107);          !!!cp (107);
1344            ## XML5: "Tag attribute name before state".
1345          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1346          !!!next-input-character;          !!!next-input-character;
1347          redo A;          redo A;
1348        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1349          !!!cp (108);          !!!cp (108);
1350    
1351            ## XML5: Not defined yet.
1352    
1353          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1354          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1355          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1182  sub _get_next_token ($) { Line 1376  sub _get_next_token ($) {
1376            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1377          }          }
1378          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1379            $self->{s_kwd} = '';
1380          !!!next-input-character;          !!!next-input-character;
1381    
1382          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1205  sub _get_next_token ($) { Line 1400  sub _get_next_token ($) {
1400            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1401          }          }
1402          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1403            $self->{s_kwd} = '';
1404          ## reconsume          ## reconsume
1405    
1406          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1217  sub _get_next_token ($) { Line 1413  sub _get_next_token ($) {
1413               0x003D => 1, # =               0x003D => 1, # =
1414              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1415            !!!cp (115);            !!!cp (115);
1416              ## XML5: Not a parse error.
1417            !!!parse-error (type => 'bad attribute value');            !!!parse-error (type => 'bad attribute value');
1418          } else {          } else {
1419            !!!cp (116);            !!!cp (116);
# Line 1253  sub _get_next_token ($) { Line 1450  sub _get_next_token ($) {
1450            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1451          }          }
1452          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1453            $self->{s_kwd} = '';
1454          !!!next-input-character;          !!!next-input-character;
1455    
1456          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1280  sub _get_next_token ($) { Line 1478  sub _get_next_token ($) {
1478            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1479          }          }
1480          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1481            $self->{s_kwd} = '';
1482          ## Reconsume.          ## Reconsume.
1483          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1484          redo A;          redo A;
# Line 1291  sub _get_next_token ($) { Line 1490  sub _get_next_token ($) {
1490          redo A;          redo A;
1491        }        }
1492      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1493          ## XML5: "Empty tag state".
1494    
1495        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1496          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
1497            !!!cp ('124.2');            !!!cp ('124.2');
# Line 1310  sub _get_next_token ($) { Line 1511  sub _get_next_token ($) {
1511          }          }
1512    
1513          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1514            $self->{s_kwd} = '';
1515          !!!next-input-character;          !!!next-input-character;
1516    
1517          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1331  sub _get_next_token ($) { Line 1533  sub _get_next_token ($) {
1533          } else {          } else {
1534            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1535          }          }
1536            ## XML5: "Tag attribute name before state".
1537          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1538            $self->{s_kwd} = '';
1539          ## Reconsume.          ## Reconsume.
1540          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1541          redo A;          redo A;
# Line 1352  sub _get_next_token ($) { Line 1556  sub _get_next_token ($) {
1556        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1557          !!!cp (124);          !!!cp (124);
1558          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1559            $self->{s_kwd} = '';
1560          !!!next-input-character;          !!!next-input-character;
1561    
1562          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1359  sub _get_next_token ($) { Line 1564  sub _get_next_token ($) {
1564        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1565          !!!cp (125);          !!!cp (125);
1566          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1567            $self->{s_kwd} = '';
1568          ## reconsume          ## reconsume
1569    
1570          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1390  sub _get_next_token ($) { Line 1596  sub _get_next_token ($) {
1596          $self->{s_kwd} = chr $self->{nc};          $self->{s_kwd} = chr $self->{nc};
1597          !!!next-input-character;          !!!next-input-character;
1598          redo A;          redo A;
1599        } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1600                 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and                   $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1601                    $self->{is_xml}) and
1602                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1603          !!!cp (135.4);                          !!!cp (135.4);                
1604          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
# Line 1419  sub _get_next_token ($) { Line 1626  sub _get_next_token ($) {
1626                                    line => $self->{line_prev},                                    line => $self->{line_prev},
1627                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
1628                                   };                                   };
1629          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1630          !!!next-input-character;          !!!next-input-character;
1631          redo A;          redo A;
1632        } else {        } else {
# Line 1462  sub _get_next_token ($) { Line 1669  sub _get_next_token ($) {
1669        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{s_kwd}) == 6 and
1670                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
1671                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
1672          !!!cp (129);          if ($self->{s_kwd} ne 'DOCTYP') {
1673              !!!cp (129);
1674              ## XML5: case-sensitive.
1675              !!!parse-error (type => 'lowercase keyword', ## TODO
1676                              text => 'DOCTYPE',
1677                              line => $self->{line_prev},
1678                              column => $self->{column_prev} - 5);
1679            } else {
1680              !!!cp (129.1);
1681            }
1682          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
1683          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
1684                                    quirks => 1,                                    quirks => 1,
# Line 1500  sub _get_next_token ($) { Line 1716  sub _get_next_token ($) {
1716          redo A;          redo A;
1717        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{s_kwd} eq '[CDATA' and
1718                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1719          !!!cp (135.2);          if ($self->{is_xml} and
1720                not $self->{tainted} and
1721                @{$self->{open_elements} or []} == 0) {
1722              !!!cp (135.2);
1723              !!!parse-error (type => 'cdata outside of root element',
1724                              line => $self->{line_prev},
1725                              column => $self->{column_prev} - 7);
1726              $self->{tainted} = 1;
1727            } else {
1728              !!!cp (135.21);
1729            }
1730    
1731          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
1732                                    data => '',                                    data => '',
1733                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 1532  sub _get_next_token ($) { Line 1759  sub _get_next_token ($) {
1759          !!!cp (138);          !!!cp (138);
1760          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1761          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1762            $self->{s_kwd} = '';
1763          !!!next-input-character;          !!!next-input-character;
1764    
1765          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1541  sub _get_next_token ($) { Line 1769  sub _get_next_token ($) {
1769          !!!cp (139);          !!!cp (139);
1770          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1771          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1772            $self->{s_kwd} = '';
1773          ## reconsume          ## reconsume
1774    
1775          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1564  sub _get_next_token ($) { Line 1793  sub _get_next_token ($) {
1793          !!!cp (142);          !!!cp (142);
1794          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1795          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1796            $self->{s_kwd} = '';
1797          !!!next-input-character;          !!!next-input-character;
1798    
1799          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1573  sub _get_next_token ($) { Line 1803  sub _get_next_token ($) {
1803          !!!cp (143);          !!!cp (143);
1804          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1805          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1806            $self->{s_kwd} = '';
1807          ## reconsume          ## reconsume
1808    
1809          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1596  sub _get_next_token ($) { Line 1827  sub _get_next_token ($) {
1827          !!!cp (146);          !!!cp (146);
1828          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1829          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1830            $self->{s_kwd} = '';
1831          ## reconsume          ## reconsume
1832    
1833          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1613  sub _get_next_token ($) { Line 1845  sub _get_next_token ($) {
1845          redo A;          redo A;
1846        }        }
1847      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1848          ## XML5: "comment dash state".
1849    
1850        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
1851          !!!cp (148);          !!!cp (148);
1852          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 1621  sub _get_next_token ($) { Line 1855  sub _get_next_token ($) {
1855        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1856          !!!cp (149);          !!!cp (149);
1857          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1858            $self->{s_kwd} = '';
1859          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1860            $self->{s_kwd} = '';
1861          ## reconsume          ## reconsume
1862    
1863          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1638  sub _get_next_token ($) { Line 1874  sub _get_next_token ($) {
1874        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1875          !!!cp (151);          !!!cp (151);
1876          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1877            $self->{s_kwd} = '';
1878          !!!next-input-character;          !!!next-input-character;
1879    
1880          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1645  sub _get_next_token ($) { Line 1882  sub _get_next_token ($) {
1882          redo A;          redo A;
1883        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
1884          !!!cp (152);          !!!cp (152);
1885            ## XML5: Not a parse error.
1886          !!!parse-error (type => 'dash in comment',          !!!parse-error (type => 'dash in comment',
1887                          line => $self->{line_prev},                          line => $self->{line_prev},
1888                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 1656  sub _get_next_token ($) { Line 1894  sub _get_next_token ($) {
1894          !!!cp (153);          !!!cp (153);
1895          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1896          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1897            $self->{s_kwd} = '';
1898          ## reconsume          ## reconsume
1899    
1900          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1663  sub _get_next_token ($) { Line 1902  sub _get_next_token ($) {
1902          redo A;          redo A;
1903        } else {        } else {
1904          !!!cp (154);          !!!cp (154);
1905            ## XML5: Not a parse error.
1906          !!!parse-error (type => 'dash in comment',          !!!parse-error (type => 'dash in comment',
1907                          line => $self->{line_prev},                          line => $self->{line_prev},
1908                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 1694  sub _get_next_token ($) { Line 1934  sub _get_next_token ($) {
1934          !!!cp (158);          !!!cp (158);
1935          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
1936          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1937            $self->{s_kwd} = '';
1938          !!!next-input-character;          !!!next-input-character;
1939    
1940          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
# Line 1703  sub _get_next_token ($) { Line 1944  sub _get_next_token ($) {
1944          !!!cp (159);          !!!cp (159);
1945          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
1946          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1947            $self->{s_kwd} = '';
1948          ## reconsume          ## reconsume
1949    
1950          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
# Line 1726  sub _get_next_token ($) { Line 1968  sub _get_next_token ($) {
1968        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1969          !!!cp (162);          !!!cp (162);
1970          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1971            $self->{s_kwd} = '';
1972          !!!next-input-character;          !!!next-input-character;
1973    
1974          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 1735  sub _get_next_token ($) { Line 1978  sub _get_next_token ($) {
1978          !!!cp (163);          !!!cp (163);
1979          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1980          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1981            $self->{s_kwd} = '';
1982          ## reconsume          ## reconsume
1983    
1984          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1758  sub _get_next_token ($) { Line 2002  sub _get_next_token ($) {
2002        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2003          !!!cp (166);          !!!cp (166);
2004          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2005            $self->{s_kwd} = '';
2006          !!!next-input-character;          !!!next-input-character;
2007    
2008          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 1767  sub _get_next_token ($) { Line 2012  sub _get_next_token ($) {
2012          !!!cp (167);          !!!cp (167);
2013          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2014          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2015            $self->{s_kwd} = '';
2016          ## reconsume          ## reconsume
2017    
2018          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1895  sub _get_next_token ($) { Line 2141  sub _get_next_token ($) {
2141          !!!parse-error (type => 'no PUBLIC literal');          !!!parse-error (type => 'no PUBLIC literal');
2142    
2143          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2144            $self->{s_kwd} = '';
2145          !!!next-input-character;          !!!next-input-character;
2146    
2147          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1906  sub _get_next_token ($) { Line 2153  sub _get_next_token ($) {
2153          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2154    
2155          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2156            $self->{s_kwd} = '';
2157          ## reconsume          ## reconsume
2158    
2159          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1932  sub _get_next_token ($) { Line 2180  sub _get_next_token ($) {
2180          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2181    
2182          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2183            $self->{s_kwd} = '';
2184          !!!next-input-character;          !!!next-input-character;
2185    
2186          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1943  sub _get_next_token ($) { Line 2192  sub _get_next_token ($) {
2192          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2193    
2194          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2195            $self->{s_kwd} = '';
2196          ## reconsume          ## reconsume
2197    
2198          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1971  sub _get_next_token ($) { Line 2221  sub _get_next_token ($) {
2221          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2222    
2223          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2224            $self->{s_kwd} = '';
2225          !!!next-input-character;          !!!next-input-character;
2226    
2227          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1982  sub _get_next_token ($) { Line 2233  sub _get_next_token ($) {
2233          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2234    
2235          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2236            $self->{s_kwd} = '';
2237          ## reconsume          ## reconsume
2238    
2239          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2020  sub _get_next_token ($) { Line 2272  sub _get_next_token ($) {
2272        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2273          !!!cp (198);          !!!cp (198);
2274          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2275            $self->{s_kwd} = '';
2276          !!!next-input-character;          !!!next-input-character;
2277    
2278          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2030  sub _get_next_token ($) { Line 2283  sub _get_next_token ($) {
2283          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2284    
2285          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2286            $self->{s_kwd} = '';
2287          ## reconsume          ## reconsume
2288    
2289          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2067  sub _get_next_token ($) { Line 2321  sub _get_next_token ($) {
2321          !!!cp (204);          !!!cp (204);
2322          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
2323          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2324            $self->{s_kwd} = '';
2325          !!!next-input-character;          !!!next-input-character;
2326    
2327          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2078  sub _get_next_token ($) { Line 2333  sub _get_next_token ($) {
2333          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2334    
2335          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2336            $self->{s_kwd} = '';
2337          ## reconsume          ## reconsume
2338    
2339          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2104  sub _get_next_token ($) { Line 2360  sub _get_next_token ($) {
2360          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2361    
2362          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2363            $self->{s_kwd} = '';
2364          !!!next-input-character;          !!!next-input-character;
2365    
2366          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2115  sub _get_next_token ($) { Line 2372  sub _get_next_token ($) {
2372          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2373    
2374          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2375            $self->{s_kwd} = '';
2376          ## reconsume          ## reconsume
2377    
2378          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2143  sub _get_next_token ($) { Line 2401  sub _get_next_token ($) {
2401          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2402    
2403          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2404            $self->{s_kwd} = '';
2405          !!!next-input-character;          !!!next-input-character;
2406    
2407          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2154  sub _get_next_token ($) { Line 2413  sub _get_next_token ($) {
2413          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2414    
2415          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2416            $self->{s_kwd} = '';
2417          ## reconsume          ## reconsume
2418    
2419          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2180  sub _get_next_token ($) { Line 2440  sub _get_next_token ($) {
2440        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2441          !!!cp (216);          !!!cp (216);
2442          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2443            $self->{s_kwd} = '';
2444          !!!next-input-character;          !!!next-input-character;
2445    
2446          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2189  sub _get_next_token ($) { Line 2450  sub _get_next_token ($) {
2450          !!!cp (217);          !!!cp (217);
2451          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2452          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2453            $self->{s_kwd} = '';
2454          ## reconsume          ## reconsume
2455    
2456          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2208  sub _get_next_token ($) { Line 2470  sub _get_next_token ($) {
2470        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2471          !!!cp (219);          !!!cp (219);
2472          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2473            $self->{s_kwd} = '';
2474          !!!next-input-character;          !!!next-input-character;
2475    
2476          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2216  sub _get_next_token ($) { Line 2479  sub _get_next_token ($) {
2479        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2480          !!!cp (220);          !!!cp (220);
2481          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2482            $self->{s_kwd} = '';
2483          ## reconsume          ## reconsume
2484    
2485          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2234  sub _get_next_token ($) { Line 2498  sub _get_next_token ($) {
2498        ## NOTE: "CDATA section state" in the spec is jointly implemented        ## NOTE: "CDATA section state" in the spec is jointly implemented
2499        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2500        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
2501    
2502          ## XML5: "CDATA state".
2503                
2504        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
2505          !!!cp (221.1);          !!!cp (221.1);
# Line 2241  sub _get_next_token ($) { Line 2507  sub _get_next_token ($) {
2507          !!!next-input-character;          !!!next-input-character;
2508          redo A;          redo A;
2509        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2510            if ($self->{is_xml}) {
2511              !!!cp (221.11);
2512              !!!parse-error (type => 'no mse'); ## TODO: type
2513            } else {
2514              !!!cp (221.12);
2515            }
2516    
2517          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2518          !!!next-input-character;          $self->{s_kwd} = '';
2519            ## Reconsume.
2520          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
2521            !!!cp (221.2);            !!!cp (221.2);
2522            !!!emit ($self->{ct}); # character            !!!emit ($self->{ct}); # character
# Line 2265  sub _get_next_token ($) { Line 2539  sub _get_next_token ($) {
2539    
2540        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
2541      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2542          ## XML5: "CDATA bracket state".
2543    
2544        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
2545          !!!cp (221.5);          !!!cp (221.5);
2546          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 2272  sub _get_next_token ($) { Line 2548  sub _get_next_token ($) {
2548          redo A;          redo A;
2549        } else {        } else {
2550          !!!cp (221.6);          !!!cp (221.6);
2551            ## XML5: If EOF, "]" is not appended and the state is changed to the data state.
2552          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
2553          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2554          ## Reconsume.          ## Reconsume.
2555          redo A;          redo A;
2556        }        }
2557      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2558          ## XML5: "CDATA end state".
2559    
2560        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2561          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2562            $self->{s_kwd} = '';
2563          !!!next-input-character;          !!!next-input-character;
2564          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
2565            !!!cp (221.7);            !!!cp (221.7);
# Line 2299  sub _get_next_token ($) { Line 2579  sub _get_next_token ($) {
2579          !!!cp (221.11);          !!!cp (221.11);
2580          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
2581          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
2582          ## Reconsume.          ## Reconsume. ## XML5: Emit.
2583          redo A;          redo A;
2584        }        }
2585      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 2347  sub _get_next_token ($) { Line 2627  sub _get_next_token ($) {
2627        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2628          !!!cp (997);          !!!cp (997);
2629          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2630            $self->{s_kwd} = '';
2631          ## Reconsume.          ## Reconsume.
2632          !!!emit ({type => CHARACTER_TOKEN, data => '&',          !!!emit ({type => CHARACTER_TOKEN, data => '&',
2633                    line => $self->{line_prev},                    line => $self->{line_prev},
# Line 2357  sub _get_next_token ($) { Line 2638  sub _get_next_token ($) {
2638          !!!cp (996);          !!!cp (996);
2639          $self->{ca}->{value} .= '&';          $self->{ca}->{value} .= '&';
2640          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2641            $self->{s_kwd} = '';
2642          ## Reconsume.          ## Reconsume.
2643          redo A;          redo A;
2644        }        }
# Line 2387  sub _get_next_token ($) { Line 2669  sub _get_next_token ($) {
2669          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
2670            !!!cp (1019);            !!!cp (1019);
2671            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2672              $self->{s_kwd} = '';
2673            ## Reconsume.            ## Reconsume.
2674            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
2675                      data => '&#',                      data => '&#',
# Line 2398  sub _get_next_token ($) { Line 2681  sub _get_next_token ($) {
2681            !!!cp (993);            !!!cp (993);
2682            $self->{ca}->{value} .= '&#';            $self->{ca}->{value} .= '&#';
2683            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2684              $self->{s_kwd} = '';
2685            ## Reconsume.            ## Reconsume.
2686            redo A;            redo A;
2687          }          }
# Line 2443  sub _get_next_token ($) { Line 2727  sub _get_next_token ($) {
2727        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2728          !!!cp (992);          !!!cp (992);
2729          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2730            $self->{s_kwd} = '';
2731          ## Reconsume.          ## Reconsume.
2732          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2733                      has_reference => 1,
2734                    line => $l, column => $c,                    line => $l, column => $c,
2735                   });                   });
2736          redo A;          redo A;
# Line 2453  sub _get_next_token ($) { Line 2739  sub _get_next_token ($) {
2739          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
2740          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
2741          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2742            $self->{s_kwd} = '';
2743          ## Reconsume.          ## Reconsume.
2744          redo A;          redo A;
2745        }        }
# Line 2478  sub _get_next_token ($) { Line 2765  sub _get_next_token ($) {
2765          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
2766            !!!cp (1005);            !!!cp (1005);
2767            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2768              $self->{s_kwd} = '';
2769            ## Reconsume.            ## Reconsume.
2770            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
2771                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{s_kwd},
# Line 2489  sub _get_next_token ($) { Line 2777  sub _get_next_token ($) {
2777            !!!cp (989);            !!!cp (989);
2778            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{s_kwd};
2779            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2780              $self->{s_kwd} = '';
2781            ## Reconsume.            ## Reconsume.
2782            redo A;            redo A;
2783          }          }
# Line 2551  sub _get_next_token ($) { Line 2840  sub _get_next_token ($) {
2840        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2841          !!!cp (988);          !!!cp (988);
2842          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2843            $self->{s_kwd} = '';
2844          ## Reconsume.          ## Reconsume.
2845          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2846                      has_reference => 1,
2847                    line => $l, column => $c,                    line => $l, column => $c,
2848                   });                   });
2849          redo A;          redo A;
# Line 2561  sub _get_next_token ($) { Line 2852  sub _get_next_token ($) {
2852          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
2853          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
2854          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2855            $self->{s_kwd} = '';
2856          ## Reconsume.          ## Reconsume.
2857          redo A;          redo A;
2858        }        }
# Line 2643  sub _get_next_token ($) { Line 2935  sub _get_next_token ($) {
2935        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2936          !!!cp (986);          !!!cp (986);
2937          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2938            $self->{s_kwd} = '';
2939          ## Reconsume.          ## Reconsume.
2940          !!!emit ({type => CHARACTER_TOKEN,          !!!emit ({type => CHARACTER_TOKEN,
2941                    data => $data,                    data => $data,
2942                      has_reference => $has_ref,
2943                    line => $self->{line_prev},                    line => $self->{line_prev},
2944                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{s_kwd},
2945                   });                   });
# Line 2655  sub _get_next_token ($) { Line 2949  sub _get_next_token ($) {
2949          $self->{ca}->{value} .= $data;          $self->{ca}->{value} .= $data;
2950          $self->{ca}->{has_reference} = 1 if $has_ref;          $self->{ca}->{has_reference} = 1 if $has_ref;
2951          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2952            $self->{s_kwd} = '';
2953          ## Reconsume.          ## Reconsume.
2954          redo A;          redo A;
2955        }        }
2956    
2957        ## XML-only states
2958    
2959        } elsif ($self->{state} == PI_STATE) {
2960          if ($is_space->{$self->{nc}} or
2961              $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
2962              $self->{nc} == -1) {
2963            !!!parse-error (type => 'bare pio', ## TODO: type
2964                            line => $self->{line_prev},
2965                            column => $self->{column_prev}
2966                                - 1 * ($self->{nc} != -1));
2967            $self->{state} = BOGUS_COMMENT_STATE;
2968            ## Reconsume.
2969            $self->{ct} = {type => COMMENT_TOKEN,
2970                           data => '?',
2971                           line => $self->{line_prev},
2972                           column => $self->{column_prev}
2973                               - 1 * ($self->{nc} != -1),
2974                          };
2975            redo A;
2976          } else {
2977            $self->{ct} = {type => PI_TOKEN,
2978                           target => chr $self->{nc},
2979                           data => '',
2980                           line => $self->{line_prev},
2981                           column => $self->{column_prev} - 1,
2982                          };
2983            $self->{state} = PI_TARGET_STATE;
2984            !!!next-input-character;
2985            redo A;
2986          }
2987        } elsif ($self->{state} == PI_TARGET_STATE) {
2988          if ($is_space->{$self->{nc}}) {
2989            $self->{state} = PI_TARGET_AFTER_STATE;
2990            !!!next-input-character;
2991            redo A;
2992          } elsif ($self->{nc} == -1) {
2993            !!!parse-error (type => 'no pic'); ## TODO: type
2994            $self->{state} = DATA_STATE;
2995            $self->{s_kwd} = '';
2996            ## Reconsume.
2997            !!!emit ($self->{ct}); # pi
2998            redo A;
2999          } elsif ($self->{nc} == 0x003F) { # ?
3000            $self->{state} = PI_AFTER_STATE;
3001            !!!next-input-character;
3002            redo A;
3003          } else {
3004            ## XML5: typo ("tag name" -> "target")
3005            $self->{ct}->{target} .= chr $self->{nc}; # pi
3006            !!!next-input-character;
3007            redo A;
3008          }
3009        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3010          if ($is_space->{$self->{nc}}) {
3011            ## Stay in the state.
3012            !!!next-input-character;
3013            redo A;
3014          } else {
3015            $self->{state} = PI_DATA_STATE;
3016            ## Reprocess.
3017            redo A;
3018          }
3019        } elsif ($self->{state} == PI_DATA_STATE) {
3020          if ($self->{nc} == 0x003F) { # ?
3021            $self->{state} = PI_DATA_AFTER_STATE;
3022            !!!next-input-character;
3023            redo A;
3024          } elsif ($self->{nc} == -1) {
3025            !!!parse-error (type => 'no pic'); ## TODO: type
3026            $self->{state} = DATA_STATE;
3027            $self->{s_kwd} = '';
3028            ## Reprocess.
3029            !!!emit ($self->{ct}); # pi
3030            redo A;
3031          } else {
3032            $self->{ct}->{data} .= chr $self->{nc}; # pi
3033            $self->{read_until}->($self->{ct}->{data}, q[?],
3034                                  length $self->{ct}->{data});
3035            ## Stay in the state.
3036            !!!next-input-character;
3037            ## Reprocess.
3038            redo A;
3039          }
3040        } elsif ($self->{state} == PI_AFTER_STATE) {
3041          if ($self->{nc} == 0x003E) { # >
3042            $self->{state} = DATA_STATE;
3043            $self->{s_kwd} = '';
3044            !!!next-input-character;
3045            !!!emit ($self->{ct}); # pi
3046            redo A;
3047          } elsif ($self->{nc} == 0x003F) { # ?
3048            !!!parse-error (type => 'no s after target', ## TODO: type
3049                            line => $self->{line_prev},
3050                            column => $self->{column_prev}); ## XML5: no error
3051            $self->{ct}->{data} .= '?';
3052            $self->{state} = PI_DATA_AFTER_STATE;
3053            !!!next-input-character;
3054            redo A;
3055          } else {
3056            !!!parse-error (type => 'no s after target', ## TODO: type
3057                            line => $self->{line_prev},
3058                            column => $self->{column_prev}
3059                                + 1 * ($self->{nc} == -1)); ## XML5: no error
3060            $self->{ct}->{data} .= '?'; ## XML5: not appended
3061            $self->{state} = PI_DATA_STATE;
3062            ## Reprocess.
3063            redo A;
3064          }
3065        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3066          ## XML5: Same as "pi after state" in XML5
3067          if ($self->{nc} == 0x003E) { # >
3068            $self->{state} = DATA_STATE;
3069            $self->{s_kwd} = '';
3070            !!!next-input-character;
3071            !!!emit ($self->{ct}); # pi
3072            redo A;
3073          } elsif ($self->{nc} == 0x003F) { # ?
3074            $self->{ct}->{data} .= '?';
3075            ## Stay in the state.
3076            !!!next-input-character;
3077            redo A;
3078          } else {
3079            $self->{ct}->{data} .= '?'; ## XML5: not appended
3080            $self->{state} = PI_DATA_STATE;
3081            ## Reprocess.
3082            redo A;
3083          }
3084            
3085      } else {      } else {
3086        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
3087      }      }

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.11

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24