/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3 by wakaba, Tue Oct 14 05:34:05 2008 UTC revision 1.11 by wakaba, Wed Oct 15 10:50:38 2008 UTC
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 114  sub HEXREF_HEX_STATE () { 48 }
114  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
115  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117    ## XML states
118    sub PI_STATE () { 51 }
119    sub PI_TARGET_STATE () { 52 }
120    sub PI_TARGET_AFTER_STATE () { 53 }
121    sub PI_DATA_STATE () { 54 }
122    sub PI_AFTER_STATE () { 55 }
123    sub PI_DATA_AFTER_STATE () { 56 }
124    
125  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
126  ## list and descriptions)  ## list and descriptions)
127    
# Line 178  sub _initialize_tokenizer ($) { Line 186  sub _initialize_tokenizer ($) {
186    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
187    
188    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
189    #$self->{s_kwd}; # state keyword - initialized when used    $self->{s_kwd} = ''; # state keyword
190    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
191    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
192    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 208  sub _initialize_tokenizer ($) { Line 216  sub _initialize_tokenizer ($) {
216    
217  ## A token has:  ## A token has:
218  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
219  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
220  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
221  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
222    ##   ->{target} (PI_TOKEN)
223  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
224  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
225  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 218  sub _initialize_tokenizer ($) { Line 227  sub _initialize_tokenizer ($) {
227  ##        ->{name}  ##        ->{name}
228  ##        ->{value}  ##        ->{value}
229  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
230  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
231    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
232    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
233    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
234  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
235  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
236  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 362  sub _get_next_token ($) { Line 374  sub _get_next_token ($) {
374          }          }
375        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
376          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
377            $self->{s_kwd} .= '-';            if ($self->{s_kwd} eq '<!-') {
             
           if ($self->{s_kwd} eq '<!--') {  
378                            
379              $self->{escape} = 1; # unless $self->{escape};              $self->{escape} = 1; # unless $self->{escape};
380              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
381              #              #
382            } elsif ($self->{s_kwd} eq '---') {            } elsif ($self->{s_kwd} eq '-') {
383                            
384              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
385              #              #
386              } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
387                
388                $self->{s_kwd} .= '-';
389                #
390            } else {            } else {
391                            
392                $self->{s_kwd} = '-';
393              #              #
394            }            }
395          }          }
# Line 420  sub _get_next_token ($) { Line 435  sub _get_next_token ($) {
435            if ($self->{s_kwd} eq '--') {            if ($self->{s_kwd} eq '--') {
436                            
437              delete $self->{escape};              delete $self->{escape};
438                #
439            } else {            } else {
440                            
441                #
442            }            }
443            } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
444              
445              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
446                              line => $self->{line_prev},
447                              column => $self->{column_prev} - 1);
448              #
449          } else {          } else {
450                        
451              #
452          }          }
453                    
454          $self->{s_kwd} = '';          $self->{s_kwd} = '';
455          #          #
456          } elsif ($self->{nc} == 0x005D) { # ]
457            if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
458              
459              $self->{s_kwd} .= ']';
460            } elsif ($self->{s_kwd} eq ']]') {
461              
462              #
463            } else {
464              
465              $self->{s_kwd} = '';
466            }
467            #
468        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
469                    
470          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 446  sub _get_next_token ($) { Line 482  sub _get_next_token ($) {
482                     data => chr $self->{nc},                     data => chr $self->{nc},
483                     line => $self->{line}, column => $self->{column},                     line => $self->{line}, column => $self->{column},
484                    };                    };
485        if ($self->{read_until}->($token->{data}, q[-!<>&],        if ($self->{read_until}->($token->{data}, q{-!<>&\]},
486                                  length $token->{data})) {                                  length $token->{data})) {
487          $self->{s_kwd} = '';          $self->{s_kwd} = '';
488        }        }
489    
490        ## Stay in the data state.        ## Stay in the data state.
491        if ($self->{content_model} == PCDATA_CONTENT_MODEL) {        if (not $self->{is_xml} and
492              $self->{content_model} == PCDATA_CONTENT_MODEL) {
493                    
494          $self->{state} = PCDATA_STATE;          $self->{state} = PCDATA_STATE;
495        } else {        } else {
# Line 473  sub _get_next_token ($) { Line 510  sub _get_next_token ($) {
510        return  ($token);        return  ($token);
511        redo A;        redo A;
512      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
513          ## XML5: "tag state".
514    
515        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
516          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
517                        
# Line 500  sub _get_next_token ($) { Line 539  sub _get_next_token ($) {
539    
540          ## reconsume          ## reconsume
541          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
542            $self->{s_kwd} = '';
543          return  ({type => CHARACTER_TOKEN, data => '<',          return  ({type => CHARACTER_TOKEN, data => '<',
544                    line => $self->{line_prev},                    line => $self->{line_prev},
545                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 541  sub _get_next_token ($) { Line 581  sub _get_next_token ($) {
581                        
582            $self->{ct}            $self->{ct}
583              = {type => START_TAG_TOKEN,              = {type => START_TAG_TOKEN,
584                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
585                 line => $self->{line_prev},                 line => $self->{line_prev},
586                 column => $self->{column_prev}};                 column => $self->{column_prev}};
587            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
# Line 583  sub _get_next_token ($) { Line 623  sub _get_next_token ($) {
623                            line => $self->{line_prev},                            line => $self->{line_prev},
624                            column => $self->{column_prev});                            column => $self->{column_prev});
625            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
626              $self->{s_kwd} = '';
627                        
628      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
629        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 602  sub _get_next_token ($) { Line 643  sub _get_next_token ($) {
643    
644            redo A;            redo A;
645          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
646                        if ($self->{is_xml}) {
647            $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',              
648                            line => $self->{line_prev},              $self->{state} = PI_STATE;
649                            column => $self->{column_prev});              
650            $self->{state} = BOGUS_COMMENT_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
651            $self->{ct} = {type => COMMENT_TOKEN, data => '',        $self->{line_prev} = $self->{line};
652                                      line => $self->{line_prev},        $self->{column_prev} = $self->{column};
653                                      column => $self->{column_prev},        $self->{column}++;
654                                     };        $self->{nc}
655            ## $self->{nc} is intentionally left as is            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
656            redo A;      } else {
657          } else {        $self->{set_nc}->($self);
658        }
659      
660                redo A;
661              } else {
662                
663                $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
664                                line => $self->{line_prev},
665                                column => $self->{column_prev});
666                $self->{state} = BOGUS_COMMENT_STATE;
667                $self->{ct} = {type => COMMENT_TOKEN, data => '',
668                               line => $self->{line_prev},
669                               column => $self->{column_prev},
670                              };
671                ## $self->{nc} is intentionally left as is
672                redo A;
673              }
674            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
675                        
676            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
677                            line => $self->{line_prev},                            line => $self->{line_prev},
678                            column => $self->{column_prev});                            column => $self->{column_prev});
679            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
680              $self->{s_kwd} = '';
681            ## reconsume            ## reconsume
682    
683            return  ({type => CHARACTER_TOKEN, data => '<',            return  ({type => CHARACTER_TOKEN, data => '<',
# Line 627  sub _get_next_token ($) { Line 686  sub _get_next_token ($) {
686                     });                     });
687    
688            redo A;            redo A;
689            } else {
690              ## XML5: "<:" is a parse error.
691              
692              $self->{ct} = {type => START_TAG_TOKEN,
693                                        tag_name => chr ($self->{nc}),
694                                        line => $self->{line_prev},
695                                        column => $self->{column_prev}};
696              $self->{state} = TAG_NAME_STATE;
697              
698        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
699          $self->{line_prev} = $self->{line};
700          $self->{column_prev} = $self->{column};
701          $self->{column}++;
702          $self->{nc}
703              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
704        } else {
705          $self->{set_nc}->($self);
706        }
707      
708              redo A;
709          }          }
710        } else {        } else {
711          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 635  sub _get_next_token ($) { Line 714  sub _get_next_token ($) {
714        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
715        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
716    
717          ## XML5: "end tag state".
718    
719        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
720        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
721          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
# Line 647  sub _get_next_token ($) { Line 728  sub _get_next_token ($) {
728            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
729                        
730            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
731              $self->{s_kwd} = '';
732            ## Reconsume.            ## Reconsume.
733            return  ({type => CHARACTER_TOKEN, data => '</',            return  ({type => CHARACTER_TOKEN, data => '</',
734                      line => $l, column => $c,                      line => $l, column => $c,
# Line 660  sub _get_next_token ($) { Line 742  sub _get_next_token ($) {
742                    
743          $self->{ct}          $self->{ct}
744              = {type => END_TAG_TOKEN,              = {type => END_TAG_TOKEN,
745                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
746                 line => $l, column => $c};                 line => $l, column => $c};
747          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
748                    
# Line 695  sub _get_next_token ($) { Line 777  sub _get_next_token ($) {
777        
778          redo A;          redo A;
779        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
780          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
781                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
782                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
783          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
784                    $self->{s_kwd} = '';
785            if ($self->{is_xml}) {
786              
787              ## XML5: No parse error.
788              
789              ## NOTE: This parser raises a parse error, since it supports
790              ## XML1, not XML5.
791    
792              ## NOTE: A short end tag token.
793              my $ct = {type => END_TAG_TOKEN,
794                        tag_name => '',
795                        line => $self->{line_prev},
796                        column => $self->{column_prev} - 1,
797                       };
798              
799      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
800        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
801        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 711  sub _get_next_token ($) { Line 806  sub _get_next_token ($) {
806        $self->{set_nc}->($self);        $self->{set_nc}->($self);
807      }      }
808        
809              return  ($ct);
810            } else {
811              
812              
813        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
814          $self->{line_prev} = $self->{line};
815          $self->{column_prev} = $self->{column};
816          $self->{column}++;
817          $self->{nc}
818              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
819        } else {
820          $self->{set_nc}->($self);
821        }
822      
823            }
824          redo A;          redo A;
825        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
826                    
827          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
828            $self->{s_kwd} = '';
829          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
830          # reconsume          # reconsume
831    
# Line 723  sub _get_next_token ($) { Line 834  sub _get_next_token ($) {
834                   });                   });
835    
836          redo A;          redo A;
837        } else {        } elsif (not $self->{is_xml} or
838                   $is_space->{$self->{nc}}) {
839                    
840          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
841                            line => $self->{line_prev}, # "<" of "</"
842                            column => $self->{column_prev} - 1);
843          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
844          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
845                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 738  sub _get_next_token ($) { Line 852  sub _get_next_token ($) {
852          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
853          ## "bogus comment state" entry.          ## "bogus comment state" entry.
854          redo A;          redo A;
855          } else {
856            ## XML5: "</:" is a parse error.
857            
858            $self->{ct} = {type => END_TAG_TOKEN,
859                           tag_name => chr ($self->{nc}),
860                           line => $l, column => $c};
861            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
862            
863        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
864          $self->{line_prev} = $self->{line};
865          $self->{column_prev} = $self->{column};
866          $self->{column}++;
867          $self->{nc}
868              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
869        } else {
870          $self->{set_nc}->($self);
871        }
872      
873            redo A;
874        }        }
875      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
876        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
# Line 764  sub _get_next_token ($) { Line 897  sub _get_next_token ($) {
897          } else {          } else {
898                        
899            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
900              $self->{s_kwd} = '';
901            ## Reconsume.            ## Reconsume.
902            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
903                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{s_kwd},
# Line 782  sub _get_next_token ($) { Line 916  sub _get_next_token ($) {
916                        
917            ## Reconsume.            ## Reconsume.
918            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
919              $self->{s_kwd} = '';
920            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
921                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{s_kwd},
922                      line => $self->{line_prev},                      line => $self->{line_prev},
# Line 833  sub _get_next_token ($) { Line 968  sub _get_next_token ($) {
968            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
969          }          }
970          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
971            $self->{s_kwd} = '';
972                    
973      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
974        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 851  sub _get_next_token ($) { Line 987  sub _get_next_token ($) {
987        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
988                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
989                    
990          $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);          $self->{ct}->{tag_name}
991                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
992            # start tag or end tag            # start tag or end tag
993          ## Stay in this state          ## Stay in this state
994                    
# Line 884  sub _get_next_token ($) { Line 1021  sub _get_next_token ($) {
1021            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1022          }          }
1023          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1024            $self->{s_kwd} = '';
1025          # reconsume          # reconsume
1026    
1027          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 923  sub _get_next_token ($) { Line 1061  sub _get_next_token ($) {
1061          redo A;          redo A;
1062        }        }
1063      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1064          ## XML5: "Tag attribute name before state".
1065    
1066        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1067                    
1068          ## Stay in the state          ## Stay in the state
# Line 954  sub _get_next_token ($) { Line 1094  sub _get_next_token ($) {
1094            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1095          }          }
1096          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1097            $self->{s_kwd} = '';
1098                    
1099      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1100        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 973  sub _get_next_token ($) { Line 1114  sub _get_next_token ($) {
1114                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1115                    
1116          $self->{ca}          $self->{ca}
1117              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1118                 value => '',                 value => '',
1119                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1120          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 1021  sub _get_next_token ($) { Line 1162  sub _get_next_token ($) {
1162            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1163          }          }
1164          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1165            $self->{s_kwd} = '';
1166          # reconsume          # reconsume
1167    
1168          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1033  sub _get_next_token ($) { Line 1175  sub _get_next_token ($) {
1175               0x003D => 1, # =               0x003D => 1, # =
1176              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1177                        
1178              ## XML5: Not a parse error.
1179            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1180          } else {          } else {
1181                        
1182              ## XML5: ":" raises a parse error and is ignored.
1183          }          }
1184          $self->{ca}          $self->{ca}
1185              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 1056  sub _get_next_token ($) { Line 1200  sub _get_next_token ($) {
1200          redo A;          redo A;
1201        }        }
1202      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1203          ## XML5: "Tag attribute name state".
1204    
1205        my $before_leave = sub {        my $before_leave = sub {
1206          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
1207              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 1066  sub _get_next_token ($) { Line 1212  sub _get_next_token ($) {
1212                        
1213            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
1214              = $self->{ca};              = $self->{ca};
1215              $self->{ca}->{index} = ++$self->{ct}->{last_index};
1216          }          }
1217        }; # $before_leave        }; # $before_leave
1218    
# Line 1102  sub _get_next_token ($) { Line 1249  sub _get_next_token ($) {
1249        
1250          redo A;          redo A;
1251        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1252            if ($self->{is_xml}) {
1253              
1254              ## XML5: Not a parse error.
1255              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1256            } else {
1257              
1258            }
1259    
1260          $before_leave->();          $before_leave->();
1261          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1262                        
# Line 1116  sub _get_next_token ($) { Line 1271  sub _get_next_token ($) {
1271            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1272          }          }
1273          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1274            $self->{s_kwd} = '';
1275                    
1276      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1277        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1134  sub _get_next_token ($) { Line 1290  sub _get_next_token ($) {
1290        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
1291                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1292                    
1293          $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);          $self->{ca}->{name}
1294                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1295          ## Stay in the state          ## Stay in the state
1296                    
1297      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1149  sub _get_next_token ($) { Line 1306  sub _get_next_token ($) {
1306        
1307          redo A;          redo A;
1308        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1309            if ($self->{is_xml}) {
1310              
1311              ## XML5: Not a parse error.
1312              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1313            } else {
1314              
1315            }
1316                    
1317          $before_leave->();          $before_leave->();
1318          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
# Line 1183  sub _get_next_token ($) { Line 1347  sub _get_next_token ($) {
1347            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1348          }          }
1349          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1350            $self->{s_kwd} = '';
1351          # reconsume          # reconsume
1352    
1353          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1192  sub _get_next_token ($) { Line 1357  sub _get_next_token ($) {
1357          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1358              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1359                        
1360              ## XML5: Not a parse error.
1361            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1362          } else {          } else {
1363                        
# Line 1212  sub _get_next_token ($) { Line 1378  sub _get_next_token ($) {
1378          redo A;          redo A;
1379        }        }
1380      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1381          ## XML5: "Tag attribute name after state".
1382          
1383        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1384                    
1385          ## Stay in the state          ## Stay in the state
# Line 1243  sub _get_next_token ($) { Line 1411  sub _get_next_token ($) {
1411        
1412          redo A;          redo A;
1413        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1414            if ($self->{is_xml}) {
1415              
1416              ## XML5: Not a parse error.
1417              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1418            } else {
1419              
1420            }
1421    
1422          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1423                        
1424            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1259  sub _get_next_token ($) { Line 1435  sub _get_next_token ($) {
1435            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1436          }          }
1437          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1438            $self->{s_kwd} = '';
1439                    
1440      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1441        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1278  sub _get_next_token ($) { Line 1455  sub _get_next_token ($) {
1455                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1456                    
1457          $self->{ca}          $self->{ca}
1458              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1459                 value => '',                 value => '',
1460                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1461          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 1295  sub _get_next_token ($) { Line 1472  sub _get_next_token ($) {
1472        
1473          redo A;          redo A;
1474        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1475            if ($self->{is_xml}) {
1476              
1477              ## XML5: Not a parse error.
1478              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1479            } else {
1480              
1481            }
1482                    
1483          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1484                    
# Line 1326  sub _get_next_token ($) { Line 1510  sub _get_next_token ($) {
1510          } else {          } else {
1511            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1512          }          }
1513            $self->{s_kwd} = '';
1514          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1515          # reconsume          # reconsume
1516    
# Line 1333  sub _get_next_token ($) { Line 1518  sub _get_next_token ($) {
1518    
1519          redo A;          redo A;
1520        } else {        } else {
1521            if ($self->{is_xml}) {
1522              
1523              ## XML5: Not a parse error.
1524              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1525            } else {
1526              
1527            }
1528    
1529          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1530              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1531                        
1532              ## XML5: Not a parse error.
1533            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1534          } else {          } else {
1535                        
# Line 1359  sub _get_next_token ($) { Line 1553  sub _get_next_token ($) {
1553          redo A;                  redo A;        
1554        }        }
1555      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1556          ## XML5: "Tag attribute value before state".
1557    
1558        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1559                    
1560          ## Stay in the state          ## Stay in the state
# Line 1427  sub _get_next_token ($) { Line 1623  sub _get_next_token ($) {
1623            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1624          }          }
1625          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1626            $self->{s_kwd} = '';
1627                    
1628      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1629        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1460  sub _get_next_token ($) { Line 1657  sub _get_next_token ($) {
1657            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1658          }          }
1659          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1660            $self->{s_kwd} = '';
1661          ## reconsume          ## reconsume
1662    
1663          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1468  sub _get_next_token ($) { Line 1666  sub _get_next_token ($) {
1666        } else {        } else {
1667          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1668                        
1669              ## XML5: Not a parse error.
1670            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1671            } elsif ($self->{is_xml}) {
1672              
1673              ## XML5: No parse error.
1674              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1675          } else {          } else {
1676                        
1677          }          }
# Line 1488  sub _get_next_token ($) { Line 1691  sub _get_next_token ($) {
1691          redo A;          redo A;
1692        }        }
1693      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1694          ## XML5: "Tag attribute value double quoted state".
1695          
1696        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1697                    
1698            ## XML5: "Tag attribute name before state".
1699          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1700                    
1701      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1505  sub _get_next_token ($) { Line 1711  sub _get_next_token ($) {
1711          redo A;          redo A;
1712        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1713                    
1714            ## XML5: Not defined yet.
1715    
1716          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1717          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1718          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1542  sub _get_next_token ($) { Line 1750  sub _get_next_token ($) {
1750            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1751          }          }
1752          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1753            $self->{s_kwd} = '';
1754          ## reconsume          ## reconsume
1755    
1756          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
1757    
1758          redo A;          redo A;
1759        } else {        } else {
1760                    if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1761              
1762              ## XML5: Not a parse error.
1763              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1764            } else {
1765              
1766            }
1767          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1768          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1769                                q["&],                                q["&<],
1770                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1771    
1772          ## Stay in the state          ## Stay in the state
# Line 1569  sub _get_next_token ($) { Line 1784  sub _get_next_token ($) {
1784          redo A;          redo A;
1785        }        }
1786      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1787          ## XML5: "Tag attribute value single quoted state".
1788    
1789        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1790                    
1791            ## XML5: "Before attribute name state" (sic).
1792          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1793                    
1794      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1586  sub _get_next_token ($) { Line 1804  sub _get_next_token ($) {
1804          redo A;          redo A;
1805        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1806                    
1807            ## XML5: Not defined yet.
1808    
1809          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1810          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1811          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1623  sub _get_next_token ($) { Line 1843  sub _get_next_token ($) {
1843            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1844          }          }
1845          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1846            $self->{s_kwd} = '';
1847          ## reconsume          ## reconsume
1848    
1849          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
1850    
1851          redo A;          redo A;
1852        } else {        } else {
1853                    if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1854              
1855              ## XML5: Not a parse error.
1856              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1857            } else {
1858              
1859            }
1860          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1861          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1862                                q['&],                                q['&<],
1863                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1864    
1865          ## Stay in the state          ## Stay in the state
# Line 1650  sub _get_next_token ($) { Line 1877  sub _get_next_token ($) {
1877          redo A;          redo A;
1878        }        }
1879      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1880          ## XML5: "Tag attribute value unquoted state".
1881    
1882        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1883                    
1884            ## XML5: "Tag attribute name before state".
1885          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1886                    
1887      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1667  sub _get_next_token ($) { Line 1897  sub _get_next_token ($) {
1897          redo A;          redo A;
1898        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1899                    
1900    
1901            ## XML5: Not defined yet.
1902    
1903          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1904          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1905          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1703  sub _get_next_token ($) { Line 1936  sub _get_next_token ($) {
1936            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1937          }          }
1938          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1939            $self->{s_kwd} = '';
1940                    
1941      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1942        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1736  sub _get_next_token ($) { Line 1970  sub _get_next_token ($) {
1970            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1971          }          }
1972          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1973            $self->{s_kwd} = '';
1974          ## reconsume          ## reconsume
1975    
1976          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1748  sub _get_next_token ($) { Line 1983  sub _get_next_token ($) {
1983               0x003D => 1, # =               0x003D => 1, # =
1984              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1985                        
1986              ## XML5: Not a parse error.
1987            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1988          } else {          } else {
1989                        
# Line 1804  sub _get_next_token ($) { Line 2040  sub _get_next_token ($) {
2040            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2041          }          }
2042          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2043            $self->{s_kwd} = '';
2044                    
2045      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2046        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1851  sub _get_next_token ($) { Line 2088  sub _get_next_token ($) {
2088            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2089          }          }
2090          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2091            $self->{s_kwd} = '';
2092          ## Reconsume.          ## Reconsume.
2093          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
2094          redo A;          redo A;
# Line 1862  sub _get_next_token ($) { Line 2100  sub _get_next_token ($) {
2100          redo A;          redo A;
2101        }        }
2102      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2103          ## XML5: "Empty tag state".
2104    
2105        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2106          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
2107                        
# Line 1881  sub _get_next_token ($) { Line 2121  sub _get_next_token ($) {
2121          }          }
2122    
2123          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2124            $self->{s_kwd} = '';
2125                    
2126      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2127        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1912  sub _get_next_token ($) { Line 2153  sub _get_next_token ($) {
2153          } else {          } else {
2154            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2155          }          }
2156            ## XML5: "Tag attribute name before state".
2157          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2158            $self->{s_kwd} = '';
2159          ## Reconsume.          ## Reconsume.
2160          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
2161          redo A;          redo A;
# Line 1933  sub _get_next_token ($) { Line 2176  sub _get_next_token ($) {
2176        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2177                    
2178          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2179            $self->{s_kwd} = '';
2180                    
2181      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2182        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1950  sub _get_next_token ($) { Line 2194  sub _get_next_token ($) {
2194        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2195                    
2196          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2197            $self->{s_kwd} = '';
2198          ## reconsume          ## reconsume
2199    
2200          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2051  sub _get_next_token ($) { Line 2296  sub _get_next_token ($) {
2296                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2297                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
2298                                   };                                   };
2299          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2300                    
2301      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2302        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2114  sub _get_next_token ($) { Line 2359  sub _get_next_token ($) {
2359        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{s_kwd}) == 6 and
2360                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2361                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2362                    if ($self->{s_kwd} ne 'DOCTYP') {
2363              
2364              ## XML5: case-sensitive.
2365              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2366                              text => 'DOCTYPE',
2367                              line => $self->{line_prev},
2368                              column => $self->{column_prev} - 5);
2369            } else {
2370              
2371            }
2372          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
2373          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
2374                                    quirks => 1,                                    quirks => 1,
# Line 2172  sub _get_next_token ($) { Line 2426  sub _get_next_token ($) {
2426          redo A;          redo A;
2427        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{s_kwd} eq '[CDATA' and
2428                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2429                    if ($self->{is_xml} and
2430                not $self->{tainted} and
2431                @{$self->{open_elements} or []} == 0) {
2432              
2433              $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2434                              line => $self->{line_prev},
2435                              column => $self->{column_prev} - 7);
2436              $self->{tainted} = 1;
2437            } else {
2438              
2439            }
2440    
2441          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
2442                                    data => '',                                    data => '',
2443                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 2224  sub _get_next_token ($) { Line 2489  sub _get_next_token ($) {
2489                    
2490          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2491          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2492            $self->{s_kwd} = '';
2493                    
2494      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2495        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2243  sub _get_next_token ($) { Line 2509  sub _get_next_token ($) {
2509                    
2510          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2511          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2512            $self->{s_kwd} = '';
2513          ## reconsume          ## reconsume
2514    
2515          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2286  sub _get_next_token ($) { Line 2553  sub _get_next_token ($) {
2553                    
2554          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2555          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2556            $self->{s_kwd} = '';
2557                    
2558      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2559        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2305  sub _get_next_token ($) { Line 2573  sub _get_next_token ($) {
2573                    
2574          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2575          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2576            $self->{s_kwd} = '';
2577          ## reconsume          ## reconsume
2578    
2579          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2348  sub _get_next_token ($) { Line 2617  sub _get_next_token ($) {
2617                    
2618          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2619          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2620            $self->{s_kwd} = '';
2621          ## reconsume          ## reconsume
2622    
2623          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2375  sub _get_next_token ($) { Line 2645  sub _get_next_token ($) {
2645          redo A;          redo A;
2646        }        }
2647      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2648          ## XML5: "comment dash state".
2649    
2650        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2651                    
2652          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 2393  sub _get_next_token ($) { Line 2665  sub _get_next_token ($) {
2665        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2666                    
2667          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2668            $self->{s_kwd} = '';
2669          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2670            $self->{s_kwd} = '';
2671          ## reconsume          ## reconsume
2672    
2673          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2420  sub _get_next_token ($) { Line 2694  sub _get_next_token ($) {
2694        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2695                    
2696          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2697            $self->{s_kwd} = '';
2698                    
2699      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2700        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2437  sub _get_next_token ($) { Line 2712  sub _get_next_token ($) {
2712          redo A;          redo A;
2713        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2714                    
2715            ## XML5: Not a parse error.
2716          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2717                          line => $self->{line_prev},                          line => $self->{line_prev},
2718                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2458  sub _get_next_token ($) { Line 2734  sub _get_next_token ($) {
2734                    
2735          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2736          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2737            $self->{s_kwd} = '';
2738          ## reconsume          ## reconsume
2739    
2740          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2465  sub _get_next_token ($) { Line 2742  sub _get_next_token ($) {
2742          redo A;          redo A;
2743        } else {        } else {
2744                    
2745            ## XML5: Not a parse error.
2746          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2747                          line => $self->{line_prev},                          line => $self->{line_prev},
2748                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2526  sub _get_next_token ($) { Line 2804  sub _get_next_token ($) {
2804                    
2805          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2806          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2807            $self->{s_kwd} = '';
2808                    
2809      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2810        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2545  sub _get_next_token ($) { Line 2824  sub _get_next_token ($) {
2824                    
2825          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2826          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2827            $self->{s_kwd} = '';
2828          ## reconsume          ## reconsume
2829    
2830          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
# Line 2588  sub _get_next_token ($) { Line 2868  sub _get_next_token ($) {
2868        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2869                    
2870          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2871            $self->{s_kwd} = '';
2872                    
2873      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2874        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2607  sub _get_next_token ($) { Line 2888  sub _get_next_token ($) {
2888                    
2889          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2890          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2891            $self->{s_kwd} = '';
2892          ## reconsume          ## reconsume
2893    
2894          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2650  sub _get_next_token ($) { Line 2932  sub _get_next_token ($) {
2932        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2933                    
2934          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2935            $self->{s_kwd} = '';
2936                    
2937      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2938        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2669  sub _get_next_token ($) { Line 2952  sub _get_next_token ($) {
2952                    
2953          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2954          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2955            $self->{s_kwd} = '';
2956          ## reconsume          ## reconsume
2957    
2958          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2897  sub _get_next_token ($) { Line 3181  sub _get_next_token ($) {
3181          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3182    
3183          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3184            $self->{s_kwd} = '';
3185                    
3186      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3187        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2918  sub _get_next_token ($) { Line 3203  sub _get_next_token ($) {
3203          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3204    
3205          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3206            $self->{s_kwd} = '';
3207          ## reconsume          ## reconsume
3208    
3209          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2964  sub _get_next_token ($) { Line 3250  sub _get_next_token ($) {
3250          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3251    
3252          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3253            $self->{s_kwd} = '';
3254                    
3255      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3256        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2985  sub _get_next_token ($) { Line 3272  sub _get_next_token ($) {
3272          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3273    
3274          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3275            $self->{s_kwd} = '';
3276          ## reconsume          ## reconsume
3277    
3278          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3033  sub _get_next_token ($) { Line 3321  sub _get_next_token ($) {
3321          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3322    
3323          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3324            $self->{s_kwd} = '';
3325                    
3326      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3327        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3054  sub _get_next_token ($) { Line 3343  sub _get_next_token ($) {
3343          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3344    
3345          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3346            $self->{s_kwd} = '';
3347          ## reconsume          ## reconsume
3348    
3349          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3132  sub _get_next_token ($) { Line 3422  sub _get_next_token ($) {
3422        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3423                    
3424          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3425            $self->{s_kwd} = '';
3426                    
3427      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3428        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3152  sub _get_next_token ($) { Line 3443  sub _get_next_token ($) {
3443          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3444    
3445          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3446            $self->{s_kwd} = '';
3447          ## reconsume          ## reconsume
3448    
3449          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3229  sub _get_next_token ($) { Line 3521  sub _get_next_token ($) {
3521                    
3522          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3523          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3524            $self->{s_kwd} = '';
3525                    
3526      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3527        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3250  sub _get_next_token ($) { Line 3543  sub _get_next_token ($) {
3543          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3544    
3545          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3546            $self->{s_kwd} = '';
3547          ## reconsume          ## reconsume
3548    
3549          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3296  sub _get_next_token ($) { Line 3590  sub _get_next_token ($) {
3590          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3591    
3592          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3593            $self->{s_kwd} = '';
3594                    
3595      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3596        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3317  sub _get_next_token ($) { Line 3612  sub _get_next_token ($) {
3612          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3613    
3614          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3615            $self->{s_kwd} = '';
3616          ## reconsume          ## reconsume
3617    
3618          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3365  sub _get_next_token ($) { Line 3661  sub _get_next_token ($) {
3661          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3662    
3663          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3664            $self->{s_kwd} = '';
3665                    
3666      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3667        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3386  sub _get_next_token ($) { Line 3683  sub _get_next_token ($) {
3683          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3684    
3685          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3686            $self->{s_kwd} = '';
3687          ## reconsume          ## reconsume
3688    
3689          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3432  sub _get_next_token ($) { Line 3730  sub _get_next_token ($) {
3730        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3731                    
3732          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3733            $self->{s_kwd} = '';
3734                    
3735      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3736        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3451  sub _get_next_token ($) { Line 3750  sub _get_next_token ($) {
3750                    
3751          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3752          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3753            $self->{s_kwd} = '';
3754          ## reconsume          ## reconsume
3755    
3756          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3480  sub _get_next_token ($) { Line 3780  sub _get_next_token ($) {
3780        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3781                    
3782          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3783            $self->{s_kwd} = '';
3784                    
3785      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3786        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3498  sub _get_next_token ($) { Line 3799  sub _get_next_token ($) {
3799        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3800                    
3801          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3802            $self->{s_kwd} = '';
3803          ## reconsume          ## reconsume
3804    
3805          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
# Line 3526  sub _get_next_token ($) { Line 3828  sub _get_next_token ($) {
3828        ## NOTE: "CDATA section state" in the spec is jointly implemented        ## NOTE: "CDATA section state" in the spec is jointly implemented
3829        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3830        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
3831    
3832          ## XML5: "CDATA state".
3833                
3834        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
3835                    
# Line 3543  sub _get_next_token ($) { Line 3847  sub _get_next_token ($) {
3847        
3848          redo A;          redo A;
3849        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3850            if ($self->{is_xml}) {
3851              
3852              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
3853            } else {
3854              
3855            }
3856    
3857          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3858                    $self->{s_kwd} = '';
3859      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {          ## Reconsume.
       $self->{line_prev} = $self->{line};  
       $self->{column_prev} = $self->{column};  
       $self->{column}++;  
       $self->{nc}  
           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);  
     } else {  
       $self->{set_nc}->($self);  
     }  
     
3860          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
3861                        
3862            return  ($self->{ct}); # character            return  ($self->{ct}); # character
# Line 3587  sub _get_next_token ($) { Line 3889  sub _get_next_token ($) {
3889    
3890        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
3891      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3892          ## XML5: "CDATA bracket state".
3893    
3894        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
3895                    
3896          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 3604  sub _get_next_token ($) { Line 3908  sub _get_next_token ($) {
3908          redo A;          redo A;
3909        } else {        } else {
3910                    
3911            ## XML5: If EOF, "]" is not appended and the state is changed to the data state.
3912          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
3913          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3914          ## Reconsume.          ## Reconsume.
3915          redo A;          redo A;
3916        }        }
3917      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3918          ## XML5: "CDATA end state".
3919    
3920        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3921          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3922            $self->{s_kwd} = '';
3923                    
3924      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3925        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3651  sub _get_next_token ($) { Line 3959  sub _get_next_token ($) {
3959                    
3960          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
3961          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
3962          ## Reconsume.          ## Reconsume. ## XML5: Emit.
3963          redo A;          redo A;
3964        }        }
3965      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 3719  sub _get_next_token ($) { Line 4027  sub _get_next_token ($) {
4027        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4028                    
4029          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4030            $self->{s_kwd} = '';
4031          ## Reconsume.          ## Reconsume.
4032          return  ({type => CHARACTER_TOKEN, data => '&',          return  ({type => CHARACTER_TOKEN, data => '&',
4033                    line => $self->{line_prev},                    line => $self->{line_prev},
# Line 3729  sub _get_next_token ($) { Line 4038  sub _get_next_token ($) {
4038                    
4039          $self->{ca}->{value} .= '&';          $self->{ca}->{value} .= '&';
4040          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4041            $self->{s_kwd} = '';
4042          ## Reconsume.          ## Reconsume.
4043          redo A;          redo A;
4044        }        }
# Line 3779  sub _get_next_token ($) { Line 4089  sub _get_next_token ($) {
4089          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
4090                        
4091            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4092              $self->{s_kwd} = '';
4093            ## Reconsume.            ## Reconsume.
4094            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4095                      data => '&#',                      data => '&#',
# Line 3790  sub _get_next_token ($) { Line 4101  sub _get_next_token ($) {
4101                        
4102            $self->{ca}->{value} .= '&#';            $self->{ca}->{value} .= '&#';
4103            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4104              $self->{s_kwd} = '';
4105            ## Reconsume.            ## Reconsume.
4106            redo A;            redo A;
4107          }          }
# Line 3855  sub _get_next_token ($) { Line 4167  sub _get_next_token ($) {
4167        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4168                    
4169          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4170            $self->{s_kwd} = '';
4171          ## Reconsume.          ## Reconsume.
4172          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4173                      has_reference => 1,
4174                    line => $l, column => $c,                    line => $l, column => $c,
4175                   });                   });
4176          redo A;          redo A;
# Line 3865  sub _get_next_token ($) { Line 4179  sub _get_next_token ($) {
4179          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
4180          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
4181          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4182            $self->{s_kwd} = '';
4183          ## Reconsume.          ## Reconsume.
4184          redo A;          redo A;
4185        }        }
# Line 3890  sub _get_next_token ($) { Line 4205  sub _get_next_token ($) {
4205          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
4206                        
4207            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4208              $self->{s_kwd} = '';
4209            ## Reconsume.            ## Reconsume.
4210            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4211                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{s_kwd},
# Line 3901  sub _get_next_token ($) { Line 4217  sub _get_next_token ($) {
4217                        
4218            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{s_kwd};
4219            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4220              $self->{s_kwd} = '';
4221            ## Reconsume.            ## Reconsume.
4222            redo A;            redo A;
4223          }          }
# Line 4003  sub _get_next_token ($) { Line 4320  sub _get_next_token ($) {
4320        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4321                    
4322          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4323            $self->{s_kwd} = '';
4324          ## Reconsume.          ## Reconsume.
4325          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4326                      has_reference => 1,
4327                    line => $l, column => $c,                    line => $l, column => $c,
4328                   });                   });
4329          redo A;          redo A;
# Line 4013  sub _get_next_token ($) { Line 4332  sub _get_next_token ($) {
4332          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
4333          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
4334          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4335            $self->{s_kwd} = '';
4336          ## Reconsume.          ## Reconsume.
4337          redo A;          redo A;
4338        }        }
# Line 4125  sub _get_next_token ($) { Line 4445  sub _get_next_token ($) {
4445        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4446                    
4447          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4448            $self->{s_kwd} = '';
4449          ## Reconsume.          ## Reconsume.
4450          return  ({type => CHARACTER_TOKEN,          return  ({type => CHARACTER_TOKEN,
4451                    data => $data,                    data => $data,
4452                      has_reference => $has_ref,
4453                    line => $self->{line_prev},                    line => $self->{line_prev},
4454                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{s_kwd},
4455                   });                   });
# Line 4137  sub _get_next_token ($) { Line 4459  sub _get_next_token ($) {
4459          $self->{ca}->{value} .= $data;          $self->{ca}->{value} .= $data;
4460          $self->{ca}->{has_reference} = 1 if $has_ref;          $self->{ca}->{has_reference} = 1 if $has_ref;
4461          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4462            $self->{s_kwd} = '';
4463            ## Reconsume.
4464            redo A;
4465          }
4466    
4467        ## XML-only states
4468    
4469        } elsif ($self->{state} == PI_STATE) {
4470          if ($is_space->{$self->{nc}} or
4471              $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
4472              $self->{nc} == -1) {
4473            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4474                            line => $self->{line_prev},
4475                            column => $self->{column_prev}
4476                                - 1 * ($self->{nc} != -1));
4477            $self->{state} = BOGUS_COMMENT_STATE;
4478            ## Reconsume.
4479            $self->{ct} = {type => COMMENT_TOKEN,
4480                           data => '?',
4481                           line => $self->{line_prev},
4482                           column => $self->{column_prev}
4483                               - 1 * ($self->{nc} != -1),
4484                          };
4485            redo A;
4486          } else {
4487            $self->{ct} = {type => PI_TOKEN,
4488                           target => chr $self->{nc},
4489                           data => '',
4490                           line => $self->{line_prev},
4491                           column => $self->{column_prev} - 1,
4492                          };
4493            $self->{state} = PI_TARGET_STATE;
4494            
4495        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4496          $self->{line_prev} = $self->{line};
4497          $self->{column_prev} = $self->{column};
4498          $self->{column}++;
4499          $self->{nc}
4500              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4501        } else {
4502          $self->{set_nc}->($self);
4503        }
4504      
4505            redo A;
4506          }
4507        } elsif ($self->{state} == PI_TARGET_STATE) {
4508          if ($is_space->{$self->{nc}}) {
4509            $self->{state} = PI_TARGET_AFTER_STATE;
4510            
4511        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4512          $self->{line_prev} = $self->{line};
4513          $self->{column_prev} = $self->{column};
4514          $self->{column}++;
4515          $self->{nc}
4516              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4517        } else {
4518          $self->{set_nc}->($self);
4519        }
4520      
4521            redo A;
4522          } elsif ($self->{nc} == -1) {
4523            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4524            $self->{state} = DATA_STATE;
4525            $self->{s_kwd} = '';
4526          ## Reconsume.          ## Reconsume.
4527            return  ($self->{ct}); # pi
4528            redo A;
4529          } elsif ($self->{nc} == 0x003F) { # ?
4530            $self->{state} = PI_AFTER_STATE;
4531            
4532        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4533          $self->{line_prev} = $self->{line};
4534          $self->{column_prev} = $self->{column};
4535          $self->{column}++;
4536          $self->{nc}
4537              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4538        } else {
4539          $self->{set_nc}->($self);
4540        }
4541      
4542            redo A;
4543          } else {
4544            ## XML5: typo ("tag name" -> "target")
4545            $self->{ct}->{target} .= chr $self->{nc}; # pi
4546            
4547        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4548          $self->{line_prev} = $self->{line};
4549          $self->{column_prev} = $self->{column};
4550          $self->{column}++;
4551          $self->{nc}
4552              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4553        } else {
4554          $self->{set_nc}->($self);
4555        }
4556      
4557          redo A;          redo A;
4558        }        }
4559        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
4560          if ($is_space->{$self->{nc}}) {
4561            ## Stay in the state.
4562            
4563        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4564          $self->{line_prev} = $self->{line};
4565          $self->{column_prev} = $self->{column};
4566          $self->{column}++;
4567          $self->{nc}
4568              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4569        } else {
4570          $self->{set_nc}->($self);
4571        }
4572      
4573            redo A;
4574          } else {
4575            $self->{state} = PI_DATA_STATE;
4576            ## Reprocess.
4577            redo A;
4578          }
4579        } elsif ($self->{state} == PI_DATA_STATE) {
4580          if ($self->{nc} == 0x003F) { # ?
4581            $self->{state} = PI_DATA_AFTER_STATE;
4582            
4583        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4584          $self->{line_prev} = $self->{line};
4585          $self->{column_prev} = $self->{column};
4586          $self->{column}++;
4587          $self->{nc}
4588              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4589        } else {
4590          $self->{set_nc}->($self);
4591        }
4592      
4593            redo A;
4594          } elsif ($self->{nc} == -1) {
4595            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4596            $self->{state} = DATA_STATE;
4597            $self->{s_kwd} = '';
4598            ## Reprocess.
4599            return  ($self->{ct}); # pi
4600            redo A;
4601          } else {
4602            $self->{ct}->{data} .= chr $self->{nc}; # pi
4603            $self->{read_until}->($self->{ct}->{data}, q[?],
4604                                  length $self->{ct}->{data});
4605            ## Stay in the state.
4606            
4607        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4608          $self->{line_prev} = $self->{line};
4609          $self->{column_prev} = $self->{column};
4610          $self->{column}++;
4611          $self->{nc}
4612              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4613        } else {
4614          $self->{set_nc}->($self);
4615        }
4616      
4617            ## Reprocess.
4618            redo A;
4619          }
4620        } elsif ($self->{state} == PI_AFTER_STATE) {
4621          if ($self->{nc} == 0x003E) { # >
4622            $self->{state} = DATA_STATE;
4623            $self->{s_kwd} = '';
4624            
4625        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4626          $self->{line_prev} = $self->{line};
4627          $self->{column_prev} = $self->{column};
4628          $self->{column}++;
4629          $self->{nc}
4630              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4631        } else {
4632          $self->{set_nc}->($self);
4633        }
4634      
4635            return  ($self->{ct}); # pi
4636            redo A;
4637          } elsif ($self->{nc} == 0x003F) { # ?
4638            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4639                            line => $self->{line_prev},
4640                            column => $self->{column_prev}); ## XML5: no error
4641            $self->{ct}->{data} .= '?';
4642            $self->{state} = PI_DATA_AFTER_STATE;
4643            
4644        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4645          $self->{line_prev} = $self->{line};
4646          $self->{column_prev} = $self->{column};
4647          $self->{column}++;
4648          $self->{nc}
4649              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4650        } else {
4651          $self->{set_nc}->($self);
4652        }
4653      
4654            redo A;
4655          } else {
4656            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4657                            line => $self->{line_prev},
4658                            column => $self->{column_prev}
4659                                + 1 * ($self->{nc} == -1)); ## XML5: no error
4660            $self->{ct}->{data} .= '?'; ## XML5: not appended
4661            $self->{state} = PI_DATA_STATE;
4662            ## Reprocess.
4663            redo A;
4664          }
4665        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4666          ## XML5: Same as "pi after state" in XML5
4667          if ($self->{nc} == 0x003E) { # >
4668            $self->{state} = DATA_STATE;
4669            $self->{s_kwd} = '';
4670            
4671        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4672          $self->{line_prev} = $self->{line};
4673          $self->{column_prev} = $self->{column};
4674          $self->{column}++;
4675          $self->{nc}
4676              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4677        } else {
4678          $self->{set_nc}->($self);
4679        }
4680      
4681            return  ($self->{ct}); # pi
4682            redo A;
4683          } elsif ($self->{nc} == 0x003F) { # ?
4684            $self->{ct}->{data} .= '?';
4685            ## Stay in the state.
4686            
4687        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4688          $self->{line_prev} = $self->{line};
4689          $self->{column_prev} = $self->{column};
4690          $self->{column}++;
4691          $self->{nc}
4692              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4693        } else {
4694          $self->{set_nc}->($self);
4695        }
4696      
4697            redo A;
4698          } else {
4699            $self->{ct}->{data} .= '?'; ## XML5: not appended
4700            $self->{state} = PI_DATA_STATE;
4701            ## Reprocess.
4702            redo A;
4703          }
4704            
4705      } else {      } else {
4706        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
4707      }      }

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.11

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24