/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.3 by wakaba, Tue Oct 14 05:34:05 2008 UTC revision 1.9 by wakaba, Wed Oct 15 08:05:47 2008 UTC
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 114  sub HEXREF_HEX_STATE () { 48 }
114  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
115  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117    ## XML states
118    sub PI_STATE () { 51 }
119    sub PI_TARGET_STATE () { 52 }
120    sub PI_TARGET_AFTER_STATE () { 53 }
121    sub PI_DATA_STATE () { 54 }
122    sub PI_AFTER_STATE () { 55 }
123    sub PI_DATA_AFTER_STATE () { 56 }
124    
125  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
126  ## list and descriptions)  ## list and descriptions)
127    
# Line 178  sub _initialize_tokenizer ($) { Line 186  sub _initialize_tokenizer ($) {
186    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
187    
188    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
189    #$self->{s_kwd}; # state keyword - initialized when used    $self->{s_kwd} = ''; # state keyword
190    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
191    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
192    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 219  sub _initialize_tokenizer ($) { Line 227  sub _initialize_tokenizer ($) {
227  ##        ->{value}  ##        ->{value}
228  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
229  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
230    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
231  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
232  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
233  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 362  sub _get_next_token ($) { Line 371  sub _get_next_token ($) {
371          }          }
372        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
373          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
374            $self->{s_kwd} .= '-';            if ($self->{s_kwd} eq '<!-') {
             
           if ($self->{s_kwd} eq '<!--') {  
375                            
376              $self->{escape} = 1; # unless $self->{escape};              $self->{escape} = 1; # unless $self->{escape};
377              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
378              #              #
379            } elsif ($self->{s_kwd} eq '---') {            } elsif ($self->{s_kwd} eq '-') {
380                            
381              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
382              #              #
383              } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
384                
385                $self->{s_kwd} .= '-';
386                #
387            } else {            } else {
388                            
389                $self->{s_kwd} = '-';
390              #              #
391            }            }
392          }          }
# Line 420  sub _get_next_token ($) { Line 432  sub _get_next_token ($) {
432            if ($self->{s_kwd} eq '--') {            if ($self->{s_kwd} eq '--') {
433                            
434              delete $self->{escape};              delete $self->{escape};
435                #
436            } else {            } else {
437                            
438                #
439            }            }
440            } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
441              
442              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
443                              line => $self->{line_prev},
444                              column => $self->{column_prev} - 1);
445              #
446          } else {          } else {
447                        
448              #
449          }          }
450                    
451          $self->{s_kwd} = '';          $self->{s_kwd} = '';
452          #          #
453          } elsif ($self->{nc} == 0x005D) { # ]
454            if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
455              
456              $self->{s_kwd} .= ']';
457            } elsif ($self->{s_kwd} eq ']]') {
458              
459              #
460            } else {
461              
462              $self->{s_kwd} = '';
463            }
464            #
465        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
466                    
467          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 446  sub _get_next_token ($) { Line 479  sub _get_next_token ($) {
479                     data => chr $self->{nc},                     data => chr $self->{nc},
480                     line => $self->{line}, column => $self->{column},                     line => $self->{line}, column => $self->{column},
481                    };                    };
482        if ($self->{read_until}->($token->{data}, q[-!<>&],        if ($self->{read_until}->($token->{data}, q{-!<>&\]},
483                                  length $token->{data})) {                                  length $token->{data})) {
484          $self->{s_kwd} = '';          $self->{s_kwd} = '';
485        }        }
486    
487        ## Stay in the data state.        ## Stay in the data state.
488        if ($self->{content_model} == PCDATA_CONTENT_MODEL) {        if (not $self->{is_xml} and
489              $self->{content_model} == PCDATA_CONTENT_MODEL) {
490                    
491          $self->{state} = PCDATA_STATE;          $self->{state} = PCDATA_STATE;
492        } else {        } else {
# Line 500  sub _get_next_token ($) { Line 534  sub _get_next_token ($) {
534    
535          ## reconsume          ## reconsume
536          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
537            $self->{s_kwd} = '';
538          return  ({type => CHARACTER_TOKEN, data => '<',          return  ({type => CHARACTER_TOKEN, data => '<',
539                    line => $self->{line_prev},                    line => $self->{line_prev},
540                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 541  sub _get_next_token ($) { Line 576  sub _get_next_token ($) {
576                        
577            $self->{ct}            $self->{ct}
578              = {type => START_TAG_TOKEN,              = {type => START_TAG_TOKEN,
579                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
580                 line => $self->{line_prev},                 line => $self->{line_prev},
581                 column => $self->{column_prev}};                 column => $self->{column_prev}};
582            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
# Line 583  sub _get_next_token ($) { Line 618  sub _get_next_token ($) {
618                            line => $self->{line_prev},                            line => $self->{line_prev},
619                            column => $self->{column_prev});                            column => $self->{column_prev});
620            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
621              $self->{s_kwd} = '';
622                        
623      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
624        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 602  sub _get_next_token ($) { Line 638  sub _get_next_token ($) {
638    
639            redo A;            redo A;
640          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
641                        if ($self->{is_xml}) {
642            $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',              
643                            line => $self->{line_prev},              $self->{state} = PI_STATE;
644                            column => $self->{column_prev});              
645            $self->{state} = BOGUS_COMMENT_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
646            $self->{ct} = {type => COMMENT_TOKEN, data => '',        $self->{line_prev} = $self->{line};
647                                      line => $self->{line_prev},        $self->{column_prev} = $self->{column};
648                                      column => $self->{column_prev},        $self->{column}++;
649                                     };        $self->{nc}
650            ## $self->{nc} is intentionally left as is            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
651            redo A;      } else {
652          } else {        $self->{set_nc}->($self);
653        }
654      
655                redo A;
656              } else {
657                
658                $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
659                                line => $self->{line_prev},
660                                column => $self->{column_prev});
661                $self->{state} = BOGUS_COMMENT_STATE;
662                $self->{ct} = {type => COMMENT_TOKEN, data => '',
663                               line => $self->{line_prev},
664                               column => $self->{column_prev},
665                              };
666                ## $self->{nc} is intentionally left as is
667                redo A;
668              }
669            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
670                        
671            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
672                            line => $self->{line_prev},                            line => $self->{line_prev},
673                            column => $self->{column_prev});                            column => $self->{column_prev});
674            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
675              $self->{s_kwd} = '';
676            ## reconsume            ## reconsume
677    
678            return  ({type => CHARACTER_TOKEN, data => '<',            return  ({type => CHARACTER_TOKEN, data => '<',
# Line 627  sub _get_next_token ($) { Line 681  sub _get_next_token ($) {
681                     });                     });
682    
683            redo A;            redo A;
684            } else {
685              ## XML5: "<:" is a parse error.
686              
687              $self->{ct} = {type => START_TAG_TOKEN,
688                                        tag_name => chr ($self->{nc}),
689                                        line => $self->{line_prev},
690                                        column => $self->{column_prev}};
691              $self->{state} = TAG_NAME_STATE;
692              
693        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
694          $self->{line_prev} = $self->{line};
695          $self->{column_prev} = $self->{column};
696          $self->{column}++;
697          $self->{nc}
698              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
699        } else {
700          $self->{set_nc}->($self);
701        }
702      
703              redo A;
704          }          }
705        } else {        } else {
706          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 647  sub _get_next_token ($) { Line 721  sub _get_next_token ($) {
721            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
722                        
723            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
724              $self->{s_kwd} = '';
725            ## Reconsume.            ## Reconsume.
726            return  ({type => CHARACTER_TOKEN, data => '</',            return  ({type => CHARACTER_TOKEN, data => '</',
727                      line => $l, column => $c,                      line => $l, column => $c,
# Line 660  sub _get_next_token ($) { Line 735  sub _get_next_token ($) {
735                    
736          $self->{ct}          $self->{ct}
737              = {type => END_TAG_TOKEN,              = {type => END_TAG_TOKEN,
738                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
739                 line => $l, column => $c};                 line => $l, column => $c};
740          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
741                    
# Line 700  sub _get_next_token ($) { Line 775  sub _get_next_token ($) {
775                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
776                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
777          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
778            $self->{s_kwd} = '';
779                    
780      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
781        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 715  sub _get_next_token ($) { Line 791  sub _get_next_token ($) {
791        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
792                    
793          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
794            $self->{s_kwd} = '';
795          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
796          # reconsume          # reconsume
797    
# Line 764  sub _get_next_token ($) { Line 841  sub _get_next_token ($) {
841          } else {          } else {
842                        
843            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
844              $self->{s_kwd} = '';
845            ## Reconsume.            ## Reconsume.
846            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
847                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{s_kwd},
# Line 782  sub _get_next_token ($) { Line 860  sub _get_next_token ($) {
860                        
861            ## Reconsume.            ## Reconsume.
862            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
863              $self->{s_kwd} = '';
864            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
865                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{s_kwd},
866                      line => $self->{line_prev},                      line => $self->{line_prev},
# Line 833  sub _get_next_token ($) { Line 912  sub _get_next_token ($) {
912            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
913          }          }
914          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
915            $self->{s_kwd} = '';
916                    
917      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
918        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 851  sub _get_next_token ($) { Line 931  sub _get_next_token ($) {
931        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
932                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
933                    
934          $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);          $self->{ct}->{tag_name}
935                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
936            # start tag or end tag            # start tag or end tag
937          ## Stay in this state          ## Stay in this state
938                    
# Line 884  sub _get_next_token ($) { Line 965  sub _get_next_token ($) {
965            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
966          }          }
967          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
968            $self->{s_kwd} = '';
969          # reconsume          # reconsume
970    
971          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 954  sub _get_next_token ($) { Line 1036  sub _get_next_token ($) {
1036            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1037          }          }
1038          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1039            $self->{s_kwd} = '';
1040                    
1041      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1042        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 973  sub _get_next_token ($) { Line 1056  sub _get_next_token ($) {
1056                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1057                    
1058          $self->{ca}          $self->{ca}
1059              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1060                 value => '',                 value => '',
1061                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1062          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 1021  sub _get_next_token ($) { Line 1104  sub _get_next_token ($) {
1104            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1105          }          }
1106          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1107            $self->{s_kwd} = '';
1108          # reconsume          # reconsume
1109    
1110          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1116  sub _get_next_token ($) { Line 1200  sub _get_next_token ($) {
1200            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1201          }          }
1202          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1203            $self->{s_kwd} = '';
1204                    
1205      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1206        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1134  sub _get_next_token ($) { Line 1219  sub _get_next_token ($) {
1219        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
1220                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1221                    
1222          $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);          $self->{ca}->{name}
1223                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1224          ## Stay in the state          ## Stay in the state
1225                    
1226      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1183  sub _get_next_token ($) { Line 1269  sub _get_next_token ($) {
1269            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1270          }          }
1271          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1272            $self->{s_kwd} = '';
1273          # reconsume          # reconsume
1274    
1275          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1259  sub _get_next_token ($) { Line 1346  sub _get_next_token ($) {
1346            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1347          }          }
1348          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1349            $self->{s_kwd} = '';
1350                    
1351      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1352        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1278  sub _get_next_token ($) { Line 1366  sub _get_next_token ($) {
1366                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1367                    
1368          $self->{ca}          $self->{ca}
1369              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1370                 value => '',                 value => '',
1371                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1372          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 1326  sub _get_next_token ($) { Line 1414  sub _get_next_token ($) {
1414          } else {          } else {
1415            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1416          }          }
1417            $self->{s_kwd} = '';
1418          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1419          # reconsume          # reconsume
1420    
# Line 1427  sub _get_next_token ($) { Line 1516  sub _get_next_token ($) {
1516            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1517          }          }
1518          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1519            $self->{s_kwd} = '';
1520                    
1521      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1522        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1460  sub _get_next_token ($) { Line 1550  sub _get_next_token ($) {
1550            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1551          }          }
1552          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1553            $self->{s_kwd} = '';
1554          ## reconsume          ## reconsume
1555    
1556          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1542  sub _get_next_token ($) { Line 1633  sub _get_next_token ($) {
1633            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1634          }          }
1635          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1636            $self->{s_kwd} = '';
1637          ## reconsume          ## reconsume
1638    
1639          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1623  sub _get_next_token ($) { Line 1715  sub _get_next_token ($) {
1715            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1716          }          }
1717          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1718            $self->{s_kwd} = '';
1719          ## reconsume          ## reconsume
1720    
1721          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1703  sub _get_next_token ($) { Line 1796  sub _get_next_token ($) {
1796            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1797          }          }
1798          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1799            $self->{s_kwd} = '';
1800                    
1801      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1802        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1736  sub _get_next_token ($) { Line 1830  sub _get_next_token ($) {
1830            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1831          }          }
1832          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1833            $self->{s_kwd} = '';
1834          ## reconsume          ## reconsume
1835    
1836          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1804  sub _get_next_token ($) { Line 1899  sub _get_next_token ($) {
1899            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1900          }          }
1901          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1902            $self->{s_kwd} = '';
1903                    
1904      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1905        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1851  sub _get_next_token ($) { Line 1947  sub _get_next_token ($) {
1947            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1948          }          }
1949          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1950            $self->{s_kwd} = '';
1951          ## Reconsume.          ## Reconsume.
1952          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
1953          redo A;          redo A;
# Line 1881  sub _get_next_token ($) { Line 1978  sub _get_next_token ($) {
1978          }          }
1979    
1980          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1981            $self->{s_kwd} = '';
1982                    
1983      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1984        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1913  sub _get_next_token ($) { Line 2011  sub _get_next_token ($) {
2011            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2012          }          }
2013          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2014            $self->{s_kwd} = '';
2015          ## Reconsume.          ## Reconsume.
2016          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
2017          redo A;          redo A;
# Line 1933  sub _get_next_token ($) { Line 2032  sub _get_next_token ($) {
2032        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2033                    
2034          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2035            $self->{s_kwd} = '';
2036                    
2037      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2038        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1950  sub _get_next_token ($) { Line 2050  sub _get_next_token ($) {
2050        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2051                    
2052          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2053            $self->{s_kwd} = '';
2054          ## reconsume          ## reconsume
2055    
2056          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2172  sub _get_next_token ($) { Line 2273  sub _get_next_token ($) {
2273          redo A;          redo A;
2274        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{s_kwd} eq '[CDATA' and
2275                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2276                    if ($self->{is_xml} and
2277                not $self->{tainted} and
2278                @{$self->{open_elements} or []} == 0) {
2279              
2280              $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2281                              line => $self->{line_prev},
2282                              column => $self->{column_prev} - 7);
2283              $self->{tainted} = 1;
2284            } else {
2285              
2286            }
2287    
2288          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
2289                                    data => '',                                    data => '',
2290                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 2224  sub _get_next_token ($) { Line 2336  sub _get_next_token ($) {
2336                    
2337          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2338          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2339            $self->{s_kwd} = '';
2340                    
2341      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2342        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2243  sub _get_next_token ($) { Line 2356  sub _get_next_token ($) {
2356                    
2357          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2358          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2359            $self->{s_kwd} = '';
2360          ## reconsume          ## reconsume
2361    
2362          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2286  sub _get_next_token ($) { Line 2400  sub _get_next_token ($) {
2400                    
2401          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2402          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2403            $self->{s_kwd} = '';
2404                    
2405      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2406        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2305  sub _get_next_token ($) { Line 2420  sub _get_next_token ($) {
2420                    
2421          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2422          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2423            $self->{s_kwd} = '';
2424          ## reconsume          ## reconsume
2425    
2426          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2348  sub _get_next_token ($) { Line 2464  sub _get_next_token ($) {
2464                    
2465          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2466          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2467            $self->{s_kwd} = '';
2468          ## reconsume          ## reconsume
2469    
2470          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2393  sub _get_next_token ($) { Line 2510  sub _get_next_token ($) {
2510        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2511                    
2512          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2513            $self->{s_kwd} = '';
2514          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2515            $self->{s_kwd} = '';
2516          ## reconsume          ## reconsume
2517    
2518          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2420  sub _get_next_token ($) { Line 2539  sub _get_next_token ($) {
2539        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2540                    
2541          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2542            $self->{s_kwd} = '';
2543                    
2544      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2545        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2458  sub _get_next_token ($) { Line 2578  sub _get_next_token ($) {
2578                    
2579          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2580          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2581            $self->{s_kwd} = '';
2582          ## reconsume          ## reconsume
2583    
2584          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2526  sub _get_next_token ($) { Line 2647  sub _get_next_token ($) {
2647                    
2648          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2649          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2650            $self->{s_kwd} = '';
2651                    
2652      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2653        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2545  sub _get_next_token ($) { Line 2667  sub _get_next_token ($) {
2667                    
2668          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2669          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2670            $self->{s_kwd} = '';
2671          ## reconsume          ## reconsume
2672    
2673          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
# Line 2588  sub _get_next_token ($) { Line 2711  sub _get_next_token ($) {
2711        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2712                    
2713          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2714            $self->{s_kwd} = '';
2715                    
2716      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2717        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2607  sub _get_next_token ($) { Line 2731  sub _get_next_token ($) {
2731                    
2732          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2733          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2734            $self->{s_kwd} = '';
2735          ## reconsume          ## reconsume
2736    
2737          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2650  sub _get_next_token ($) { Line 2775  sub _get_next_token ($) {
2775        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2776                    
2777          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2778            $self->{s_kwd} = '';
2779                    
2780      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2781        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2669  sub _get_next_token ($) { Line 2795  sub _get_next_token ($) {
2795                    
2796          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2797          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2798            $self->{s_kwd} = '';
2799          ## reconsume          ## reconsume
2800    
2801          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2897  sub _get_next_token ($) { Line 3024  sub _get_next_token ($) {
3024          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3025    
3026          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3027            $self->{s_kwd} = '';
3028                    
3029      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3030        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2918  sub _get_next_token ($) { Line 3046  sub _get_next_token ($) {
3046          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3047    
3048          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3049            $self->{s_kwd} = '';
3050          ## reconsume          ## reconsume
3051    
3052          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2964  sub _get_next_token ($) { Line 3093  sub _get_next_token ($) {
3093          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3094    
3095          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3096            $self->{s_kwd} = '';
3097                    
3098      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3099        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2985  sub _get_next_token ($) { Line 3115  sub _get_next_token ($) {
3115          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3116    
3117          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3118            $self->{s_kwd} = '';
3119          ## reconsume          ## reconsume
3120    
3121          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3033  sub _get_next_token ($) { Line 3164  sub _get_next_token ($) {
3164          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3165    
3166          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3167            $self->{s_kwd} = '';
3168                    
3169      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3170        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3054  sub _get_next_token ($) { Line 3186  sub _get_next_token ($) {
3186          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3187    
3188          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3189            $self->{s_kwd} = '';
3190          ## reconsume          ## reconsume
3191    
3192          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3132  sub _get_next_token ($) { Line 3265  sub _get_next_token ($) {
3265        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3266                    
3267          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3268            $self->{s_kwd} = '';
3269                    
3270      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3271        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3152  sub _get_next_token ($) { Line 3286  sub _get_next_token ($) {
3286          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3287    
3288          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3289            $self->{s_kwd} = '';
3290          ## reconsume          ## reconsume
3291    
3292          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3229  sub _get_next_token ($) { Line 3364  sub _get_next_token ($) {
3364                    
3365          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3366          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3367            $self->{s_kwd} = '';
3368                    
3369      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3370        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3250  sub _get_next_token ($) { Line 3386  sub _get_next_token ($) {
3386          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3387    
3388          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3389            $self->{s_kwd} = '';
3390          ## reconsume          ## reconsume
3391    
3392          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3296  sub _get_next_token ($) { Line 3433  sub _get_next_token ($) {
3433          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3434    
3435          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3436            $self->{s_kwd} = '';
3437                    
3438      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3439        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3317  sub _get_next_token ($) { Line 3455  sub _get_next_token ($) {
3455          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3456    
3457          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3458            $self->{s_kwd} = '';
3459          ## reconsume          ## reconsume
3460    
3461          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3365  sub _get_next_token ($) { Line 3504  sub _get_next_token ($) {
3504          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3505    
3506          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3507            $self->{s_kwd} = '';
3508                    
3509      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3510        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3386  sub _get_next_token ($) { Line 3526  sub _get_next_token ($) {
3526          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3527    
3528          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3529            $self->{s_kwd} = '';
3530          ## reconsume          ## reconsume
3531    
3532          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3432  sub _get_next_token ($) { Line 3573  sub _get_next_token ($) {
3573        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3574                    
3575          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3576            $self->{s_kwd} = '';
3577                    
3578      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3579        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3451  sub _get_next_token ($) { Line 3593  sub _get_next_token ($) {
3593                    
3594          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3595          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3596            $self->{s_kwd} = '';
3597          ## reconsume          ## reconsume
3598    
3599          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3480  sub _get_next_token ($) { Line 3623  sub _get_next_token ($) {
3623        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3624                    
3625          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3626            $self->{s_kwd} = '';
3627                    
3628      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3629        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3498  sub _get_next_token ($) { Line 3642  sub _get_next_token ($) {
3642        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3643                    
3644          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3645            $self->{s_kwd} = '';
3646          ## reconsume          ## reconsume
3647    
3648          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
# Line 3543  sub _get_next_token ($) { Line 3688  sub _get_next_token ($) {
3688        
3689          redo A;          redo A;
3690        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3691            if ($self->{is_xml}) {
3692              
3693              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
3694            } else {
3695              
3696            }
3697    
3698          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3699            $self->{s_kwd} = '';
3700                    
3701      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3702        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3612  sub _get_next_token ($) { Line 3765  sub _get_next_token ($) {
3765      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3766        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3767          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3768            $self->{s_kwd} = '';
3769                    
3770      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3771        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3719  sub _get_next_token ($) { Line 3873  sub _get_next_token ($) {
3873        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
3874                    
3875          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
3876            $self->{s_kwd} = '';
3877          ## Reconsume.          ## Reconsume.
3878          return  ({type => CHARACTER_TOKEN, data => '&',          return  ({type => CHARACTER_TOKEN, data => '&',
3879                    line => $self->{line_prev},                    line => $self->{line_prev},
# Line 3729  sub _get_next_token ($) { Line 3884  sub _get_next_token ($) {
3884                    
3885          $self->{ca}->{value} .= '&';          $self->{ca}->{value} .= '&';
3886          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
3887            $self->{s_kwd} = '';
3888          ## Reconsume.          ## Reconsume.
3889          redo A;          redo A;
3890        }        }
# Line 3779  sub _get_next_token ($) { Line 3935  sub _get_next_token ($) {
3935          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
3936                        
3937            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
3938              $self->{s_kwd} = '';
3939            ## Reconsume.            ## Reconsume.
3940            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
3941                      data => '&#',                      data => '&#',
# Line 3790  sub _get_next_token ($) { Line 3947  sub _get_next_token ($) {
3947                        
3948            $self->{ca}->{value} .= '&#';            $self->{ca}->{value} .= '&#';
3949            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
3950              $self->{s_kwd} = '';
3951            ## Reconsume.            ## Reconsume.
3952            redo A;            redo A;
3953          }          }
# Line 3855  sub _get_next_token ($) { Line 4013  sub _get_next_token ($) {
4013        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4014                    
4015          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4016            $self->{s_kwd} = '';
4017          ## Reconsume.          ## Reconsume.
4018          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4019                      has_reference => 1,
4020                    line => $l, column => $c,                    line => $l, column => $c,
4021                   });                   });
4022          redo A;          redo A;
# Line 3865  sub _get_next_token ($) { Line 4025  sub _get_next_token ($) {
4025          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
4026          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
4027          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4028            $self->{s_kwd} = '';
4029          ## Reconsume.          ## Reconsume.
4030          redo A;          redo A;
4031        }        }
# Line 3890  sub _get_next_token ($) { Line 4051  sub _get_next_token ($) {
4051          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
4052                        
4053            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4054              $self->{s_kwd} = '';
4055            ## Reconsume.            ## Reconsume.
4056            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4057                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{s_kwd},
# Line 3901  sub _get_next_token ($) { Line 4063  sub _get_next_token ($) {
4063                        
4064            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{s_kwd};
4065            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4066              $self->{s_kwd} = '';
4067            ## Reconsume.            ## Reconsume.
4068            redo A;            redo A;
4069          }          }
# Line 4003  sub _get_next_token ($) { Line 4166  sub _get_next_token ($) {
4166        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4167                    
4168          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4169            $self->{s_kwd} = '';
4170          ## Reconsume.          ## Reconsume.
4171          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4172                      has_reference => 1,
4173                    line => $l, column => $c,                    line => $l, column => $c,
4174                   });                   });
4175          redo A;          redo A;
# Line 4013  sub _get_next_token ($) { Line 4178  sub _get_next_token ($) {
4178          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
4179          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
4180          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4181            $self->{s_kwd} = '';
4182          ## Reconsume.          ## Reconsume.
4183          redo A;          redo A;
4184        }        }
# Line 4125  sub _get_next_token ($) { Line 4291  sub _get_next_token ($) {
4291        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4292                    
4293          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4294            $self->{s_kwd} = '';
4295          ## Reconsume.          ## Reconsume.
4296          return  ({type => CHARACTER_TOKEN,          return  ({type => CHARACTER_TOKEN,
4297                    data => $data,                    data => $data,
4298                      has_reference => $has_ref,
4299                    line => $self->{line_prev},                    line => $self->{line_prev},
4300                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{s_kwd},
4301                   });                   });
# Line 4137  sub _get_next_token ($) { Line 4305  sub _get_next_token ($) {
4305          $self->{ca}->{value} .= $data;          $self->{ca}->{value} .= $data;
4306          $self->{ca}->{has_reference} = 1 if $has_ref;          $self->{ca}->{has_reference} = 1 if $has_ref;
4307          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4308            $self->{s_kwd} = '';
4309          ## Reconsume.          ## Reconsume.
4310          redo A;          redo A;
4311        }        }
4312    
4313        ## XML-only states
4314    
4315        } elsif ($self->{state} == PI_STATE) {
4316          if ($is_space->{$self->{nc}} or
4317              $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
4318              $self->{nc} == -1) {
4319            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4320                            line => $self->{line_prev},
4321                            column => $self->{column_prev}
4322                                - 1 * ($self->{nc} != -1));
4323            $self->{state} = BOGUS_COMMENT_STATE;
4324            ## Reconsume.
4325            $self->{ct} = {type => COMMENT_TOKEN,
4326                           data => '?',
4327                           line => $self->{line_prev},
4328                           column => $self->{column_prev}
4329                               - 1 * ($self->{nc} != -1),
4330                          };
4331            redo A;
4332          } else {
4333            $self->{ct} = {type => PI_TOKEN,
4334                           target => chr $self->{nc},
4335                           data => '',
4336                           line => $self->{line_prev},
4337                           column => $self->{column_prev} - 1,
4338                          };
4339            $self->{state} = PI_TARGET_STATE;
4340            
4341        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4342          $self->{line_prev} = $self->{line};
4343          $self->{column_prev} = $self->{column};
4344          $self->{column}++;
4345          $self->{nc}
4346              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4347        } else {
4348          $self->{set_nc}->($self);
4349        }
4350      
4351            redo A;
4352          }
4353        } elsif ($self->{state} == PI_TARGET_STATE) {
4354          if ($is_space->{$self->{nc}}) {
4355            $self->{state} = PI_TARGET_AFTER_STATE;
4356            
4357        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4358          $self->{line_prev} = $self->{line};
4359          $self->{column_prev} = $self->{column};
4360          $self->{column}++;
4361          $self->{nc}
4362              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4363        } else {
4364          $self->{set_nc}->($self);
4365        }
4366      
4367            redo A;
4368          } elsif ($self->{nc} == -1) {
4369            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4370            $self->{state} = DATA_STATE;
4371            $self->{s_kwd} = '';
4372            ## Reconsume.
4373            return  ($self->{ct}); # pi
4374            redo A;
4375          } elsif ($self->{nc} == 0x003F) { # ?
4376            $self->{state} = PI_AFTER_STATE;
4377            
4378        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4379          $self->{line_prev} = $self->{line};
4380          $self->{column_prev} = $self->{column};
4381          $self->{column}++;
4382          $self->{nc}
4383              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4384        } else {
4385          $self->{set_nc}->($self);
4386        }
4387      
4388            redo A;
4389          } else {
4390            ## XML5: typo ("tag name" -> "target")
4391            $self->{ct}->{target} .= chr $self->{nc}; # pi
4392            
4393        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4394          $self->{line_prev} = $self->{line};
4395          $self->{column_prev} = $self->{column};
4396          $self->{column}++;
4397          $self->{nc}
4398              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4399        } else {
4400          $self->{set_nc}->($self);
4401        }
4402      
4403            redo A;
4404          }
4405        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
4406          if ($is_space->{$self->{nc}}) {
4407            ## Stay in the state.
4408            
4409        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4410          $self->{line_prev} = $self->{line};
4411          $self->{column_prev} = $self->{column};
4412          $self->{column}++;
4413          $self->{nc}
4414              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4415        } else {
4416          $self->{set_nc}->($self);
4417        }
4418      
4419            redo A;
4420          } else {
4421            $self->{state} = PI_DATA_STATE;
4422            ## Reprocess.
4423            redo A;
4424          }
4425        } elsif ($self->{state} == PI_DATA_STATE) {
4426          if ($self->{nc} == 0x003F) { # ?
4427            $self->{state} = PI_DATA_AFTER_STATE;
4428            
4429        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4430          $self->{line_prev} = $self->{line};
4431          $self->{column_prev} = $self->{column};
4432          $self->{column}++;
4433          $self->{nc}
4434              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4435        } else {
4436          $self->{set_nc}->($self);
4437        }
4438      
4439            redo A;
4440          } elsif ($self->{nc} == -1) {
4441            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4442            $self->{state} = DATA_STATE;
4443            $self->{s_kwd} = '';
4444            ## Reprocess.
4445            return  ($self->{ct}); # pi
4446            redo A;
4447          } else {
4448            $self->{ct}->{data} .= chr $self->{nc}; # pi
4449            $self->{read_until}->($self->{ct}->{data}, q[?],
4450                                  length $self->{ct}->{data});
4451            ## Stay in the state.
4452            
4453        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4454          $self->{line_prev} = $self->{line};
4455          $self->{column_prev} = $self->{column};
4456          $self->{column}++;
4457          $self->{nc}
4458              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4459        } else {
4460          $self->{set_nc}->($self);
4461        }
4462      
4463            ## Not a reprocess: the next input character was fetched above.
4464            redo A;
4465          }
4466        } elsif ($self->{state} == PI_AFTER_STATE) {
4467          if ($self->{nc} == 0x003E) { # >
4468            $self->{state} = DATA_STATE;
4469            $self->{s_kwd} = '';
4470            
4471        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4472          $self->{line_prev} = $self->{line};
4473          $self->{column_prev} = $self->{column};
4474          $self->{column}++;
4475          $self->{nc}
4476              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4477        } else {
4478          $self->{set_nc}->($self);
4479        }
4480      
4481            return  ($self->{ct}); # pi
4482            redo A;
4483          } elsif ($self->{nc} == 0x003F) { # ?
4484            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4485                            line => $self->{line_prev},
4486                            column => $self->{column_prev}); ## XML5: no error
4487            $self->{ct}->{data} .= '?';
4488            $self->{state} = PI_DATA_AFTER_STATE;
4489            
4490        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4491          $self->{line_prev} = $self->{line};
4492          $self->{column_prev} = $self->{column};
4493          $self->{column}++;
4494          $self->{nc}
4495              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4496        } else {
4497          $self->{set_nc}->($self);
4498        }
4499      
4500            redo A;
4501          } else {
4502            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4503                            line => $self->{line_prev},
4504                            column => $self->{column_prev}
4505                                + 1 * ($self->{nc} == -1)); ## XML5: no error
4506            $self->{ct}->{data} .= '?'; ## XML5: not appended
4507            $self->{state} = PI_DATA_STATE;
4508            ## Reprocess.
4509            redo A;
4510          }
4511        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4512          ## XML5: Same as "pi after state" in XML5
4513          if ($self->{nc} == 0x003E) { # >
4514            $self->{state} = DATA_STATE;
4515            $self->{s_kwd} = '';
4516            
4517        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4518          $self->{line_prev} = $self->{line};
4519          $self->{column_prev} = $self->{column};
4520          $self->{column}++;
4521          $self->{nc}
4522              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4523        } else {
4524          $self->{set_nc}->($self);
4525        }
4526      
4527            return  ($self->{ct}); # pi
4528            redo A;
4529          } elsif ($self->{nc} == 0x003F) { # ?
4530            $self->{ct}->{data} .= '?';
4531            ## Stay in the state.
4532            
4533        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4534          $self->{line_prev} = $self->{line};
4535          $self->{column_prev} = $self->{column};
4536          $self->{column}++;
4537          $self->{nc}
4538              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4539        } else {
4540          $self->{set_nc}->($self);
4541        }
4542      
4543            redo A;
4544          } else {
4545            $self->{ct}->{data} .= '?'; ## XML5: not appended
4546            $self->{state} = PI_DATA_STATE;
4547            ## Reprocess.
4548            redo A;
4549          }
4550            
4551      } else {      } else {
4552        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
4553      }      }

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.9

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24