/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1 by wakaba, Tue Oct 14 02:27:58 2008 UTC revision 1.8 by wakaba, Wed Oct 15 04:38:22 2008 UTC
# Line 2  package Whatpm::HTML::Tokenizer; Line 2  package Whatpm::HTML::Tokenizer;
2  use strict;  use strict;
3  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5    BEGIN {
6      require Exporter;
7      push our @ISA, 'Exporter';
8    
9      our @EXPORT_OK = qw(
10        DOCTYPE_TOKEN
11        COMMENT_TOKEN
12        START_TAG_TOKEN
13        END_TAG_TOKEN
14        END_OF_FILE_TOKEN
15        CHARACTER_TOKEN
16        PI_TOKEN
17        ABORT_TOKEN
18      );
19      
20      our %EXPORT_TAGS = (
21        token => [qw(
22          DOCTYPE_TOKEN
23          COMMENT_TOKEN
24          START_TAG_TOKEN
25          END_TAG_TOKEN
26          END_OF_FILE_TOKEN
27          CHARACTER_TOKEN
28          PI_TOKEN
29          ABORT_TOKEN
30        )],
31      );
32    }
33    
34    ## Token types
35    
36    sub DOCTYPE_TOKEN () { 1 }
37    sub COMMENT_TOKEN () { 2 }
38    sub START_TAG_TOKEN () { 3 }
39    sub END_TAG_TOKEN () { 4 }
40    sub END_OF_FILE_TOKEN () { 5 }
41    sub CHARACTER_TOKEN () { 6 }
42    sub PI_TOKEN () { 7 } # XML5
43    sub ABORT_TOKEN () { 8 } # Not a token actually
44    
45  package Whatpm::HTML;  package Whatpm::HTML;
46    
47    BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49  ## Content model flags  ## Content model flags
50    
51  sub CM_ENTITY () { 0b001 } # & markup in data  sub CM_ENTITY () { 0b001 } # & markup in data
# Line 72  sub HEXREF_HEX_STATE () { 48 } Line 114  sub HEXREF_HEX_STATE () { 48 }
114  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
115  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117  ## Token types  ## XML states
118    sub PI_STATE () { 51 }
119  sub DOCTYPE_TOKEN () { 1 }  sub PI_TARGET_STATE () { 52 }
120  sub COMMENT_TOKEN () { 2 }  sub PI_TARGET_AFTER_STATE () { 53 }
121  sub START_TAG_TOKEN () { 3 }  sub PI_DATA_STATE () { 54 }
122  sub END_TAG_TOKEN () { 4 }  sub PI_AFTER_STATE () { 55 }
123  sub END_OF_FILE_TOKEN () { 5 }  sub PI_DATA_AFTER_STATE () { 56 }
 sub CHARACTER_TOKEN () { 6 }  
124    
125  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
126  ## list and descriptions)  ## list and descriptions)
# Line 142  sub _initialize_tokenizer ($) { Line 183  sub _initialize_tokenizer ($) {
183    #$self->{level}    #$self->{level}
184    #$self->{set_nc}    #$self->{set_nc}
185    #$self->{parse_error}    #$self->{parse_error}
186      #$self->{is_xml} (if XML)
187    
188    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
189    #$self->{s_kwd}; # state keyword - initialized when used    $self->{s_kwd} = ''; # state keyword
190    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
191    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
192    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 185  sub _initialize_tokenizer ($) { Line 227  sub _initialize_tokenizer ($) {
227  ##        ->{value}  ##        ->{value}
228  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
229  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
230    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
231  ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
232  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
233  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 328  sub _get_next_token ($) { Line 371  sub _get_next_token ($) {
371          }          }
372        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
373          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
374            $self->{s_kwd} .= '-';            if ($self->{s_kwd} eq '<!-') {
             
           if ($self->{s_kwd} eq '<!--') {  
375                            
376              $self->{escape} = 1; # unless $self->{escape};              $self->{escape} = 1; # unless $self->{escape};
377              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
378              #              #
379            } elsif ($self->{s_kwd} eq '---') {            } elsif ($self->{s_kwd} eq '-') {
380                            
381              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
382              #              #
383              } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
384                
385                $self->{s_kwd} .= '-';
386                #
387            } else {            } else {
388                            
389                $self->{s_kwd} = '-';
390              #              #
391            }            }
392          }          }
# Line 386  sub _get_next_token ($) { Line 432  sub _get_next_token ($) {
432            if ($self->{s_kwd} eq '--') {            if ($self->{s_kwd} eq '--') {
433                            
434              delete $self->{escape};              delete $self->{escape};
435                #
436            } else {            } else {
437                            
438                #
439            }            }
440            } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
441              
442              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
443                              line => $self->{line_prev},
444                              column => $self->{column_prev} - 1);
445              #
446          } else {          } else {
447                        
448              #
449          }          }
450                    
451          $self->{s_kwd} = '';          $self->{s_kwd} = '';
452          #          #
453          } elsif ($self->{nc} == 0x005D) { # ]
454            if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
455              
456              $self->{s_kwd} .= ']';
457            } elsif ($self->{s_kwd} eq ']]') {
458              
459              #
460            } else {
461              
462              $self->{s_kwd} = '';
463            }
464            #
465        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
466                    
467          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 412  sub _get_next_token ($) { Line 479  sub _get_next_token ($) {
479                     data => chr $self->{nc},                     data => chr $self->{nc},
480                     line => $self->{line}, column => $self->{column},                     line => $self->{line}, column => $self->{column},
481                    };                    };
482        if ($self->{read_until}->($token->{data}, q[-!<>&],        if ($self->{read_until}->($token->{data}, q{-!<>&\]},
483                                  length $token->{data})) {                                  length $token->{data})) {
484          $self->{s_kwd} = '';          $self->{s_kwd} = '';
485        }        }
486    
487        ## Stay in the data state.        ## Stay in the data state.
488        if ($self->{content_model} == PCDATA_CONTENT_MODEL) {        if (not $self->{is_xml} and
489              $self->{content_model} == PCDATA_CONTENT_MODEL) {
490                    
491          $self->{state} = PCDATA_STATE;          $self->{state} = PCDATA_STATE;
492        } else {        } else {
# Line 466  sub _get_next_token ($) { Line 534  sub _get_next_token ($) {
534    
535          ## reconsume          ## reconsume
536          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
537            $self->{s_kwd} = '';
538          return  ({type => CHARACTER_TOKEN, data => '<',          return  ({type => CHARACTER_TOKEN, data => '<',
539                    line => $self->{line_prev},                    line => $self->{line_prev},
540                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 507  sub _get_next_token ($) { Line 576  sub _get_next_token ($) {
576                        
577            $self->{ct}            $self->{ct}
578              = {type => START_TAG_TOKEN,              = {type => START_TAG_TOKEN,
579                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
580                 line => $self->{line_prev},                 line => $self->{line_prev},
581                 column => $self->{column_prev}};                 column => $self->{column_prev}};
582            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
# Line 549  sub _get_next_token ($) { Line 618  sub _get_next_token ($) {
618                            line => $self->{line_prev},                            line => $self->{line_prev},
619                            column => $self->{column_prev});                            column => $self->{column_prev});
620            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
621              $self->{s_kwd} = '';
622                        
623      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
624        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 568  sub _get_next_token ($) { Line 638  sub _get_next_token ($) {
638    
639            redo A;            redo A;
640          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
641                        if ($self->{is_xml}) {
642            $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',              
643                            line => $self->{line_prev},              $self->{state} = PI_STATE;
644                            column => $self->{column_prev});              
645            $self->{state} = BOGUS_COMMENT_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
646            $self->{ct} = {type => COMMENT_TOKEN, data => '',        $self->{line_prev} = $self->{line};
647                                      line => $self->{line_prev},        $self->{column_prev} = $self->{column};
648                                      column => $self->{column_prev},        $self->{column}++;
649                                     };        $self->{nc}
650            ## $self->{nc} is intentionally left as is            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
651            redo A;      } else {
652          $self->{set_nc}->($self);
653        }
654      
655                redo A;
656              } else {
657                
658                $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
659                                line => $self->{line_prev},
660                                column => $self->{column_prev});
661                $self->{state} = BOGUS_COMMENT_STATE;
662                $self->{ct} = {type => COMMENT_TOKEN, data => '',
663                               line => $self->{line_prev},
664                               column => $self->{column_prev},
665                              };
666                ## $self->{nc} is intentionally left as is
667                redo A;
668              }
669          } else {          } else {
670                        
671            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
672                            line => $self->{line_prev},                            line => $self->{line_prev},
673                            column => $self->{column_prev});                            column => $self->{column_prev});
674            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
675              $self->{s_kwd} = '';
676            ## reconsume            ## reconsume
677    
678            return  ({type => CHARACTER_TOKEN, data => '<',            return  ({type => CHARACTER_TOKEN, data => '<',
# Line 613  sub _get_next_token ($) { Line 701  sub _get_next_token ($) {
701            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
702                        
703            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
704              $self->{s_kwd} = '';
705            ## Reconsume.            ## Reconsume.
706            return  ({type => CHARACTER_TOKEN, data => '</',            return  ({type => CHARACTER_TOKEN, data => '</',
707                      line => $l, column => $c,                      line => $l, column => $c,
# Line 626  sub _get_next_token ($) { Line 715  sub _get_next_token ($) {
715                    
716          $self->{ct}          $self->{ct}
717              = {type => END_TAG_TOKEN,              = {type => END_TAG_TOKEN,
718                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
719                 line => $l, column => $c};                 line => $l, column => $c};
720          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
721                    
# Line 666  sub _get_next_token ($) { Line 755  sub _get_next_token ($) {
755                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
756                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
757          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
758            $self->{s_kwd} = '';
759                    
760      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
761        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 681  sub _get_next_token ($) { Line 771  sub _get_next_token ($) {
771        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
772                    
773          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
774            $self->{s_kwd} = '';
775          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
776          # reconsume          # reconsume
777    
# Line 730  sub _get_next_token ($) { Line 821  sub _get_next_token ($) {
821          } else {          } else {
822                        
823            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
824              $self->{s_kwd} = '';
825            ## Reconsume.            ## Reconsume.
826            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
827                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{s_kwd},
# Line 748  sub _get_next_token ($) { Line 840  sub _get_next_token ($) {
840                        
841            ## Reconsume.            ## Reconsume.
842            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
843              $self->{s_kwd} = '';
844            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
845                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{s_kwd},
846                      line => $self->{line_prev},                      line => $self->{line_prev},
# Line 799  sub _get_next_token ($) { Line 892  sub _get_next_token ($) {
892            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
893          }          }
894          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
895            $self->{s_kwd} = '';
896                    
897      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
898        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 817  sub _get_next_token ($) { Line 911  sub _get_next_token ($) {
911        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
912                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
913                    
914          $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);          $self->{ct}->{tag_name}
915                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
916            # start tag or end tag            # start tag or end tag
917          ## Stay in this state          ## Stay in this state
918                    
# Line 850  sub _get_next_token ($) { Line 945  sub _get_next_token ($) {
945            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
946          }          }
947          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
948            $self->{s_kwd} = '';
949          # reconsume          # reconsume
950    
951          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 920  sub _get_next_token ($) { Line 1016  sub _get_next_token ($) {
1016            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1017          }          }
1018          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1019            $self->{s_kwd} = '';
1020                    
1021      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1022        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 939  sub _get_next_token ($) { Line 1036  sub _get_next_token ($) {
1036                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1037                    
1038          $self->{ca}          $self->{ca}
1039              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1040                 value => '',                 value => '',
1041                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1042          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 987  sub _get_next_token ($) { Line 1084  sub _get_next_token ($) {
1084            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1085          }          }
1086          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1087            $self->{s_kwd} = '';
1088          # reconsume          # reconsume
1089    
1090          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1082  sub _get_next_token ($) { Line 1180  sub _get_next_token ($) {
1180            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1181          }          }
1182          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1183            $self->{s_kwd} = '';
1184                    
1185      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1186        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1100  sub _get_next_token ($) { Line 1199  sub _get_next_token ($) {
1199        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
1200                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1201                    
1202          $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);          $self->{ca}->{name}
1203                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1204          ## Stay in the state          ## Stay in the state
1205                    
1206      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1149  sub _get_next_token ($) { Line 1249  sub _get_next_token ($) {
1249            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1250          }          }
1251          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1252            $self->{s_kwd} = '';
1253          # reconsume          # reconsume
1254    
1255          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1225  sub _get_next_token ($) { Line 1326  sub _get_next_token ($) {
1326            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1327          }          }
1328          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1329            $self->{s_kwd} = '';
1330                    
1331      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1332        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1244  sub _get_next_token ($) { Line 1346  sub _get_next_token ($) {
1346                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1347                    
1348          $self->{ca}          $self->{ca}
1349              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1350                 value => '',                 value => '',
1351                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1352          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 1292  sub _get_next_token ($) { Line 1394  sub _get_next_token ($) {
1394          } else {          } else {
1395            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1396          }          }
1397            $self->{s_kwd} = '';
1398          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1399          # reconsume          # reconsume
1400    
# Line 1393  sub _get_next_token ($) { Line 1496  sub _get_next_token ($) {
1496            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1497          }          }
1498          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1499            $self->{s_kwd} = '';
1500                    
1501      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1502        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1426  sub _get_next_token ($) { Line 1530  sub _get_next_token ($) {
1530            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1531          }          }
1532          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1533            $self->{s_kwd} = '';
1534          ## reconsume          ## reconsume
1535    
1536          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1508  sub _get_next_token ($) { Line 1613  sub _get_next_token ($) {
1613            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1614          }          }
1615          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1616            $self->{s_kwd} = '';
1617          ## reconsume          ## reconsume
1618    
1619          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1589  sub _get_next_token ($) { Line 1695  sub _get_next_token ($) {
1695            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1696          }          }
1697          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1698            $self->{s_kwd} = '';
1699          ## reconsume          ## reconsume
1700    
1701          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1669  sub _get_next_token ($) { Line 1776  sub _get_next_token ($) {
1776            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1777          }          }
1778          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1779            $self->{s_kwd} = '';
1780                    
1781      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1782        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1702  sub _get_next_token ($) { Line 1810  sub _get_next_token ($) {
1810            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1811          }          }
1812          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1813            $self->{s_kwd} = '';
1814          ## reconsume          ## reconsume
1815    
1816          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1770  sub _get_next_token ($) { Line 1879  sub _get_next_token ($) {
1879            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1880          }          }
1881          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1882            $self->{s_kwd} = '';
1883                    
1884      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1885        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1817  sub _get_next_token ($) { Line 1927  sub _get_next_token ($) {
1927            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1928          }          }
1929          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1930            $self->{s_kwd} = '';
1931          ## Reconsume.          ## Reconsume.
1932          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
1933          redo A;          redo A;
# Line 1847  sub _get_next_token ($) { Line 1958  sub _get_next_token ($) {
1958          }          }
1959    
1960          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1961            $self->{s_kwd} = '';
1962                    
1963      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1964        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1879  sub _get_next_token ($) { Line 1991  sub _get_next_token ($) {
1991            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1992          }          }
1993          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1994            $self->{s_kwd} = '';
1995          ## Reconsume.          ## Reconsume.
1996          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
1997          redo A;          redo A;
# Line 1899  sub _get_next_token ($) { Line 2012  sub _get_next_token ($) {
2012        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2013                    
2014          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2015            $self->{s_kwd} = '';
2016                    
2017      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2018        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1916  sub _get_next_token ($) { Line 2030  sub _get_next_token ($) {
2030        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2031                    
2032          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2033            $self->{s_kwd} = '';
2034          ## reconsume          ## reconsume
2035    
2036          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 1977  sub _get_next_token ($) { Line 2092  sub _get_next_token ($) {
2092      }      }
2093        
2094          redo A;          redo A;
2095        } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2096                 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and                   $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2097                    $self->{is_xml}) and
2098                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2099                                                    
2100          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
# Line 2137  sub _get_next_token ($) { Line 2253  sub _get_next_token ($) {
2253          redo A;          redo A;
2254        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{s_kwd} eq '[CDATA' and
2255                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2256                    if ($self->{is_xml} and
2257                not $self->{tainted} and
2258                @{$self->{open_elements} or []} == 0) {
2259              
2260              $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2261                              line => $self->{line_prev},
2262                              column => $self->{column_prev} - 7);
2263              $self->{tainted} = 1;
2264            } else {
2265              
2266            }
2267    
2268          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
2269                                    data => '',                                    data => '',
2270                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 2189  sub _get_next_token ($) { Line 2316  sub _get_next_token ($) {
2316                    
2317          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2318          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2319            $self->{s_kwd} = '';
2320                    
2321      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2322        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2208  sub _get_next_token ($) { Line 2336  sub _get_next_token ($) {
2336                    
2337          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2338          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2339            $self->{s_kwd} = '';
2340          ## reconsume          ## reconsume
2341    
2342          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2251  sub _get_next_token ($) { Line 2380  sub _get_next_token ($) {
2380                    
2381          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2382          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2383            $self->{s_kwd} = '';
2384                    
2385      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2386        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2270  sub _get_next_token ($) { Line 2400  sub _get_next_token ($) {
2400                    
2401          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2402          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2403            $self->{s_kwd} = '';
2404          ## reconsume          ## reconsume
2405    
2406          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2313  sub _get_next_token ($) { Line 2444  sub _get_next_token ($) {
2444                    
2445          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2446          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2447            $self->{s_kwd} = '';
2448          ## reconsume          ## reconsume
2449    
2450          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2358  sub _get_next_token ($) { Line 2490  sub _get_next_token ($) {
2490        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2491                    
2492          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2493            $self->{s_kwd} = '';
2494          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2495            $self->{s_kwd} = '';
2496          ## reconsume          ## reconsume
2497    
2498          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2385  sub _get_next_token ($) { Line 2519  sub _get_next_token ($) {
2519        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2520                    
2521          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2522            $self->{s_kwd} = '';
2523                    
2524      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2525        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2423  sub _get_next_token ($) { Line 2558  sub _get_next_token ($) {
2558                    
2559          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2560          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2561            $self->{s_kwd} = '';
2562          ## reconsume          ## reconsume
2563    
2564          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2491  sub _get_next_token ($) { Line 2627  sub _get_next_token ($) {
2627                    
2628          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2629          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2630            $self->{s_kwd} = '';
2631                    
2632      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2633        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2510  sub _get_next_token ($) { Line 2647  sub _get_next_token ($) {
2647                    
2648          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2649          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2650            $self->{s_kwd} = '';
2651          ## reconsume          ## reconsume
2652    
2653          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
# Line 2553  sub _get_next_token ($) { Line 2691  sub _get_next_token ($) {
2691        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2692                    
2693          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2694            $self->{s_kwd} = '';
2695                    
2696      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2697        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2572  sub _get_next_token ($) { Line 2711  sub _get_next_token ($) {
2711                    
2712          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2713          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2714            $self->{s_kwd} = '';
2715          ## reconsume          ## reconsume
2716    
2717          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2615  sub _get_next_token ($) { Line 2755  sub _get_next_token ($) {
2755        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2756                    
2757          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2758            $self->{s_kwd} = '';
2759                    
2760      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2761        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2634  sub _get_next_token ($) { Line 2775  sub _get_next_token ($) {
2775                    
2776          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2777          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2778            $self->{s_kwd} = '';
2779          ## reconsume          ## reconsume
2780    
2781          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2862  sub _get_next_token ($) { Line 3004  sub _get_next_token ($) {
3004          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3005    
3006          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3007            $self->{s_kwd} = '';
3008                    
3009      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3010        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2883  sub _get_next_token ($) { Line 3026  sub _get_next_token ($) {
3026          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3027    
3028          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3029            $self->{s_kwd} = '';
3030          ## reconsume          ## reconsume
3031    
3032          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2929  sub _get_next_token ($) { Line 3073  sub _get_next_token ($) {
3073          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3074    
3075          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3076            $self->{s_kwd} = '';
3077                    
3078      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3079        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2950  sub _get_next_token ($) { Line 3095  sub _get_next_token ($) {
3095          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3096    
3097          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3098            $self->{s_kwd} = '';
3099          ## reconsume          ## reconsume
3100    
3101          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2998  sub _get_next_token ($) { Line 3144  sub _get_next_token ($) {
3144          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3145    
3146          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3147            $self->{s_kwd} = '';
3148                    
3149      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3150        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3019  sub _get_next_token ($) { Line 3166  sub _get_next_token ($) {
3166          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3167    
3168          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3169            $self->{s_kwd} = '';
3170          ## reconsume          ## reconsume
3171    
3172          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3097  sub _get_next_token ($) { Line 3245  sub _get_next_token ($) {
3245        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3246                    
3247          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3248            $self->{s_kwd} = '';
3249                    
3250      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3251        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3117  sub _get_next_token ($) { Line 3266  sub _get_next_token ($) {
3266          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3267    
3268          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3269            $self->{s_kwd} = '';
3270          ## reconsume          ## reconsume
3271    
3272          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3194  sub _get_next_token ($) { Line 3344  sub _get_next_token ($) {
3344                    
3345          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3346          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3347            $self->{s_kwd} = '';
3348                    
3349      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3350        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3215  sub _get_next_token ($) { Line 3366  sub _get_next_token ($) {
3366          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3367    
3368          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3369            $self->{s_kwd} = '';
3370          ## reconsume          ## reconsume
3371    
3372          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3261  sub _get_next_token ($) { Line 3413  sub _get_next_token ($) {
3413          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3414    
3415          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3416            $self->{s_kwd} = '';
3417                    
3418      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3419        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3282  sub _get_next_token ($) { Line 3435  sub _get_next_token ($) {
3435          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3436    
3437          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3438            $self->{s_kwd} = '';
3439          ## reconsume          ## reconsume
3440    
3441          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3330  sub _get_next_token ($) { Line 3484  sub _get_next_token ($) {
3484          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3485    
3486          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3487            $self->{s_kwd} = '';
3488                    
3489      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3490        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3351  sub _get_next_token ($) { Line 3506  sub _get_next_token ($) {
3506          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3507    
3508          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3509            $self->{s_kwd} = '';
3510          ## reconsume          ## reconsume
3511    
3512          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3397  sub _get_next_token ($) { Line 3553  sub _get_next_token ($) {
3553        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3554                    
3555          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3556            $self->{s_kwd} = '';
3557                    
3558      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3559        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3416  sub _get_next_token ($) { Line 3573  sub _get_next_token ($) {
3573                    
3574          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3575          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3576            $self->{s_kwd} = '';
3577          ## reconsume          ## reconsume
3578    
3579          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3445  sub _get_next_token ($) { Line 3603  sub _get_next_token ($) {
3603        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3604                    
3605          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3606            $self->{s_kwd} = '';
3607                    
3608      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3609        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3463  sub _get_next_token ($) { Line 3622  sub _get_next_token ($) {
3622        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3623                    
3624          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3625            $self->{s_kwd} = '';
3626          ## reconsume          ## reconsume
3627    
3628          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
# Line 3508  sub _get_next_token ($) { Line 3668  sub _get_next_token ($) {
3668        
3669          redo A;          redo A;
3670        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3671            if ($self->{is_xml}) {
3672              
3673              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
3674            } else {
3675              
3676            }
3677    
3678          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3679            $self->{s_kwd} = '';
3680                    
3681      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3682        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3577  sub _get_next_token ($) { Line 3745  sub _get_next_token ($) {
3745      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3746        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3747          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3748            $self->{s_kwd} = '';
3749                    
3750      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3751        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3684  sub _get_next_token ($) { Line 3853  sub _get_next_token ($) {
3853        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
3854                    
3855          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
3856            $self->{s_kwd} = '';
3857          ## Reconsume.          ## Reconsume.
3858          return  ({type => CHARACTER_TOKEN, data => '&',          return  ({type => CHARACTER_TOKEN, data => '&',
3859                    line => $self->{line_prev},                    line => $self->{line_prev},
# Line 3694  sub _get_next_token ($) { Line 3864  sub _get_next_token ($) {
3864                    
3865          $self->{ca}->{value} .= '&';          $self->{ca}->{value} .= '&';
3866          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
3867            $self->{s_kwd} = '';
3868          ## Reconsume.          ## Reconsume.
3869          redo A;          redo A;
3870        }        }
# Line 3744  sub _get_next_token ($) { Line 3915  sub _get_next_token ($) {
3915          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
3916                        
3917            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
3918              $self->{s_kwd} = '';
3919            ## Reconsume.            ## Reconsume.
3920            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
3921                      data => '&#',                      data => '&#',
# Line 3755  sub _get_next_token ($) { Line 3927  sub _get_next_token ($) {
3927                        
3928            $self->{ca}->{value} .= '&#';            $self->{ca}->{value} .= '&#';
3929            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
3930              $self->{s_kwd} = '';
3931            ## Reconsume.            ## Reconsume.
3932            redo A;            redo A;
3933          }          }
# Line 3820  sub _get_next_token ($) { Line 3993  sub _get_next_token ($) {
3993        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
3994                    
3995          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
3996            $self->{s_kwd} = '';
3997          ## Reconsume.          ## Reconsume.
3998          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
3999                      has_reference => 1,
4000                    line => $l, column => $c,                    line => $l, column => $c,
4001                   });                   });
4002          redo A;          redo A;
# Line 3830  sub _get_next_token ($) { Line 4005  sub _get_next_token ($) {
4005          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
4006          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
4007          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4008            $self->{s_kwd} = '';
4009          ## Reconsume.          ## Reconsume.
4010          redo A;          redo A;
4011        }        }
# Line 3855  sub _get_next_token ($) { Line 4031  sub _get_next_token ($) {
4031          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
4032                        
4033            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4034              $self->{s_kwd} = '';
4035            ## Reconsume.            ## Reconsume.
4036            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4037                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{s_kwd},
# Line 3866  sub _get_next_token ($) { Line 4043  sub _get_next_token ($) {
4043                        
4044            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{s_kwd};
4045            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4046              $self->{s_kwd} = '';
4047            ## Reconsume.            ## Reconsume.
4048            redo A;            redo A;
4049          }          }
# Line 3968  sub _get_next_token ($) { Line 4146  sub _get_next_token ($) {
4146        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4147                    
4148          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4149            $self->{s_kwd} = '';
4150          ## Reconsume.          ## Reconsume.
4151          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4152                      has_reference => 1,
4153                    line => $l, column => $c,                    line => $l, column => $c,
4154                   });                   });
4155          redo A;          redo A;
# Line 3978  sub _get_next_token ($) { Line 4158  sub _get_next_token ($) {
4158          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
4159          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
4160          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4161            $self->{s_kwd} = '';
4162          ## Reconsume.          ## Reconsume.
4163          redo A;          redo A;
4164        }        }
# Line 4090  sub _get_next_token ($) { Line 4271  sub _get_next_token ($) {
4271        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4272                    
4273          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4274            $self->{s_kwd} = '';
4275          ## Reconsume.          ## Reconsume.
4276          return  ({type => CHARACTER_TOKEN,          return  ({type => CHARACTER_TOKEN,
4277                    data => $data,                    data => $data,
4278                      has_reference => $has_ref,
4279                    line => $self->{line_prev},                    line => $self->{line_prev},
4280                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{s_kwd},
4281                   });                   });
# Line 4102  sub _get_next_token ($) { Line 4285  sub _get_next_token ($) {
4285          $self->{ca}->{value} .= $data;          $self->{ca}->{value} .= $data;
4286          $self->{ca}->{has_reference} = 1 if $has_ref;          $self->{ca}->{has_reference} = 1 if $has_ref;
4287          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4288            $self->{s_kwd} = '';
4289          ## Reconsume.          ## Reconsume.
4290          redo A;          redo A;
4291        }        }
4292    
4293        ## XML-only states
4294    
4295        } elsif ($self->{state} == PI_STATE) {
4296          if ($is_space->{$self->{nc}} or
4297              $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
4298              $self->{nc} == -1) {
4299            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4300                            line => $self->{line_prev},
4301                            column => $self->{column_prev}
4302                                - 1 * ($self->{nc} != -1));
4303            $self->{state} = BOGUS_COMMENT_STATE;
4304            ## Reconsume.
4305            $self->{ct} = {type => COMMENT_TOKEN,
4306                           data => '?',
4307                           line => $self->{line_prev},
4308                           column => $self->{column_prev}
4309                               - 1 * ($self->{nc} != -1),
4310                          };
4311            redo A;
4312          } else {
4313            $self->{ct} = {type => PI_TOKEN,
4314                           target => chr $self->{nc},
4315                           data => '',
4316                           line => $self->{line_prev},
4317                           column => $self->{column_prev} - 1,
4318                          };
4319            $self->{state} = PI_TARGET_STATE;
4320            
4321        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4322          $self->{line_prev} = $self->{line};
4323          $self->{column_prev} = $self->{column};
4324          $self->{column}++;
4325          $self->{nc}
4326              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4327        } else {
4328          $self->{set_nc}->($self);
4329        }
4330      
4331            redo A;
4332          }
4333        } elsif ($self->{state} == PI_TARGET_STATE) {
4334          if ($is_space->{$self->{nc}}) {
4335            $self->{state} = PI_TARGET_AFTER_STATE;
4336            
4337        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4338          $self->{line_prev} = $self->{line};
4339          $self->{column_prev} = $self->{column};
4340          $self->{column}++;
4341          $self->{nc}
4342              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4343        } else {
4344          $self->{set_nc}->($self);
4345        }
4346      
4347            redo A;
4348          } elsif ($self->{nc} == -1) {
4349            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4350            $self->{state} = DATA_STATE;
4351            $self->{s_kwd} = '';
4352            ## Reconsume.
4353            return  ($self->{ct}); # pi
4354            redo A;
4355          } elsif ($self->{nc} == 0x003F) { # ?
4356            $self->{state} = PI_AFTER_STATE;
4357            
4358        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4359          $self->{line_prev} = $self->{line};
4360          $self->{column_prev} = $self->{column};
4361          $self->{column}++;
4362          $self->{nc}
4363              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4364        } else {
4365          $self->{set_nc}->($self);
4366        }
4367      
4368            redo A;
4369          } else {
4370            ## XML5: typo ("tag name" -> "target")
4371            $self->{ct}->{target} .= chr $self->{nc}; # pi
4372            
4373        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4374          $self->{line_prev} = $self->{line};
4375          $self->{column_prev} = $self->{column};
4376          $self->{column}++;
4377          $self->{nc}
4378              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4379        } else {
4380          $self->{set_nc}->($self);
4381        }
4382      
4383            redo A;
4384          }
4385        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
4386          if ($is_space->{$self->{nc}}) {
4387            ## Stay in the state.
4388            
4389        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4390          $self->{line_prev} = $self->{line};
4391          $self->{column_prev} = $self->{column};
4392          $self->{column}++;
4393          $self->{nc}
4394              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4395        } else {
4396          $self->{set_nc}->($self);
4397        }
4398      
4399            redo A;
4400          } else {
4401            $self->{state} = PI_DATA_STATE;
4402            ## Reprocess.
4403            redo A;
4404          }
4405        } elsif ($self->{state} == PI_DATA_STATE) {
4406          if ($self->{nc} == 0x003F) { # ?
4407            $self->{state} = PI_DATA_AFTER_STATE;
4408            
4409        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4410          $self->{line_prev} = $self->{line};
4411          $self->{column_prev} = $self->{column};
4412          $self->{column}++;
4413          $self->{nc}
4414              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4415        } else {
4416          $self->{set_nc}->($self);
4417        }
4418      
4419            redo A;
4420          } elsif ($self->{nc} == -1) {
4421            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4422            $self->{state} = DATA_STATE;
4423            $self->{s_kwd} = '';
4424            ## Reprocess.
4425            return  ($self->{ct}); # pi
4426            redo A;
4427          } else {
4428            $self->{ct}->{data} .= chr $self->{nc}; # pi
4429            $self->{read_until}->($self->{ct}->{data}, q[?],
4430                                  length $self->{ct}->{data});
4431            ## Stay in the state.
4432            
4433        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4434          $self->{line_prev} = $self->{line};
4435          $self->{column_prev} = $self->{column};
4436          $self->{column}++;
4437          $self->{nc}
4438              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4439        } else {
4440          $self->{set_nc}->($self);
4441        }
4442      
4443            ## Reprocess.
4444            redo A;
4445          }
4446        } elsif ($self->{state} == PI_AFTER_STATE) {
4447          if ($self->{nc} == 0x003E) { # >
4448            $self->{state} = DATA_STATE;
4449            $self->{s_kwd} = '';
4450            
4451        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4452          $self->{line_prev} = $self->{line};
4453          $self->{column_prev} = $self->{column};
4454          $self->{column}++;
4455          $self->{nc}
4456              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4457        } else {
4458          $self->{set_nc}->($self);
4459        }
4460      
4461            return  ($self->{ct}); # pi
4462            redo A;
4463          } elsif ($self->{nc} == 0x003F) { # ?
4464            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4465                            line => $self->{line_prev},
4466                            column => $self->{column_prev}); ## XML5: no error
4467            $self->{ct}->{data} .= '?';
4468            $self->{state} = PI_DATA_AFTER_STATE;
4469            
4470        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4471          $self->{line_prev} = $self->{line};
4472          $self->{column_prev} = $self->{column};
4473          $self->{column}++;
4474          $self->{nc}
4475              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4476        } else {
4477          $self->{set_nc}->($self);
4478        }
4479      
4480            redo A;
4481          } else {
4482            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4483                            line => $self->{line_prev},
4484                            column => $self->{column_prev}
4485                                + 1 * ($self->{nc} == -1)); ## XML5: no error
4486            $self->{ct}->{data} .= '?'; ## XML5: not appended
4487            $self->{state} = PI_DATA_STATE;
4488            ## Reprocess.
4489            redo A;
4490          }
4491        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4492          ## XML5: Same as "pi after state" in XML5
4493          if ($self->{nc} == 0x003E) { # >
4494            $self->{state} = DATA_STATE;
4495            $self->{s_kwd} = '';
4496            
4497        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4498          $self->{line_prev} = $self->{line};
4499          $self->{column_prev} = $self->{column};
4500          $self->{column}++;
4501          $self->{nc}
4502              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4503        } else {
4504          $self->{set_nc}->($self);
4505        }
4506      
4507            return  ($self->{ct}); # pi
4508            redo A;
4509          } elsif ($self->{nc} == 0x003F) { # ?
4510            $self->{ct}->{data} .= '?';
4511            ## Stay in the state.
4512            
4513        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4514          $self->{line_prev} = $self->{line};
4515          $self->{column_prev} = $self->{column};
4516          $self->{column}++;
4517          $self->{nc}
4518              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4519        } else {
4520          $self->{set_nc}->($self);
4521        }
4522      
4523            redo A;
4524          } else {
4525            $self->{ct}->{data} .= '?'; ## XML5: not appended
4526            $self->{state} = PI_DATA_STATE;
4527            ## Reprocess.
4528            redo A;
4529          }
4530            
4531      } else {      } else {
4532        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
4533      }      }

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.8

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24