/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.2 by wakaba, Tue Oct 14 04:32:49 2008 UTC revision 1.9 by wakaba, Wed Oct 15 08:05:47 2008 UTC
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 114  sub HEXREF_HEX_STATE () { 48 }
114  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
115  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117    ## XML states
118    sub PI_STATE () { 51 }
119    sub PI_TARGET_STATE () { 52 }
120    sub PI_TARGET_AFTER_STATE () { 53 }
121    sub PI_DATA_STATE () { 54 }
122    sub PI_AFTER_STATE () { 55 }
123    sub PI_DATA_AFTER_STATE () { 56 }
124    
125  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
126  ## list and descriptions)  ## list and descriptions)
127    
# Line 175  sub _initialize_tokenizer ($) { Line 183  sub _initialize_tokenizer ($) {
183    #$self->{level}    #$self->{level}
184    #$self->{set_nc}    #$self->{set_nc}
185    #$self->{parse_error}    #$self->{parse_error}
186      #$self->{is_xml} (if XML)
187    
188    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
189    #$self->{s_kwd}; # state keyword - initialized when used    $self->{s_kwd} = ''; # state keyword
190    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
191    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
192    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 208  sub _initialize_tokenizer ($) { Line 217  sub _initialize_tokenizer ($) {
217  ##        ->{value}  ##        ->{value}
218  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
219  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
220    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
221  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
222  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
223  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 311  sub _get_next_token ($) { Line 321  sub _get_next_token ($) {
321          }          }
322        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
323          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
324            $self->{s_kwd} .= '-';            if ($self->{s_kwd} eq '<!-') {
             
           if ($self->{s_kwd} eq '<!--') {  
325              !!!cp (3);              !!!cp (3);
326              $self->{escape} = 1; # unless $self->{escape};              $self->{escape} = 1; # unless $self->{escape};
327              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
328              #              #
329            } elsif ($self->{s_kwd} eq '---') {            } elsif ($self->{s_kwd} eq '-') {
330              !!!cp (4);              !!!cp (4);
331              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
332              #              #
333              } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
334                !!!cp (4.1);
335                $self->{s_kwd} .= '-';
336                #
337            } else {            } else {
338              !!!cp (5);              !!!cp (5);
339                $self->{s_kwd} = '-';
340              #              #
341            }            }
342          }          }
# Line 359  sub _get_next_token ($) { Line 372  sub _get_next_token ($) {
372            if ($self->{s_kwd} eq '--') {            if ($self->{s_kwd} eq '--') {
373              !!!cp (8);              !!!cp (8);
374              delete $self->{escape};              delete $self->{escape};
375                #
376            } else {            } else {
377              !!!cp (9);              !!!cp (9);
378                #
379            }            }
380            } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
381              !!!cp (9.1);
382              !!!parse-error (type => 'unmatched mse', ## TODO: type
383                              line => $self->{line_prev},
384                              column => $self->{column_prev} - 1);
385              #
386          } else {          } else {
387            !!!cp (10);            !!!cp (10);
388              #
389          }          }
390                    
391          $self->{s_kwd} = '';          $self->{s_kwd} = '';
392          #          #
393          } elsif ($self->{nc} == 0x005D) { # ]
394            if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
395              !!!cp (10.1);
396              $self->{s_kwd} .= ']';
397            } elsif ($self->{s_kwd} eq ']]') {
398              !!!cp (10.2);
399              #
400            } else {
401              !!!cp (10.3);
402              $self->{s_kwd} = '';
403            }
404            #
405        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
406          !!!cp (11);          !!!cp (11);
407          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 385  sub _get_next_token ($) { Line 419  sub _get_next_token ($) {
419                     data => chr $self->{nc},                     data => chr $self->{nc},
420                     line => $self->{line}, column => $self->{column},                     line => $self->{line}, column => $self->{column},
421                    };                    };
422        if ($self->{read_until}->($token->{data}, q[-!<>&],        if ($self->{read_until}->($token->{data}, q{-!<>&\]},
423                                  length $token->{data})) {                                  length $token->{data})) {
424          $self->{s_kwd} = '';          $self->{s_kwd} = '';
425        }        }
426    
427        ## Stay in the data state.        ## Stay in the data state.
428        if ($self->{content_model} == PCDATA_CONTENT_MODEL) {        if (not $self->{is_xml} and
429              $self->{content_model} == PCDATA_CONTENT_MODEL) {
430          !!!cp (13);          !!!cp (13);
431          $self->{state} = PCDATA_STATE;          $self->{state} = PCDATA_STATE;
432        } else {        } else {
# Line 419  sub _get_next_token ($) { Line 454  sub _get_next_token ($) {
454    
455          ## reconsume          ## reconsume
456          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
457            $self->{s_kwd} = '';
458          !!!emit ({type => CHARACTER_TOKEN, data => '<',          !!!emit ({type => CHARACTER_TOKEN, data => '<',
459                    line => $self->{line_prev},                    line => $self->{line_prev},
460                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 440  sub _get_next_token ($) { Line 476  sub _get_next_token ($) {
476            !!!cp (19);            !!!cp (19);
477            $self->{ct}            $self->{ct}
478              = {type => START_TAG_TOKEN,              = {type => START_TAG_TOKEN,
479                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
480                 line => $self->{line_prev},                 line => $self->{line_prev},
481                 column => $self->{column_prev}};                 column => $self->{column_prev}};
482            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
# Line 462  sub _get_next_token ($) { Line 498  sub _get_next_token ($) {
498                            line => $self->{line_prev},                            line => $self->{line_prev},
499                            column => $self->{column_prev});                            column => $self->{column_prev});
500            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
501              $self->{s_kwd} = '';
502            !!!next-input-character;            !!!next-input-character;
503    
504            !!!emit ({type => CHARACTER_TOKEN, data => '<>',            !!!emit ({type => CHARACTER_TOKEN, data => '<>',
# Line 471  sub _get_next_token ($) { Line 508  sub _get_next_token ($) {
508    
509            redo A;            redo A;
510          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
511            !!!cp (22);            if ($self->{is_xml}) {
512            !!!parse-error (type => 'pio',              !!!cp (22.1);
513                            line => $self->{line_prev},              $self->{state} = PI_STATE;
514                            column => $self->{column_prev});              !!!next-input-character;
515            $self->{state} = BOGUS_COMMENT_STATE;              redo A;
516            $self->{ct} = {type => COMMENT_TOKEN, data => '',            } else {
517                                      line => $self->{line_prev},              !!!cp (22);
518                                      column => $self->{column_prev},              !!!parse-error (type => 'pio',
519                                     };                              line => $self->{line_prev},
520            ## $self->{nc} is intentionally left as is                              column => $self->{column_prev});
521            redo A;              $self->{state} = BOGUS_COMMENT_STATE;
522          } else {              $self->{ct} = {type => COMMENT_TOKEN, data => '',
523                               line => $self->{line_prev},
524                               column => $self->{column_prev},
525                              };
526                ## $self->{nc} is intentionally left as is
527                redo A;
528              }
529            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
530            !!!cp (23);            !!!cp (23);
531            !!!parse-error (type => 'bare stago',            !!!parse-error (type => 'bare stago',
532                            line => $self->{line_prev},                            line => $self->{line_prev},
533                            column => $self->{column_prev});                            column => $self->{column_prev});
534            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
535              $self->{s_kwd} = '';
536            ## reconsume            ## reconsume
537    
538            !!!emit ({type => CHARACTER_TOKEN, data => '<',            !!!emit ({type => CHARACTER_TOKEN, data => '<',
# Line 496  sub _get_next_token ($) { Line 541  sub _get_next_token ($) {
541                     });                     });
542    
543            redo A;            redo A;
544            } else {
545              ## XML5: "<:" is a parse error.
546              !!!cp (23.1);
547              $self->{ct} = {type => START_TAG_TOKEN,
548                                        tag_name => chr ($self->{nc}),
549                                        line => $self->{line_prev},
550                                        column => $self->{column_prev}};
551              $self->{state} = TAG_NAME_STATE;
552              !!!next-input-character;
553              redo A;
554          }          }
555        } else {        } else {
556          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 516  sub _get_next_token ($) { Line 571  sub _get_next_token ($) {
571            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
572            !!!cp (28);            !!!cp (28);
573            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
574              $self->{s_kwd} = '';
575            ## Reconsume.            ## Reconsume.
576            !!!emit ({type => CHARACTER_TOKEN, data => '</',            !!!emit ({type => CHARACTER_TOKEN, data => '</',
577                      line => $l, column => $c,                      line => $l, column => $c,
# Line 529  sub _get_next_token ($) { Line 585  sub _get_next_token ($) {
585          !!!cp (29);          !!!cp (29);
586          $self->{ct}          $self->{ct}
587              = {type => END_TAG_TOKEN,              = {type => END_TAG_TOKEN,
588                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
589                 line => $l, column => $c};                 line => $l, column => $c};
590          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
591          !!!next-input-character;          !!!next-input-character;
# Line 549  sub _get_next_token ($) { Line 605  sub _get_next_token ($) {
605                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
606                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
607          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
608            $self->{s_kwd} = '';
609          !!!next-input-character;          !!!next-input-character;
610          redo A;          redo A;
611        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
612          !!!cp (32);          !!!cp (32);
613          !!!parse-error (type => 'bare etago');          !!!parse-error (type => 'bare etago');
614            $self->{s_kwd} = '';
615          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
616          # reconsume          # reconsume
617    
# Line 593  sub _get_next_token ($) { Line 651  sub _get_next_token ($) {
651          } else {          } else {
652            !!!cp (25);            !!!cp (25);
653            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
654              $self->{s_kwd} = '';
655            ## Reconsume.            ## Reconsume.
656            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
657                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{s_kwd},
# Line 611  sub _get_next_token ($) { Line 670  sub _get_next_token ($) {
670            !!!cp (26);            !!!cp (26);
671            ## Reconsume.            ## Reconsume.
672            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
673              $self->{s_kwd} = '';
674            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
675                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{s_kwd},
676                      line => $self->{line_prev},                      line => $self->{line_prev},
# Line 652  sub _get_next_token ($) { Line 712  sub _get_next_token ($) {
712            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
713          }          }
714          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
715            $self->{s_kwd} = '';
716          !!!next-input-character;          !!!next-input-character;
717    
718          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 660  sub _get_next_token ($) { Line 721  sub _get_next_token ($) {
721        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
722                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
723          !!!cp (38);          !!!cp (38);
724          $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);          $self->{ct}->{tag_name}
725                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
726            # start tag or end tag            # start tag or end tag
727          ## Stay in this state          ## Stay in this state
728          !!!next-input-character;          !!!next-input-character;
# Line 683  sub _get_next_token ($) { Line 745  sub _get_next_token ($) {
745            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
746          }          }
747          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
748            $self->{s_kwd} = '';
749          # reconsume          # reconsume
750    
751          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 723  sub _get_next_token ($) { Line 786  sub _get_next_token ($) {
786            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
787          }          }
788          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
789            $self->{s_kwd} = '';
790          !!!next-input-character;          !!!next-input-character;
791    
792          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 732  sub _get_next_token ($) { Line 796  sub _get_next_token ($) {
796                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
797          !!!cp (49);          !!!cp (49);
798          $self->{ca}          $self->{ca}
799              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
800                 value => '',                 value => '',
801                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
802          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 760  sub _get_next_token ($) { Line 824  sub _get_next_token ($) {
824            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
825          }          }
826          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
827            $self->{s_kwd} = '';
828          # reconsume          # reconsume
829    
830          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 825  sub _get_next_token ($) { Line 890  sub _get_next_token ($) {
890            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
891          }          }
892          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
893            $self->{s_kwd} = '';
894          !!!next-input-character;          !!!next-input-character;
895    
896          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 833  sub _get_next_token ($) { Line 899  sub _get_next_token ($) {
899        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
900                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
901          !!!cp (63);          !!!cp (63);
902          $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);          $self->{ca}->{name}
903                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
904          ## Stay in the state          ## Stay in the state
905          !!!next-input-character;          !!!next-input-character;
906          redo A;          redo A;
# Line 862  sub _get_next_token ($) { Line 929  sub _get_next_token ($) {
929            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
930          }          }
931          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
932            $self->{s_kwd} = '';
933          # reconsume          # reconsume
934    
935          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 908  sub _get_next_token ($) { Line 976  sub _get_next_token ($) {
976            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
977          }          }
978          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
979            $self->{s_kwd} = '';
980          !!!next-input-character;          !!!next-input-character;
981    
982          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 917  sub _get_next_token ($) { Line 986  sub _get_next_token ($) {
986                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
987          !!!cp (76);          !!!cp (76);
988          $self->{ca}          $self->{ca}
989              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
990                 value => '',                 value => '',
991                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
992          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 945  sub _get_next_token ($) { Line 1014  sub _get_next_token ($) {
1014          } else {          } else {
1015            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1016          }          }
1017            $self->{s_kwd} = '';
1018          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1019          # reconsume          # reconsume
1020    
# Line 1006  sub _get_next_token ($) { Line 1076  sub _get_next_token ($) {
1076            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1077          }          }
1078          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1079            $self->{s_kwd} = '';
1080          !!!next-input-character;          !!!next-input-character;
1081    
1082          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1029  sub _get_next_token ($) { Line 1100  sub _get_next_token ($) {
1100            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1101          }          }
1102          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1103            $self->{s_kwd} = '';
1104          ## reconsume          ## reconsume
1105    
1106          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1081  sub _get_next_token ($) { Line 1153  sub _get_next_token ($) {
1153            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1154          }          }
1155          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1156            $self->{s_kwd} = '';
1157          ## reconsume          ## reconsume
1158    
1159          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1132  sub _get_next_token ($) { Line 1205  sub _get_next_token ($) {
1205            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1206          }          }
1207          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1208            $self->{s_kwd} = '';
1209          ## reconsume          ## reconsume
1210    
1211          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1182  sub _get_next_token ($) { Line 1256  sub _get_next_token ($) {
1256            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1257          }          }
1258          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1259            $self->{s_kwd} = '';
1260          !!!next-input-character;          !!!next-input-character;
1261    
1262          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1205  sub _get_next_token ($) { Line 1280  sub _get_next_token ($) {
1280            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1281          }          }
1282          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1283            $self->{s_kwd} = '';
1284          ## reconsume          ## reconsume
1285    
1286          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1253  sub _get_next_token ($) { Line 1329  sub _get_next_token ($) {
1329            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1330          }          }
1331          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1332            $self->{s_kwd} = '';
1333          !!!next-input-character;          !!!next-input-character;
1334    
1335          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1280  sub _get_next_token ($) { Line 1357  sub _get_next_token ($) {
1357            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1358          }          }
1359          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1360            $self->{s_kwd} = '';
1361          ## Reconsume.          ## Reconsume.
1362          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1363          redo A;          redo A;
# Line 1310  sub _get_next_token ($) { Line 1388  sub _get_next_token ($) {
1388          }          }
1389    
1390          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1391            $self->{s_kwd} = '';
1392          !!!next-input-character;          !!!next-input-character;
1393    
1394          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1332  sub _get_next_token ($) { Line 1411  sub _get_next_token ($) {
1411            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1412          }          }
1413          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1414            $self->{s_kwd} = '';
1415          ## Reconsume.          ## Reconsume.
1416          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1417          redo A;          redo A;
# Line 1352  sub _get_next_token ($) { Line 1432  sub _get_next_token ($) {
1432        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1433          !!!cp (124);          !!!cp (124);
1434          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1435            $self->{s_kwd} = '';
1436          !!!next-input-character;          !!!next-input-character;
1437    
1438          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1359  sub _get_next_token ($) { Line 1440  sub _get_next_token ($) {
1440        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1441          !!!cp (125);          !!!cp (125);
1442          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1443            $self->{s_kwd} = '';
1444          ## reconsume          ## reconsume
1445    
1446          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1390  sub _get_next_token ($) { Line 1472  sub _get_next_token ($) {
1472          $self->{s_kwd} = chr $self->{nc};          $self->{s_kwd} = chr $self->{nc};
1473          !!!next-input-character;          !!!next-input-character;
1474          redo A;          redo A;
1475        } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1476                 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and                   $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1477                    $self->{is_xml}) and
1478                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1479          !!!cp (135.4);                          !!!cp (135.4);                
1480          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
# Line 1500  sub _get_next_token ($) { Line 1583  sub _get_next_token ($) {
1583          redo A;          redo A;
1584        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{s_kwd} eq '[CDATA' and
1585                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1586          !!!cp (135.2);          if ($self->{is_xml} and
1587                not $self->{tainted} and
1588                @{$self->{open_elements} or []} == 0) {
1589              !!!cp (135.2);
1590              !!!parse-error (type => 'cdata outside of root element',
1591                              line => $self->{line_prev},
1592                              column => $self->{column_prev} - 7);
1593              $self->{tainted} = 1;
1594            } else {
1595              !!!cp (135.21);
1596            }
1597    
1598          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
1599                                    data => '',                                    data => '',
1600                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 1532  sub _get_next_token ($) { Line 1626  sub _get_next_token ($) {
1626          !!!cp (138);          !!!cp (138);
1627          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1628          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1629            $self->{s_kwd} = '';
1630          !!!next-input-character;          !!!next-input-character;
1631    
1632          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1541  sub _get_next_token ($) { Line 1636  sub _get_next_token ($) {
1636          !!!cp (139);          !!!cp (139);
1637          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1638          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1639            $self->{s_kwd} = '';
1640          ## reconsume          ## reconsume
1641    
1642          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1564  sub _get_next_token ($) { Line 1660  sub _get_next_token ($) {
1660          !!!cp (142);          !!!cp (142);
1661          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1662          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1663            $self->{s_kwd} = '';
1664          !!!next-input-character;          !!!next-input-character;
1665    
1666          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1573  sub _get_next_token ($) { Line 1670  sub _get_next_token ($) {
1670          !!!cp (143);          !!!cp (143);
1671          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1672          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1673            $self->{s_kwd} = '';
1674          ## reconsume          ## reconsume
1675    
1676          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1596  sub _get_next_token ($) { Line 1694  sub _get_next_token ($) {
1694          !!!cp (146);          !!!cp (146);
1695          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1696          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1697            $self->{s_kwd} = '';
1698          ## reconsume          ## reconsume
1699    
1700          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1621  sub _get_next_token ($) { Line 1720  sub _get_next_token ($) {
1720        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1721          !!!cp (149);          !!!cp (149);
1722          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1723            $self->{s_kwd} = '';
1724          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1725            $self->{s_kwd} = '';
1726          ## reconsume          ## reconsume
1727    
1728          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1638  sub _get_next_token ($) { Line 1739  sub _get_next_token ($) {
1739        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1740          !!!cp (151);          !!!cp (151);
1741          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1742            $self->{s_kwd} = '';
1743          !!!next-input-character;          !!!next-input-character;
1744    
1745          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1656  sub _get_next_token ($) { Line 1758  sub _get_next_token ($) {
1758          !!!cp (153);          !!!cp (153);
1759          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1760          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1761            $self->{s_kwd} = '';
1762          ## reconsume          ## reconsume
1763    
1764          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1694  sub _get_next_token ($) { Line 1797  sub _get_next_token ($) {
1797          !!!cp (158);          !!!cp (158);
1798          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
1799          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1800            $self->{s_kwd} = '';
1801          !!!next-input-character;          !!!next-input-character;
1802    
1803          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
# Line 1703  sub _get_next_token ($) { Line 1807  sub _get_next_token ($) {
1807          !!!cp (159);          !!!cp (159);
1808          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
1809          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1810            $self->{s_kwd} = '';
1811          ## reconsume          ## reconsume
1812    
1813          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
# Line 1726  sub _get_next_token ($) { Line 1831  sub _get_next_token ($) {
1831        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1832          !!!cp (162);          !!!cp (162);
1833          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1834            $self->{s_kwd} = '';
1835          !!!next-input-character;          !!!next-input-character;
1836    
1837          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 1735  sub _get_next_token ($) { Line 1841  sub _get_next_token ($) {
1841          !!!cp (163);          !!!cp (163);
1842          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1843          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1844            $self->{s_kwd} = '';
1845          ## reconsume          ## reconsume
1846    
1847          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1758  sub _get_next_token ($) { Line 1865  sub _get_next_token ($) {
1865        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1866          !!!cp (166);          !!!cp (166);
1867          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1868            $self->{s_kwd} = '';
1869          !!!next-input-character;          !!!next-input-character;
1870    
1871          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 1767  sub _get_next_token ($) { Line 1875  sub _get_next_token ($) {
1875          !!!cp (167);          !!!cp (167);
1876          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1877          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1878            $self->{s_kwd} = '';
1879          ## reconsume          ## reconsume
1880    
1881          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1895  sub _get_next_token ($) { Line 2004  sub _get_next_token ($) {
2004          !!!parse-error (type => 'no PUBLIC literal');          !!!parse-error (type => 'no PUBLIC literal');
2005    
2006          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2007            $self->{s_kwd} = '';
2008          !!!next-input-character;          !!!next-input-character;
2009    
2010          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1906  sub _get_next_token ($) { Line 2016  sub _get_next_token ($) {
2016          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2017    
2018          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2019            $self->{s_kwd} = '';
2020          ## reconsume          ## reconsume
2021    
2022          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1932  sub _get_next_token ($) { Line 2043  sub _get_next_token ($) {
2043          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2044    
2045          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2046            $self->{s_kwd} = '';
2047          !!!next-input-character;          !!!next-input-character;
2048    
2049          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1943  sub _get_next_token ($) { Line 2055  sub _get_next_token ($) {
2055          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2056    
2057          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2058            $self->{s_kwd} = '';
2059          ## reconsume          ## reconsume
2060    
2061          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1971  sub _get_next_token ($) { Line 2084  sub _get_next_token ($) {
2084          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2085    
2086          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2087            $self->{s_kwd} = '';
2088          !!!next-input-character;          !!!next-input-character;
2089    
2090          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1982  sub _get_next_token ($) { Line 2096  sub _get_next_token ($) {
2096          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2097    
2098          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2099            $self->{s_kwd} = '';
2100          ## reconsume          ## reconsume
2101    
2102          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2020  sub _get_next_token ($) { Line 2135  sub _get_next_token ($) {
2135        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2136          !!!cp (198);          !!!cp (198);
2137          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2138            $self->{s_kwd} = '';
2139          !!!next-input-character;          !!!next-input-character;
2140    
2141          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2030  sub _get_next_token ($) { Line 2146  sub _get_next_token ($) {
2146          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2147    
2148          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2149            $self->{s_kwd} = '';
2150          ## reconsume          ## reconsume
2151    
2152          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2067  sub _get_next_token ($) { Line 2184  sub _get_next_token ($) {
2184          !!!cp (204);          !!!cp (204);
2185          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
2186          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2187            $self->{s_kwd} = '';
2188          !!!next-input-character;          !!!next-input-character;
2189    
2190          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2078  sub _get_next_token ($) { Line 2196  sub _get_next_token ($) {
2196          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2197    
2198          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2199            $self->{s_kwd} = '';
2200          ## reconsume          ## reconsume
2201    
2202          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2104  sub _get_next_token ($) { Line 2223  sub _get_next_token ($) {
2223          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2224    
2225          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2226            $self->{s_kwd} = '';
2227          !!!next-input-character;          !!!next-input-character;
2228    
2229          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2115  sub _get_next_token ($) { Line 2235  sub _get_next_token ($) {
2235          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2236    
2237          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2238            $self->{s_kwd} = '';
2239          ## reconsume          ## reconsume
2240    
2241          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2143  sub _get_next_token ($) { Line 2264  sub _get_next_token ($) {
2264          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2265    
2266          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2267            $self->{s_kwd} = '';
2268          !!!next-input-character;          !!!next-input-character;
2269    
2270          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2154  sub _get_next_token ($) { Line 2276  sub _get_next_token ($) {
2276          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2277    
2278          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2279            $self->{s_kwd} = '';
2280          ## reconsume          ## reconsume
2281    
2282          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2180  sub _get_next_token ($) { Line 2303  sub _get_next_token ($) {
2303        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2304          !!!cp (216);          !!!cp (216);
2305          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2306            $self->{s_kwd} = '';
2307          !!!next-input-character;          !!!next-input-character;
2308    
2309          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2189  sub _get_next_token ($) { Line 2313  sub _get_next_token ($) {
2313          !!!cp (217);          !!!cp (217);
2314          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2315          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2316            $self->{s_kwd} = '';
2317          ## reconsume          ## reconsume
2318    
2319          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2208  sub _get_next_token ($) { Line 2333  sub _get_next_token ($) {
2333        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2334          !!!cp (219);          !!!cp (219);
2335          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2336            $self->{s_kwd} = '';
2337          !!!next-input-character;          !!!next-input-character;
2338    
2339          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2216  sub _get_next_token ($) { Line 2342  sub _get_next_token ($) {
2342        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2343          !!!cp (220);          !!!cp (220);
2344          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2345            $self->{s_kwd} = '';
2346          ## reconsume          ## reconsume
2347    
2348          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2241  sub _get_next_token ($) { Line 2368  sub _get_next_token ($) {
2368          !!!next-input-character;          !!!next-input-character;
2369          redo A;          redo A;
2370        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2371            if ($self->{is_xml}) {
2372              !!!cp (221.11);
2373              !!!parse-error (type => 'no mse'); ## TODO: type
2374            } else {
2375              !!!cp (221.12);
2376            }
2377    
2378          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2379            $self->{s_kwd} = '';
2380          !!!next-input-character;          !!!next-input-character;
2381          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
2382            !!!cp (221.2);            !!!cp (221.2);
# Line 2280  sub _get_next_token ($) { Line 2415  sub _get_next_token ($) {
2415      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2416        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2417          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2418            $self->{s_kwd} = '';
2419          !!!next-input-character;          !!!next-input-character;
2420          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
2421            !!!cp (221.7);            !!!cp (221.7);
# Line 2347  sub _get_next_token ($) { Line 2483  sub _get_next_token ($) {
2483        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2484          !!!cp (997);          !!!cp (997);
2485          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2486            $self->{s_kwd} = '';
2487          ## Reconsume.          ## Reconsume.
2488          !!!emit ({type => CHARACTER_TOKEN, data => '&',          !!!emit ({type => CHARACTER_TOKEN, data => '&',
2489                    line => $self->{line_prev},                    line => $self->{line_prev},
# Line 2357  sub _get_next_token ($) { Line 2494  sub _get_next_token ($) {
2494          !!!cp (996);          !!!cp (996);
2495          $self->{ca}->{value} .= '&';          $self->{ca}->{value} .= '&';
2496          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2497            $self->{s_kwd} = '';
2498          ## Reconsume.          ## Reconsume.
2499          redo A;          redo A;
2500        }        }
# Line 2387  sub _get_next_token ($) { Line 2525  sub _get_next_token ($) {
2525          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
2526            !!!cp (1019);            !!!cp (1019);
2527            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2528              $self->{s_kwd} = '';
2529            ## Reconsume.            ## Reconsume.
2530            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
2531                      data => '&#',                      data => '&#',
# Line 2398  sub _get_next_token ($) { Line 2537  sub _get_next_token ($) {
2537            !!!cp (993);            !!!cp (993);
2538            $self->{ca}->{value} .= '&#';            $self->{ca}->{value} .= '&#';
2539            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2540              $self->{s_kwd} = '';
2541            ## Reconsume.            ## Reconsume.
2542            redo A;            redo A;
2543          }          }
# Line 2443  sub _get_next_token ($) { Line 2583  sub _get_next_token ($) {
2583        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2584          !!!cp (992);          !!!cp (992);
2585          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2586            $self->{s_kwd} = '';
2587          ## Reconsume.          ## Reconsume.
2588          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2589                      has_reference => 1,
2590                    line => $l, column => $c,                    line => $l, column => $c,
2591                   });                   });
2592          redo A;          redo A;
# Line 2453  sub _get_next_token ($) { Line 2595  sub _get_next_token ($) {
2595          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
2596          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
2597          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2598            $self->{s_kwd} = '';
2599          ## Reconsume.          ## Reconsume.
2600          redo A;          redo A;
2601        }        }
# Line 2478  sub _get_next_token ($) { Line 2621  sub _get_next_token ($) {
2621          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
2622            !!!cp (1005);            !!!cp (1005);
2623            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2624              $self->{s_kwd} = '';
2625            ## Reconsume.            ## Reconsume.
2626            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
2627                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{s_kwd},
# Line 2489  sub _get_next_token ($) { Line 2633  sub _get_next_token ($) {
2633            !!!cp (989);            !!!cp (989);
2634            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{s_kwd};
2635            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2636              $self->{s_kwd} = '';
2637            ## Reconsume.            ## Reconsume.
2638            redo A;            redo A;
2639          }          }
# Line 2551  sub _get_next_token ($) { Line 2696  sub _get_next_token ($) {
2696        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2697          !!!cp (988);          !!!cp (988);
2698          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2699            $self->{s_kwd} = '';
2700          ## Reconsume.          ## Reconsume.
2701          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2702                      has_reference => 1,
2703                    line => $l, column => $c,                    line => $l, column => $c,
2704                   });                   });
2705          redo A;          redo A;
# Line 2561  sub _get_next_token ($) { Line 2708  sub _get_next_token ($) {
2708          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
2709          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
2710          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2711            $self->{s_kwd} = '';
2712          ## Reconsume.          ## Reconsume.
2713          redo A;          redo A;
2714        }        }
# Line 2643  sub _get_next_token ($) { Line 2791  sub _get_next_token ($) {
2791        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2792          !!!cp (986);          !!!cp (986);
2793          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2794            $self->{s_kwd} = '';
2795          ## Reconsume.          ## Reconsume.
2796          !!!emit ({type => CHARACTER_TOKEN,          !!!emit ({type => CHARACTER_TOKEN,
2797                    data => $data,                    data => $data,
2798                      has_reference => $has_ref,
2799                    line => $self->{line_prev},                    line => $self->{line_prev},
2800                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{s_kwd},
2801                   });                   });
# Line 2655  sub _get_next_token ($) { Line 2805  sub _get_next_token ($) {
2805          $self->{ca}->{value} .= $data;          $self->{ca}->{value} .= $data;
2806          $self->{ca}->{has_reference} = 1 if $has_ref;          $self->{ca}->{has_reference} = 1 if $has_ref;
2807          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2808            $self->{s_kwd} = '';
2809            ## Reconsume.
2810            redo A;
2811          }
2812    
2813        ## XML-only states
2814    
2815        } elsif ($self->{state} == PI_STATE) {
2816          if ($is_space->{$self->{nc}} or
2817              $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
2818              $self->{nc} == -1) {
2819            !!!parse-error (type => 'bare pio', ## TODO: type
2820                            line => $self->{line_prev},
2821                            column => $self->{column_prev}
2822                                - 1 * ($self->{nc} != -1));
2823            $self->{state} = BOGUS_COMMENT_STATE;
2824          ## Reconsume.          ## Reconsume.
2825            $self->{ct} = {type => COMMENT_TOKEN,
2826                           data => '?',
2827                           line => $self->{line_prev},
2828                           column => $self->{column_prev}
2829                               - 1 * ($self->{nc} != -1),
2830                          };
2831            redo A;
2832          } else {
2833            $self->{ct} = {type => PI_TOKEN,
2834                           target => chr $self->{nc},
2835                           data => '',
2836                           line => $self->{line_prev},
2837                           column => $self->{column_prev} - 1,
2838                          };
2839            $self->{state} = PI_TARGET_STATE;
2840            !!!next-input-character;
2841          redo A;          redo A;
2842        }        }
2843        } elsif ($self->{state} == PI_TARGET_STATE) {
2844          if ($is_space->{$self->{nc}}) {
2845            $self->{state} = PI_TARGET_AFTER_STATE;
2846            !!!next-input-character;
2847            redo A;
2848          } elsif ($self->{nc} == -1) {
2849            !!!parse-error (type => 'no pic'); ## TODO: type
2850            $self->{state} = DATA_STATE;
2851            $self->{s_kwd} = '';
2852            ## Reconsume.
2853            !!!emit ($self->{ct}); # pi
2854            redo A;
2855          } elsif ($self->{nc} == 0x003F) { # ?
2856            $self->{state} = PI_AFTER_STATE;
2857            !!!next-input-character;
2858            redo A;
2859          } else {
2860            ## XML5: typo ("tag name" -> "target")
2861            $self->{ct}->{target} .= chr $self->{nc}; # pi
2862            !!!next-input-character;
2863            redo A;
2864          }
2865        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
2866          if ($is_space->{$self->{nc}}) {
2867            ## Stay in the state.
2868            !!!next-input-character;
2869            redo A;
2870          } else {
2871            $self->{state} = PI_DATA_STATE;
2872            ## Reprocess.
2873            redo A;
2874          }
2875        } elsif ($self->{state} == PI_DATA_STATE) {
2876          if ($self->{nc} == 0x003F) { # ?
2877            $self->{state} = PI_DATA_AFTER_STATE;
2878            !!!next-input-character;
2879            redo A;
2880          } elsif ($self->{nc} == -1) {
2881            !!!parse-error (type => 'no pic'); ## TODO: type
2882            $self->{state} = DATA_STATE;
2883            $self->{s_kwd} = '';
2884            ## Reprocess.
2885            !!!emit ($self->{ct}); # pi
2886            redo A;
2887          } else {
2888            $self->{ct}->{data} .= chr $self->{nc}; # pi
2889            $self->{read_until}->($self->{ct}->{data}, q[?],
2890                                  length $self->{ct}->{data});
2891            ## Stay in the state.
2892            !!!next-input-character;
2893            ## Reprocess.
2894            redo A;
2895          }
2896        } elsif ($self->{state} == PI_AFTER_STATE) {
2897          if ($self->{nc} == 0x003E) { # >
2898            $self->{state} = DATA_STATE;
2899            $self->{s_kwd} = '';
2900            !!!next-input-character;
2901            !!!emit ($self->{ct}); # pi
2902            redo A;
2903          } elsif ($self->{nc} == 0x003F) { # ?
2904            !!!parse-error (type => 'no s after target', ## TODO: type
2905                            line => $self->{line_prev},
2906                            column => $self->{column_prev}); ## XML5: no error
2907            $self->{ct}->{data} .= '?';
2908            $self->{state} = PI_DATA_AFTER_STATE;
2909            !!!next-input-character;
2910            redo A;
2911          } else {
2912            !!!parse-error (type => 'no s after target', ## TODO: type
2913                            line => $self->{line_prev},
2914                            column => $self->{column_prev}
2915                                + 1 * ($self->{nc} == -1)); ## XML5: no error
2916            $self->{ct}->{data} .= '?'; ## XML5: not appended
2917            $self->{state} = PI_DATA_STATE;
2918            ## Reprocess.
2919            redo A;
2920          }
2921        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
2922          ## XML5: Same as "pi after state" in XML5
2923          if ($self->{nc} == 0x003E) { # >
2924            $self->{state} = DATA_STATE;
2925            $self->{s_kwd} = '';
2926            !!!next-input-character;
2927            !!!emit ($self->{ct}); # pi
2928            redo A;
2929          } elsif ($self->{nc} == 0x003F) { # ?
2930            $self->{ct}->{data} .= '?';
2931            ## Stay in the state.
2932            !!!next-input-character;
2933            redo A;
2934          } else {
2935            $self->{ct}->{data} .= '?'; ## XML5: not appended
2936            $self->{state} = PI_DATA_STATE;
2937            ## Reprocess.
2938            redo A;
2939          }
2940            
2941      } else {      } else {
2942        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
2943      }      }

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.9

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24