/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.1 by wakaba, Tue Oct 14 02:27:58 2008 UTC revision 1.8 by wakaba, Wed Oct 15 04:38:22 2008 UTC
# Line 2  package Whatpm::HTML::Tokenizer; Line 2  package Whatpm::HTML::Tokenizer;
2  use strict;  use strict;
3  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5    BEGIN {
6      require Exporter;
7      push our @ISA, 'Exporter';
8    
9      our @EXPORT_OK = qw(
10        DOCTYPE_TOKEN
11        COMMENT_TOKEN
12        START_TAG_TOKEN
13        END_TAG_TOKEN
14        END_OF_FILE_TOKEN
15        CHARACTER_TOKEN
16        PI_TOKEN
17        ABORT_TOKEN
18      );
19      
20      our %EXPORT_TAGS = (
21        token => [qw(
22          DOCTYPE_TOKEN
23          COMMENT_TOKEN
24          START_TAG_TOKEN
25          END_TAG_TOKEN
26          END_OF_FILE_TOKEN
27          CHARACTER_TOKEN
28          PI_TOKEN
29          ABORT_TOKEN
30        )],
31      );
32    }
33    
34    ## Token types
35    
36    sub DOCTYPE_TOKEN () { 1 }
37    sub COMMENT_TOKEN () { 2 }
38    sub START_TAG_TOKEN () { 3 }
39    sub END_TAG_TOKEN () { 4 }
40    sub END_OF_FILE_TOKEN () { 5 }
41    sub CHARACTER_TOKEN () { 6 }
42    sub PI_TOKEN () { 7 } # XML5
43    sub ABORT_TOKEN () { 8 } # Not a token actually
44    
45  package Whatpm::HTML;  package Whatpm::HTML;
46    
47    BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49  ## Content model flags  ## Content model flags
50    
51  sub CM_ENTITY () { 0b001 } # & markup in data  sub CM_ENTITY () { 0b001 } # & markup in data
# Line 72  sub HEXREF_HEX_STATE () { 48 } Line 114  sub HEXREF_HEX_STATE () { 48 }
114  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
115  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117  ## Token types  ## XML states
118    sub PI_STATE () { 51 }
119  sub DOCTYPE_TOKEN () { 1 }  sub PI_TARGET_STATE () { 52 }
120  sub COMMENT_TOKEN () { 2 }  sub PI_TARGET_AFTER_STATE () { 53 }
121  sub START_TAG_TOKEN () { 3 }  sub PI_DATA_STATE () { 54 }
122  sub END_TAG_TOKEN () { 4 }  sub PI_AFTER_STATE () { 55 }
123  sub END_OF_FILE_TOKEN () { 5 }  sub PI_DATA_AFTER_STATE () { 56 }
 sub CHARACTER_TOKEN () { 6 }  
124    
125  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
126  ## list and descriptions)  ## list and descriptions)
# Line 142  sub _initialize_tokenizer ($) { Line 183  sub _initialize_tokenizer ($) {
183    #$self->{level}    #$self->{level}
184    #$self->{set_nc}    #$self->{set_nc}
185    #$self->{parse_error}    #$self->{parse_error}
186      #$self->{is_xml} (if XML)
187    
188    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
189    #$self->{s_kwd}; # state keyword - initialized when used    $self->{s_kwd} = ''; # state keyword
190    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
191    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
192    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 175  sub _initialize_tokenizer ($) { Line 217  sub _initialize_tokenizer ($) {
217  ##        ->{value}  ##        ->{value}
218  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
219  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
220    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
221  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
222  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
223  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 278  sub _get_next_token ($) { Line 321  sub _get_next_token ($) {
321          }          }
322        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
323          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
324            $self->{s_kwd} .= '-';            if ($self->{s_kwd} eq '<!-') {
             
           if ($self->{s_kwd} eq '<!--') {  
325              !!!cp (3);              !!!cp (3);
326              $self->{escape} = 1; # unless $self->{escape};              $self->{escape} = 1; # unless $self->{escape};
327              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
328              #              #
329            } elsif ($self->{s_kwd} eq '---') {            } elsif ($self->{s_kwd} eq '-') {
330              !!!cp (4);              !!!cp (4);
331              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
332              #              #
333              } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
334                !!!cp (4.1);
335                $self->{s_kwd} .= '-';
336                #
337            } else {            } else {
338              !!!cp (5);              !!!cp (5);
339                $self->{s_kwd} = '-';
340              #              #
341            }            }
342          }          }
# Line 326  sub _get_next_token ($) { Line 372  sub _get_next_token ($) {
372            if ($self->{s_kwd} eq '--') {            if ($self->{s_kwd} eq '--') {
373              !!!cp (8);              !!!cp (8);
374              delete $self->{escape};              delete $self->{escape};
375                #
376            } else {            } else {
377              !!!cp (9);              !!!cp (9);
378                #
379            }            }
380            } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
381              !!!cp (9.1);
382              !!!parse-error (type => 'unmatched mse', ## TODO: type
383                              line => $self->{line_prev},
384                              column => $self->{column_prev} - 1);
385              #
386          } else {          } else {
387            !!!cp (10);            !!!cp (10);
388              #
389          }          }
390                    
391          $self->{s_kwd} = '';          $self->{s_kwd} = '';
392          #          #
393          } elsif ($self->{nc} == 0x005D) { # ]
394            if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
395              !!!cp (10.1);
396              $self->{s_kwd} .= ']';
397            } elsif ($self->{s_kwd} eq ']]') {
398              !!!cp (10.2);
399              #
400            } else {
401              !!!cp (10.3);
402              $self->{s_kwd} = '';
403            }
404            #
405        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
406          !!!cp (11);          !!!cp (11);
407          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 352  sub _get_next_token ($) { Line 419  sub _get_next_token ($) {
419                     data => chr $self->{nc},                     data => chr $self->{nc},
420                     line => $self->{line}, column => $self->{column},                     line => $self->{line}, column => $self->{column},
421                    };                    };
422        if ($self->{read_until}->($token->{data}, q[-!<>&],        if ($self->{read_until}->($token->{data}, q{-!<>&\]},
423                                  length $token->{data})) {                                  length $token->{data})) {
424          $self->{s_kwd} = '';          $self->{s_kwd} = '';
425        }        }
426    
427        ## Stay in the data state.        ## Stay in the data state.
428        if ($self->{content_model} == PCDATA_CONTENT_MODEL) {        if (not $self->{is_xml} and
429              $self->{content_model} == PCDATA_CONTENT_MODEL) {
430          !!!cp (13);          !!!cp (13);
431          $self->{state} = PCDATA_STATE;          $self->{state} = PCDATA_STATE;
432        } else {        } else {
# Line 386  sub _get_next_token ($) { Line 454  sub _get_next_token ($) {
454    
455          ## reconsume          ## reconsume
456          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
457            $self->{s_kwd} = '';
458          !!!emit ({type => CHARACTER_TOKEN, data => '<',          !!!emit ({type => CHARACTER_TOKEN, data => '<',
459                    line => $self->{line_prev},                    line => $self->{line_prev},
460                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 407  sub _get_next_token ($) { Line 476  sub _get_next_token ($) {
476            !!!cp (19);            !!!cp (19);
477            $self->{ct}            $self->{ct}
478              = {type => START_TAG_TOKEN,              = {type => START_TAG_TOKEN,
479                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
480                 line => $self->{line_prev},                 line => $self->{line_prev},
481                 column => $self->{column_prev}};                 column => $self->{column_prev}};
482            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
# Line 429  sub _get_next_token ($) { Line 498  sub _get_next_token ($) {
498                            line => $self->{line_prev},                            line => $self->{line_prev},
499                            column => $self->{column_prev});                            column => $self->{column_prev});
500            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
501              $self->{s_kwd} = '';
502            !!!next-input-character;            !!!next-input-character;
503    
504            !!!emit ({type => CHARACTER_TOKEN, data => '<>',            !!!emit ({type => CHARACTER_TOKEN, data => '<>',
# Line 438  sub _get_next_token ($) { Line 508  sub _get_next_token ($) {
508    
509            redo A;            redo A;
510          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
511            !!!cp (22);            if ($self->{is_xml}) {
512            !!!parse-error (type => 'pio',              !!!cp (22.1);
513                            line => $self->{line_prev},              $self->{state} = PI_STATE;
514                            column => $self->{column_prev});              !!!next-input-character;
515            $self->{state} = BOGUS_COMMENT_STATE;              redo A;
516            $self->{ct} = {type => COMMENT_TOKEN, data => '',            } else {
517                                      line => $self->{line_prev},              !!!cp (22);
518                                      column => $self->{column_prev},              !!!parse-error (type => 'pio',
519                                     };                              line => $self->{line_prev},
520            ## $self->{nc} is intentionally left as is                              column => $self->{column_prev});
521            redo A;              $self->{state} = BOGUS_COMMENT_STATE;
522                $self->{ct} = {type => COMMENT_TOKEN, data => '',
523                               line => $self->{line_prev},
524                               column => $self->{column_prev},
525                              };
526                ## $self->{nc} is intentionally left as is
527                redo A;
528              }
529          } else {          } else {
530            !!!cp (23);            !!!cp (23);
531            !!!parse-error (type => 'bare stago',            !!!parse-error (type => 'bare stago',
532                            line => $self->{line_prev},                            line => $self->{line_prev},
533                            column => $self->{column_prev});                            column => $self->{column_prev});
534            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
535              $self->{s_kwd} = '';
536            ## reconsume            ## reconsume
537    
538            !!!emit ({type => CHARACTER_TOKEN, data => '<',            !!!emit ({type => CHARACTER_TOKEN, data => '<',
# Line 483  sub _get_next_token ($) { Line 561  sub _get_next_token ($) {
561            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
562            !!!cp (28);            !!!cp (28);
563            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
564              $self->{s_kwd} = '';
565            ## Reconsume.            ## Reconsume.
566            !!!emit ({type => CHARACTER_TOKEN, data => '</',            !!!emit ({type => CHARACTER_TOKEN, data => '</',
567                      line => $l, column => $c,                      line => $l, column => $c,
# Line 496  sub _get_next_token ($) { Line 575  sub _get_next_token ($) {
575          !!!cp (29);          !!!cp (29);
576          $self->{ct}          $self->{ct}
577              = {type => END_TAG_TOKEN,              = {type => END_TAG_TOKEN,
578                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
579                 line => $l, column => $c};                 line => $l, column => $c};
580          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
581          !!!next-input-character;          !!!next-input-character;
# Line 516  sub _get_next_token ($) { Line 595  sub _get_next_token ($) {
595                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
596                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
597          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
598            $self->{s_kwd} = '';
599          !!!next-input-character;          !!!next-input-character;
600          redo A;          redo A;
601        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
602          !!!cp (32);          !!!cp (32);
603          !!!parse-error (type => 'bare etago');          !!!parse-error (type => 'bare etago');
604            $self->{s_kwd} = '';
605          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
606          # reconsume          # reconsume
607    
# Line 560  sub _get_next_token ($) { Line 641  sub _get_next_token ($) {
641          } else {          } else {
642            !!!cp (25);            !!!cp (25);
643            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
644              $self->{s_kwd} = '';
645            ## Reconsume.            ## Reconsume.
646            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
647                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{s_kwd},
# Line 578  sub _get_next_token ($) { Line 660  sub _get_next_token ($) {
660            !!!cp (26);            !!!cp (26);
661            ## Reconsume.            ## Reconsume.
662            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
663              $self->{s_kwd} = '';
664            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
665                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{s_kwd},
666                      line => $self->{line_prev},                      line => $self->{line_prev},
# Line 619  sub _get_next_token ($) { Line 702  sub _get_next_token ($) {
702            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
703          }          }
704          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
705            $self->{s_kwd} = '';
706          !!!next-input-character;          !!!next-input-character;
707    
708          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 627  sub _get_next_token ($) { Line 711  sub _get_next_token ($) {
711        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
712                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
713          !!!cp (38);          !!!cp (38);
714          $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);          $self->{ct}->{tag_name}
715                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
716            # start tag or end tag            # start tag or end tag
717          ## Stay in this state          ## Stay in this state
718          !!!next-input-character;          !!!next-input-character;
# Line 650  sub _get_next_token ($) { Line 735  sub _get_next_token ($) {
735            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
736          }          }
737          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
738            $self->{s_kwd} = '';
739          # reconsume          # reconsume
740    
741          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 690  sub _get_next_token ($) { Line 776  sub _get_next_token ($) {
776            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
777          }          }
778          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
779            $self->{s_kwd} = '';
780          !!!next-input-character;          !!!next-input-character;
781    
782          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 699  sub _get_next_token ($) { Line 786  sub _get_next_token ($) {
786                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
787          !!!cp (49);          !!!cp (49);
788          $self->{ca}          $self->{ca}
789              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
790                 value => '',                 value => '',
791                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
792          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 727  sub _get_next_token ($) { Line 814  sub _get_next_token ($) {
814            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
815          }          }
816          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
817            $self->{s_kwd} = '';
818          # reconsume          # reconsume
819    
820          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 792  sub _get_next_token ($) { Line 880  sub _get_next_token ($) {
880            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
881          }          }
882          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
883            $self->{s_kwd} = '';
884          !!!next-input-character;          !!!next-input-character;
885    
886          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 800  sub _get_next_token ($) { Line 889  sub _get_next_token ($) {
889        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
890                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
891          !!!cp (63);          !!!cp (63);
892          $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);          $self->{ca}->{name}
893                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
894          ## Stay in the state          ## Stay in the state
895          !!!next-input-character;          !!!next-input-character;
896          redo A;          redo A;
# Line 829  sub _get_next_token ($) { Line 919  sub _get_next_token ($) {
919            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
920          }          }
921          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
922            $self->{s_kwd} = '';
923          # reconsume          # reconsume
924    
925          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 875  sub _get_next_token ($) { Line 966  sub _get_next_token ($) {
966            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
967          }          }
968          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
969            $self->{s_kwd} = '';
970          !!!next-input-character;          !!!next-input-character;
971    
972          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 884  sub _get_next_token ($) { Line 976  sub _get_next_token ($) {
976                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
977          !!!cp (76);          !!!cp (76);
978          $self->{ca}          $self->{ca}
979              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
980                 value => '',                 value => '',
981                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
982          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 912  sub _get_next_token ($) { Line 1004  sub _get_next_token ($) {
1004          } else {          } else {
1005            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1006          }          }
1007            $self->{s_kwd} = '';
1008          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1009          # reconsume          # reconsume
1010    
# Line 973  sub _get_next_token ($) { Line 1066  sub _get_next_token ($) {
1066            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1067          }          }
1068          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1069            $self->{s_kwd} = '';
1070          !!!next-input-character;          !!!next-input-character;
1071    
1072          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 996  sub _get_next_token ($) { Line 1090  sub _get_next_token ($) {
1090            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1091          }          }
1092          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1093            $self->{s_kwd} = '';
1094          ## reconsume          ## reconsume
1095    
1096          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1048  sub _get_next_token ($) { Line 1143  sub _get_next_token ($) {
1143            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1144          }          }
1145          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1146            $self->{s_kwd} = '';
1147          ## reconsume          ## reconsume
1148    
1149          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1099  sub _get_next_token ($) { Line 1195  sub _get_next_token ($) {
1195            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1196          }          }
1197          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1198            $self->{s_kwd} = '';
1199          ## reconsume          ## reconsume
1200    
1201          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1149  sub _get_next_token ($) { Line 1246  sub _get_next_token ($) {
1246            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1247          }          }
1248          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1249            $self->{s_kwd} = '';
1250          !!!next-input-character;          !!!next-input-character;
1251    
1252          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1172  sub _get_next_token ($) { Line 1270  sub _get_next_token ($) {
1270            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1271          }          }
1272          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1273            $self->{s_kwd} = '';
1274          ## reconsume          ## reconsume
1275    
1276          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1220  sub _get_next_token ($) { Line 1319  sub _get_next_token ($) {
1319            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1320          }          }
1321          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1322            $self->{s_kwd} = '';
1323          !!!next-input-character;          !!!next-input-character;
1324    
1325          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1247  sub _get_next_token ($) { Line 1347  sub _get_next_token ($) {
1347            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1348          }          }
1349          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1350            $self->{s_kwd} = '';
1351          ## Reconsume.          ## Reconsume.
1352          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1353          redo A;          redo A;
# Line 1277  sub _get_next_token ($) { Line 1378  sub _get_next_token ($) {
1378          }          }
1379    
1380          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1381            $self->{s_kwd} = '';
1382          !!!next-input-character;          !!!next-input-character;
1383    
1384          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1299  sub _get_next_token ($) { Line 1401  sub _get_next_token ($) {
1401            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1402          }          }
1403          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1404            $self->{s_kwd} = '';
1405          ## Reconsume.          ## Reconsume.
1406          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1407          redo A;          redo A;
# Line 1319  sub _get_next_token ($) { Line 1422  sub _get_next_token ($) {
1422        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1423          !!!cp (124);          !!!cp (124);
1424          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1425            $self->{s_kwd} = '';
1426          !!!next-input-character;          !!!next-input-character;
1427    
1428          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1326  sub _get_next_token ($) { Line 1430  sub _get_next_token ($) {
1430        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1431          !!!cp (125);          !!!cp (125);
1432          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1433            $self->{s_kwd} = '';
1434          ## reconsume          ## reconsume
1435    
1436          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1357  sub _get_next_token ($) { Line 1462  sub _get_next_token ($) {
1462          $self->{s_kwd} = chr $self->{nc};          $self->{s_kwd} = chr $self->{nc};
1463          !!!next-input-character;          !!!next-input-character;
1464          redo A;          redo A;
1465        } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1466                 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and                   $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1467                    $self->{is_xml}) and
1468                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1469          !!!cp (135.4);                          !!!cp (135.4);                
1470          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
# Line 1467  sub _get_next_token ($) { Line 1573  sub _get_next_token ($) {
1573          redo A;          redo A;
1574        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{s_kwd} eq '[CDATA' and
1575                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1576          !!!cp (135.2);          if ($self->{is_xml} and
1577                not $self->{tainted} and
1578                @{$self->{open_elements} or []} == 0) {
1579              !!!cp (135.2);
1580              !!!parse-error (type => 'cdata outside of root element',
1581                              line => $self->{line_prev},
1582                              column => $self->{column_prev} - 7);
1583              $self->{tainted} = 1;
1584            } else {
1585              !!!cp (135.21);
1586            }
1587    
1588          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
1589                                    data => '',                                    data => '',
1590                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 1499  sub _get_next_token ($) { Line 1616  sub _get_next_token ($) {
1616          !!!cp (138);          !!!cp (138);
1617          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1618          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1619            $self->{s_kwd} = '';
1620          !!!next-input-character;          !!!next-input-character;
1621    
1622          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1508  sub _get_next_token ($) { Line 1626  sub _get_next_token ($) {
1626          !!!cp (139);          !!!cp (139);
1627          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1628          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1629            $self->{s_kwd} = '';
1630          ## reconsume          ## reconsume
1631    
1632          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1531  sub _get_next_token ($) { Line 1650  sub _get_next_token ($) {
1650          !!!cp (142);          !!!cp (142);
1651          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1652          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1653            $self->{s_kwd} = '';
1654          !!!next-input-character;          !!!next-input-character;
1655    
1656          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1540  sub _get_next_token ($) { Line 1660  sub _get_next_token ($) {
1660          !!!cp (143);          !!!cp (143);
1661          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1662          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1663            $self->{s_kwd} = '';
1664          ## reconsume          ## reconsume
1665    
1666          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1563  sub _get_next_token ($) { Line 1684  sub _get_next_token ($) {
1684          !!!cp (146);          !!!cp (146);
1685          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1686          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1687            $self->{s_kwd} = '';
1688          ## reconsume          ## reconsume
1689    
1690          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1588  sub _get_next_token ($) { Line 1710  sub _get_next_token ($) {
1710        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1711          !!!cp (149);          !!!cp (149);
1712          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1713            $self->{s_kwd} = '';
1714          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1715            $self->{s_kwd} = '';
1716          ## reconsume          ## reconsume
1717    
1718          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1605  sub _get_next_token ($) { Line 1729  sub _get_next_token ($) {
1729        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1730          !!!cp (151);          !!!cp (151);
1731          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1732            $self->{s_kwd} = '';
1733          !!!next-input-character;          !!!next-input-character;
1734    
1735          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1623  sub _get_next_token ($) { Line 1748  sub _get_next_token ($) {
1748          !!!cp (153);          !!!cp (153);
1749          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1750          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1751            $self->{s_kwd} = '';
1752          ## reconsume          ## reconsume
1753    
1754          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1661  sub _get_next_token ($) { Line 1787  sub _get_next_token ($) {
1787          !!!cp (158);          !!!cp (158);
1788          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
1789          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1790            $self->{s_kwd} = '';
1791          !!!next-input-character;          !!!next-input-character;
1792    
1793          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
# Line 1670  sub _get_next_token ($) { Line 1797  sub _get_next_token ($) {
1797          !!!cp (159);          !!!cp (159);
1798          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
1799          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1800            $self->{s_kwd} = '';
1801          ## reconsume          ## reconsume
1802    
1803          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
# Line 1693  sub _get_next_token ($) { Line 1821  sub _get_next_token ($) {
1821        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1822          !!!cp (162);          !!!cp (162);
1823          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1824            $self->{s_kwd} = '';
1825          !!!next-input-character;          !!!next-input-character;
1826    
1827          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 1702  sub _get_next_token ($) { Line 1831  sub _get_next_token ($) {
1831          !!!cp (163);          !!!cp (163);
1832          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1833          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1834            $self->{s_kwd} = '';
1835          ## reconsume          ## reconsume
1836    
1837          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1725  sub _get_next_token ($) { Line 1855  sub _get_next_token ($) {
1855        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1856          !!!cp (166);          !!!cp (166);
1857          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1858            $self->{s_kwd} = '';
1859          !!!next-input-character;          !!!next-input-character;
1860    
1861          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 1734  sub _get_next_token ($) { Line 1865  sub _get_next_token ($) {
1865          !!!cp (167);          !!!cp (167);
1866          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1867          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1868            $self->{s_kwd} = '';
1869          ## reconsume          ## reconsume
1870    
1871          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1862  sub _get_next_token ($) { Line 1994  sub _get_next_token ($) {
1994          !!!parse-error (type => 'no PUBLIC literal');          !!!parse-error (type => 'no PUBLIC literal');
1995    
1996          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1997            $self->{s_kwd} = '';
1998          !!!next-input-character;          !!!next-input-character;
1999    
2000          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1873  sub _get_next_token ($) { Line 2006  sub _get_next_token ($) {
2006          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2007    
2008          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2009            $self->{s_kwd} = '';
2010          ## reconsume          ## reconsume
2011    
2012          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1899  sub _get_next_token ($) { Line 2033  sub _get_next_token ($) {
2033          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2034    
2035          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2036            $self->{s_kwd} = '';
2037          !!!next-input-character;          !!!next-input-character;
2038    
2039          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1910  sub _get_next_token ($) { Line 2045  sub _get_next_token ($) {
2045          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2046    
2047          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2048            $self->{s_kwd} = '';
2049          ## reconsume          ## reconsume
2050    
2051          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1938  sub _get_next_token ($) { Line 2074  sub _get_next_token ($) {
2074          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2075    
2076          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2077            $self->{s_kwd} = '';
2078          !!!next-input-character;          !!!next-input-character;
2079    
2080          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1949  sub _get_next_token ($) { Line 2086  sub _get_next_token ($) {
2086          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2087    
2088          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2089            $self->{s_kwd} = '';
2090          ## reconsume          ## reconsume
2091    
2092          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1987  sub _get_next_token ($) { Line 2125  sub _get_next_token ($) {
2125        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2126          !!!cp (198);          !!!cp (198);
2127          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2128            $self->{s_kwd} = '';
2129          !!!next-input-character;          !!!next-input-character;
2130    
2131          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 1997  sub _get_next_token ($) { Line 2136  sub _get_next_token ($) {
2136          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2137    
2138          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2139            $self->{s_kwd} = '';
2140          ## reconsume          ## reconsume
2141    
2142          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2034  sub _get_next_token ($) { Line 2174  sub _get_next_token ($) {
2174          !!!cp (204);          !!!cp (204);
2175          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
2176          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2177            $self->{s_kwd} = '';
2178          !!!next-input-character;          !!!next-input-character;
2179    
2180          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2045  sub _get_next_token ($) { Line 2186  sub _get_next_token ($) {
2186          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2187    
2188          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2189            $self->{s_kwd} = '';
2190          ## reconsume          ## reconsume
2191    
2192          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2071  sub _get_next_token ($) { Line 2213  sub _get_next_token ($) {
2213          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2214    
2215          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2216            $self->{s_kwd} = '';
2217          !!!next-input-character;          !!!next-input-character;
2218    
2219          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2082  sub _get_next_token ($) { Line 2225  sub _get_next_token ($) {
2225          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2226    
2227          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2228            $self->{s_kwd} = '';
2229          ## reconsume          ## reconsume
2230    
2231          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2110  sub _get_next_token ($) { Line 2254  sub _get_next_token ($) {
2254          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2255    
2256          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2257            $self->{s_kwd} = '';
2258          !!!next-input-character;          !!!next-input-character;
2259    
2260          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2121  sub _get_next_token ($) { Line 2266  sub _get_next_token ($) {
2266          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2267    
2268          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2269            $self->{s_kwd} = '';
2270          ## reconsume          ## reconsume
2271    
2272          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2147  sub _get_next_token ($) { Line 2293  sub _get_next_token ($) {
2293        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2294          !!!cp (216);          !!!cp (216);
2295          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2296            $self->{s_kwd} = '';
2297          !!!next-input-character;          !!!next-input-character;
2298    
2299          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2156  sub _get_next_token ($) { Line 2303  sub _get_next_token ($) {
2303          !!!cp (217);          !!!cp (217);
2304          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2305          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2306            $self->{s_kwd} = '';
2307          ## reconsume          ## reconsume
2308    
2309          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2175  sub _get_next_token ($) { Line 2323  sub _get_next_token ($) {
2323        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2324          !!!cp (219);          !!!cp (219);
2325          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2326            $self->{s_kwd} = '';
2327          !!!next-input-character;          !!!next-input-character;
2328    
2329          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2183  sub _get_next_token ($) { Line 2332  sub _get_next_token ($) {
2332        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2333          !!!cp (220);          !!!cp (220);
2334          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2335            $self->{s_kwd} = '';
2336          ## reconsume          ## reconsume
2337    
2338          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2208  sub _get_next_token ($) { Line 2358  sub _get_next_token ($) {
2358          !!!next-input-character;          !!!next-input-character;
2359          redo A;          redo A;
2360        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2361            if ($self->{is_xml}) {
2362              !!!cp (221.11);
2363              !!!parse-error (type => 'no mse'); ## TODO: type
2364            } else {
2365              !!!cp (221.12);
2366            }
2367    
2368          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2369            $self->{s_kwd} = '';
2370          !!!next-input-character;          !!!next-input-character;
2371          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
2372            !!!cp (221.2);            !!!cp (221.2);
# Line 2247  sub _get_next_token ($) { Line 2405  sub _get_next_token ($) {
2405      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2406        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2407          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2408            $self->{s_kwd} = '';
2409          !!!next-input-character;          !!!next-input-character;
2410          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
2411            !!!cp (221.7);            !!!cp (221.7);
# Line 2314  sub _get_next_token ($) { Line 2473  sub _get_next_token ($) {
2473        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2474          !!!cp (997);          !!!cp (997);
2475          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2476            $self->{s_kwd} = '';
2477          ## Reconsume.          ## Reconsume.
2478          !!!emit ({type => CHARACTER_TOKEN, data => '&',          !!!emit ({type => CHARACTER_TOKEN, data => '&',
2479                    line => $self->{line_prev},                    line => $self->{line_prev},
# Line 2324  sub _get_next_token ($) { Line 2484  sub _get_next_token ($) {
2484          !!!cp (996);          !!!cp (996);
2485          $self->{ca}->{value} .= '&';          $self->{ca}->{value} .= '&';
2486          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2487            $self->{s_kwd} = '';
2488          ## Reconsume.          ## Reconsume.
2489          redo A;          redo A;
2490        }        }
# Line 2354  sub _get_next_token ($) { Line 2515  sub _get_next_token ($) {
2515          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
2516            !!!cp (1019);            !!!cp (1019);
2517            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2518              $self->{s_kwd} = '';
2519            ## Reconsume.            ## Reconsume.
2520            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
2521                      data => '&#',                      data => '&#',
# Line 2365  sub _get_next_token ($) { Line 2527  sub _get_next_token ($) {
2527            !!!cp (993);            !!!cp (993);
2528            $self->{ca}->{value} .= '&#';            $self->{ca}->{value} .= '&#';
2529            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2530              $self->{s_kwd} = '';
2531            ## Reconsume.            ## Reconsume.
2532            redo A;            redo A;
2533          }          }
# Line 2410  sub _get_next_token ($) { Line 2573  sub _get_next_token ($) {
2573        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2574          !!!cp (992);          !!!cp (992);
2575          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2576            $self->{s_kwd} = '';
2577          ## Reconsume.          ## Reconsume.
2578          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2579                      has_reference => 1,
2580                    line => $l, column => $c,                    line => $l, column => $c,
2581                   });                   });
2582          redo A;          redo A;
# Line 2420  sub _get_next_token ($) { Line 2585  sub _get_next_token ($) {
2585          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
2586          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
2587          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2588            $self->{s_kwd} = '';
2589          ## Reconsume.          ## Reconsume.
2590          redo A;          redo A;
2591        }        }
# Line 2445  sub _get_next_token ($) { Line 2611  sub _get_next_token ($) {
2611          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
2612            !!!cp (1005);            !!!cp (1005);
2613            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2614              $self->{s_kwd} = '';
2615            ## Reconsume.            ## Reconsume.
2616            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
2617                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{s_kwd},
# Line 2456  sub _get_next_token ($) { Line 2623  sub _get_next_token ($) {
2623            !!!cp (989);            !!!cp (989);
2624            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{s_kwd};
2625            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2626              $self->{s_kwd} = '';
2627            ## Reconsume.            ## Reconsume.
2628            redo A;            redo A;
2629          }          }
# Line 2518  sub _get_next_token ($) { Line 2686  sub _get_next_token ($) {
2686        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2687          !!!cp (988);          !!!cp (988);
2688          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2689            $self->{s_kwd} = '';
2690          ## Reconsume.          ## Reconsume.
2691          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2692                      has_reference => 1,
2693                    line => $l, column => $c,                    line => $l, column => $c,
2694                   });                   });
2695          redo A;          redo A;
# Line 2528  sub _get_next_token ($) { Line 2698  sub _get_next_token ($) {
2698          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
2699          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
2700          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2701            $self->{s_kwd} = '';
2702          ## Reconsume.          ## Reconsume.
2703          redo A;          redo A;
2704        }        }
# Line 2610  sub _get_next_token ($) { Line 2781  sub _get_next_token ($) {
2781        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2782          !!!cp (986);          !!!cp (986);
2783          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2784            $self->{s_kwd} = '';
2785          ## Reconsume.          ## Reconsume.
2786          !!!emit ({type => CHARACTER_TOKEN,          !!!emit ({type => CHARACTER_TOKEN,
2787                    data => $data,                    data => $data,
2788                      has_reference => $has_ref,
2789                    line => $self->{line_prev},                    line => $self->{line_prev},
2790                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{s_kwd},
2791                   });                   });
# Line 2622  sub _get_next_token ($) { Line 2795  sub _get_next_token ($) {
2795          $self->{ca}->{value} .= $data;          $self->{ca}->{value} .= $data;
2796          $self->{ca}->{has_reference} = 1 if $has_ref;          $self->{ca}->{has_reference} = 1 if $has_ref;
2797          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2798            $self->{s_kwd} = '';
2799            ## Reconsume.
2800            redo A;
2801          }
2802    
2803        ## XML-only states
2804    
2805        } elsif ($self->{state} == PI_STATE) {
2806          if ($is_space->{$self->{nc}} or
2807              $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
2808              $self->{nc} == -1) {
2809            !!!parse-error (type => 'bare pio', ## TODO: type
2810                            line => $self->{line_prev},
2811                            column => $self->{column_prev}
2812                                - 1 * ($self->{nc} != -1));
2813            $self->{state} = BOGUS_COMMENT_STATE;
2814            ## Reconsume.
2815            $self->{ct} = {type => COMMENT_TOKEN,
2816                           data => '?',
2817                           line => $self->{line_prev},
2818                           column => $self->{column_prev}
2819                               - 1 * ($self->{nc} != -1),
2820                          };
2821            redo A;
2822          } else {
2823            $self->{ct} = {type => PI_TOKEN,
2824                           target => chr $self->{nc},
2825                           data => '',
2826                           line => $self->{line_prev},
2827                           column => $self->{column_prev} - 1,
2828                          };
2829            $self->{state} = PI_TARGET_STATE;
2830            !!!next-input-character;
2831            redo A;
2832          }
2833        } elsif ($self->{state} == PI_TARGET_STATE) {
2834          if ($is_space->{$self->{nc}}) {
2835            $self->{state} = PI_TARGET_AFTER_STATE;
2836            !!!next-input-character;
2837            redo A;
2838          } elsif ($self->{nc} == -1) {
2839            !!!parse-error (type => 'no pic'); ## TODO: type
2840            $self->{state} = DATA_STATE;
2841            $self->{s_kwd} = '';
2842          ## Reconsume.          ## Reconsume.
2843            !!!emit ($self->{ct}); # pi
2844            redo A;
2845          } elsif ($self->{nc} == 0x003F) { # ?
2846            $self->{state} = PI_AFTER_STATE;
2847            !!!next-input-character;
2848            redo A;
2849          } else {
2850            ## XML5: typo ("tag name" -> "target")
2851            $self->{ct}->{target} .= chr $self->{nc}; # pi
2852            !!!next-input-character;
2853            redo A;
2854          }
2855        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
2856          if ($is_space->{$self->{nc}}) {
2857            ## Stay in the state.
2858            !!!next-input-character;
2859            redo A;
2860          } else {
2861            $self->{state} = PI_DATA_STATE;
2862            ## Reprocess.
2863            redo A;
2864          }
2865        } elsif ($self->{state} == PI_DATA_STATE) {
2866          if ($self->{nc} == 0x003F) { # ?
2867            $self->{state} = PI_DATA_AFTER_STATE;
2868            !!!next-input-character;
2869            redo A;
2870          } elsif ($self->{nc} == -1) {
2871            !!!parse-error (type => 'no pic'); ## TODO: type
2872            $self->{state} = DATA_STATE;
2873            $self->{s_kwd} = '';
2874            ## Reprocess.
2875            !!!emit ($self->{ct}); # pi
2876            redo A;
2877          } else {
2878            $self->{ct}->{data} .= chr $self->{nc}; # pi
2879            $self->{read_until}->($self->{ct}->{data}, q[?],
2880                                  length $self->{ct}->{data});
2881            ## Stay in the state.
2882            !!!next-input-character;
2883            ## Reprocess.
2884          redo A;          redo A;
2885        }        }
2886        } elsif ($self->{state} == PI_AFTER_STATE) {
2887          if ($self->{nc} == 0x003E) { # >
2888            $self->{state} = DATA_STATE;
2889            $self->{s_kwd} = '';
2890            !!!next-input-character;
2891            !!!emit ($self->{ct}); # pi
2892            redo A;
2893          } elsif ($self->{nc} == 0x003F) { # ?
2894            !!!parse-error (type => 'no s after target', ## TODO: type
2895                            line => $self->{line_prev},
2896                            column => $self->{column_prev}); ## XML5: no error
2897            $self->{ct}->{data} .= '?';
2898            $self->{state} = PI_DATA_AFTER_STATE;
2899            !!!next-input-character;
2900            redo A;
2901          } else {
2902            !!!parse-error (type => 'no s after target', ## TODO: type
2903                            line => $self->{line_prev},
2904                            column => $self->{column_prev}
2905                                + 1 * ($self->{nc} == -1)); ## XML5: no error
2906            $self->{ct}->{data} .= '?'; ## XML5: not appended
2907            $self->{state} = PI_DATA_STATE;
2908            ## Reprocess.
2909            redo A;
2910          }
2911        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
2912          ## XML5: Same as "pi after state" in XML5
2913          if ($self->{nc} == 0x003E) { # >
2914            $self->{state} = DATA_STATE;
2915            $self->{s_kwd} = '';
2916            !!!next-input-character;
2917            !!!emit ($self->{ct}); # pi
2918            redo A;
2919          } elsif ($self->{nc} == 0x003F) { # ?
2920            $self->{ct}->{data} .= '?';
2921            ## Stay in the state.
2922            !!!next-input-character;
2923            redo A;
2924          } else {
2925            $self->{ct}->{data} .= '?'; ## XML5: not appended
2926            $self->{state} = PI_DATA_STATE;
2927            ## Reprocess.
2928            redo A;
2929          }
2930            
2931      } else {      } else {
2932        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
2933      }      }

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.8

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24