/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1 by wakaba, Tue Oct 14 02:27:58 2008 UTC revision 1.7 by wakaba, Tue Oct 14 15:25:50 2008 UTC
# Line 2  package Whatpm::HTML::Tokenizer; Line 2  package Whatpm::HTML::Tokenizer;
2  use strict;  use strict;
3  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5    BEGIN {
6      require Exporter;
7      push our @ISA, 'Exporter';
8    
9      our @EXPORT_OK = qw(
10        DOCTYPE_TOKEN
11        COMMENT_TOKEN
12        START_TAG_TOKEN
13        END_TAG_TOKEN
14        END_OF_FILE_TOKEN
15        CHARACTER_TOKEN
16        PI_TOKEN
17        ABORT_TOKEN
18      );
19      
20      our %EXPORT_TAGS = (
21        token => [qw(
22          DOCTYPE_TOKEN
23          COMMENT_TOKEN
24          START_TAG_TOKEN
25          END_TAG_TOKEN
26          END_OF_FILE_TOKEN
27          CHARACTER_TOKEN
28          PI_TOKEN
29          ABORT_TOKEN
30        )],
31      );
32    }
33    
34    ## Token types
35    
36    sub DOCTYPE_TOKEN () { 1 }
37    sub COMMENT_TOKEN () { 2 }
38    sub START_TAG_TOKEN () { 3 }
39    sub END_TAG_TOKEN () { 4 }
40    sub END_OF_FILE_TOKEN () { 5 }
41    sub CHARACTER_TOKEN () { 6 }
42    sub PI_TOKEN () { 7 } # XML5
43    sub ABORT_TOKEN () { 8 } # Not a token actually
44    
45  package Whatpm::HTML;  package Whatpm::HTML;
46    
47    BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49  ## Content model flags  ## Content model flags
50    
51  sub CM_ENTITY () { 0b001 } # & markup in data  sub CM_ENTITY () { 0b001 } # & markup in data
# Line 72  sub HEXREF_HEX_STATE () { 48 } Line 114  sub HEXREF_HEX_STATE () { 48 }
114  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
115  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
 ## Token types  
   
 sub DOCTYPE_TOKEN () { 1 }  
 sub COMMENT_TOKEN () { 2 }  
 sub START_TAG_TOKEN () { 3 }  
 sub END_TAG_TOKEN () { 4 }  
 sub END_OF_FILE_TOKEN () { 5 }  
 sub CHARACTER_TOKEN () { 6 }  
   
117  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
118  ## list and descriptions)  ## list and descriptions)
119    
# Line 142  sub _initialize_tokenizer ($) { Line 175  sub _initialize_tokenizer ($) {
175    #$self->{level}    #$self->{level}
176    #$self->{set_nc}    #$self->{set_nc}
177    #$self->{parse_error}    #$self->{parse_error}
178      #$self->{is_xml} (if XML)
179    
180    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
181    #$self->{s_kwd}; # state keyword - initialized when used    $self->{s_kwd} = ''; # state keyword
182    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
183    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
184    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 175  sub _initialize_tokenizer ($) { Line 209  sub _initialize_tokenizer ($) {
209  ##        ->{value}  ##        ->{value}
210  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
211  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
212    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
213  ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
214  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
215  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 278  sub _get_next_token ($) { Line 313  sub _get_next_token ($) {
313          }          }
314        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
315          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
316            $self->{s_kwd} .= '-';            if ($self->{s_kwd} eq '<!-') {
             
           if ($self->{s_kwd} eq '<!--') {  
317              !!!cp (3);              !!!cp (3);
318              $self->{escape} = 1; # unless $self->{escape};              $self->{escape} = 1; # unless $self->{escape};
319              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
320              #              #
321            } elsif ($self->{s_kwd} eq '---') {            } elsif ($self->{s_kwd} eq '-') {
322              !!!cp (4);              !!!cp (4);
323              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
324              #              #
325              } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
326                !!!cp (4.1);
327                $self->{s_kwd} .= '-';
328                #
329            } else {            } else {
330              !!!cp (5);              !!!cp (5);
331                $self->{s_kwd} = '-';
332              #              #
333            }            }
334          }          }
# Line 326  sub _get_next_token ($) { Line 364  sub _get_next_token ($) {
364            if ($self->{s_kwd} eq '--') {            if ($self->{s_kwd} eq '--') {
365              !!!cp (8);              !!!cp (8);
366              delete $self->{escape};              delete $self->{escape};
367                #
368            } else {            } else {
369              !!!cp (9);              !!!cp (9);
370                #
371            }            }
372            } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
373              !!!cp (9.1);
374              !!!parse-error (type => 'unmatched mse', ## TODO: type
375                              line => $self->{line_prev},
376                              column => $self->{column_prev} - 1);
377              #
378          } else {          } else {
379            !!!cp (10);            !!!cp (10);
380              #
381          }          }
382                    
383          $self->{s_kwd} = '';          $self->{s_kwd} = '';
384          #          #
385          } elsif ($self->{nc} == 0x005D) { # ]
386            if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
387              !!!cp (10.1);
388              $self->{s_kwd} .= ']';
389            } elsif ($self->{s_kwd} eq ']]') {
390              !!!cp (10.2);
391              #
392            } else {
393              !!!cp (10.3);
394              $self->{s_kwd} = '';
395            }
396            #
397        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
398          !!!cp (11);          !!!cp (11);
399          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 352  sub _get_next_token ($) { Line 411  sub _get_next_token ($) {
411                     data => chr $self->{nc},                     data => chr $self->{nc},
412                     line => $self->{line}, column => $self->{column},                     line => $self->{line}, column => $self->{column},
413                    };                    };
414        if ($self->{read_until}->($token->{data}, q[-!<>&],        if ($self->{read_until}->($token->{data}, q{-!<>&\]},
415                                  length $token->{data})) {                                  length $token->{data})) {
416          $self->{s_kwd} = '';          $self->{s_kwd} = '';
417        }        }
418    
419        ## Stay in the data state.        ## Stay in the data state.
420        if ($self->{content_model} == PCDATA_CONTENT_MODEL) {        if (not $self->{is_xml} and
421              $self->{content_model} == PCDATA_CONTENT_MODEL) {
422          !!!cp (13);          !!!cp (13);
423          $self->{state} = PCDATA_STATE;          $self->{state} = PCDATA_STATE;
424        } else {        } else {
# Line 386  sub _get_next_token ($) { Line 446  sub _get_next_token ($) {
446    
447          ## reconsume          ## reconsume
448          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
449            $self->{s_kwd} = '';
450          !!!emit ({type => CHARACTER_TOKEN, data => '<',          !!!emit ({type => CHARACTER_TOKEN, data => '<',
451                    line => $self->{line_prev},                    line => $self->{line_prev},
452                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 407  sub _get_next_token ($) { Line 468  sub _get_next_token ($) {
468            !!!cp (19);            !!!cp (19);
469            $self->{ct}            $self->{ct}
470              = {type => START_TAG_TOKEN,              = {type => START_TAG_TOKEN,
471                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
472                 line => $self->{line_prev},                 line => $self->{line_prev},
473                 column => $self->{column_prev}};                 column => $self->{column_prev}};
474            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
# Line 429  sub _get_next_token ($) { Line 490  sub _get_next_token ($) {
490                            line => $self->{line_prev},                            line => $self->{line_prev},
491                            column => $self->{column_prev});                            column => $self->{column_prev});
492            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
493              $self->{s_kwd} = '';
494            !!!next-input-character;            !!!next-input-character;
495    
496            !!!emit ({type => CHARACTER_TOKEN, data => '<>',            !!!emit ({type => CHARACTER_TOKEN, data => '<>',
# Line 455  sub _get_next_token ($) { Line 517  sub _get_next_token ($) {
517                            line => $self->{line_prev},                            line => $self->{line_prev},
518                            column => $self->{column_prev});                            column => $self->{column_prev});
519            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
520              $self->{s_kwd} = '';
521            ## reconsume            ## reconsume
522    
523            !!!emit ({type => CHARACTER_TOKEN, data => '<',            !!!emit ({type => CHARACTER_TOKEN, data => '<',
# Line 483  sub _get_next_token ($) { Line 546  sub _get_next_token ($) {
546            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
547            !!!cp (28);            !!!cp (28);
548            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
549              $self->{s_kwd} = '';
550            ## Reconsume.            ## Reconsume.
551            !!!emit ({type => CHARACTER_TOKEN, data => '</',            !!!emit ({type => CHARACTER_TOKEN, data => '</',
552                      line => $l, column => $c,                      line => $l, column => $c,
# Line 496  sub _get_next_token ($) { Line 560  sub _get_next_token ($) {
560          !!!cp (29);          !!!cp (29);
561          $self->{ct}          $self->{ct}
562              = {type => END_TAG_TOKEN,              = {type => END_TAG_TOKEN,
563                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
564                 line => $l, column => $c};                 line => $l, column => $c};
565          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
566          !!!next-input-character;          !!!next-input-character;
# Line 516  sub _get_next_token ($) { Line 580  sub _get_next_token ($) {
580                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
581                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
582          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
583            $self->{s_kwd} = '';
584          !!!next-input-character;          !!!next-input-character;
585          redo A;          redo A;
586        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
587          !!!cp (32);          !!!cp (32);
588          !!!parse-error (type => 'bare etago');          !!!parse-error (type => 'bare etago');
589            $self->{s_kwd} = '';
590          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
591          # reconsume          # reconsume
592    
# Line 560  sub _get_next_token ($) { Line 626  sub _get_next_token ($) {
626          } else {          } else {
627            !!!cp (25);            !!!cp (25);
628            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
629              $self->{s_kwd} = '';
630            ## Reconsume.            ## Reconsume.
631            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
632                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{s_kwd},
# Line 578  sub _get_next_token ($) { Line 645  sub _get_next_token ($) {
645            !!!cp (26);            !!!cp (26);
646            ## Reconsume.            ## Reconsume.
647            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
648              $self->{s_kwd} = '';
649            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
650                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{s_kwd},
651                      line => $self->{line_prev},                      line => $self->{line_prev},
# Line 619  sub _get_next_token ($) { Line 687  sub _get_next_token ($) {
687            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
688          }          }
689          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
690            $self->{s_kwd} = '';
691          !!!next-input-character;          !!!next-input-character;
692    
693          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 627  sub _get_next_token ($) { Line 696  sub _get_next_token ($) {
696        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
697                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
698          !!!cp (38);          !!!cp (38);
699          $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);          $self->{ct}->{tag_name}
700                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
701            # start tag or end tag            # start tag or end tag
702          ## Stay in this state          ## Stay in this state
703          !!!next-input-character;          !!!next-input-character;
# Line 650  sub _get_next_token ($) { Line 720  sub _get_next_token ($) {
720            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
721          }          }
722          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
723            $self->{s_kwd} = '';
724          # reconsume          # reconsume
725    
726          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 690  sub _get_next_token ($) { Line 761  sub _get_next_token ($) {
761            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
762          }          }
763          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
764            $self->{s_kwd} = '';
765          !!!next-input-character;          !!!next-input-character;
766    
767          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 699  sub _get_next_token ($) { Line 771  sub _get_next_token ($) {
771                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
772          !!!cp (49);          !!!cp (49);
773          $self->{ca}          $self->{ca}
774              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
775                 value => '',                 value => '',
776                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
777          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 727  sub _get_next_token ($) { Line 799  sub _get_next_token ($) {
799            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
800          }          }
801          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
802            $self->{s_kwd} = '';
803          # reconsume          # reconsume
804    
805          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 792  sub _get_next_token ($) { Line 865  sub _get_next_token ($) {
865            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
866          }          }
867          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
868            $self->{s_kwd} = '';
869          !!!next-input-character;          !!!next-input-character;
870    
871          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 800  sub _get_next_token ($) { Line 874  sub _get_next_token ($) {
874        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
875                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
876          !!!cp (63);          !!!cp (63);
877          $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);          $self->{ca}->{name}
878                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
879          ## Stay in the state          ## Stay in the state
880          !!!next-input-character;          !!!next-input-character;
881          redo A;          redo A;
# Line 829  sub _get_next_token ($) { Line 904  sub _get_next_token ($) {
904            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
905          }          }
906          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
907            $self->{s_kwd} = '';
908          # reconsume          # reconsume
909    
910          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 875  sub _get_next_token ($) { Line 951  sub _get_next_token ($) {
951            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
952          }          }
953          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
954            $self->{s_kwd} = '';
955          !!!next-input-character;          !!!next-input-character;
956    
957          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 884  sub _get_next_token ($) { Line 961  sub _get_next_token ($) {
961                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
962          !!!cp (76);          !!!cp (76);
963          $self->{ca}          $self->{ca}
964              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
965                 value => '',                 value => '',
966                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
967          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 912  sub _get_next_token ($) { Line 989  sub _get_next_token ($) {
989          } else {          } else {
990            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
991          }          }
992            $self->{s_kwd} = '';
993          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
994          # reconsume          # reconsume
995    
# Line 973  sub _get_next_token ($) { Line 1051  sub _get_next_token ($) {
1051            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1052          }          }
1053          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1054            $self->{s_kwd} = '';
1055          !!!next-input-character;          !!!next-input-character;
1056    
1057          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 996  sub _get_next_token ($) { Line 1075  sub _get_next_token ($) {
1075            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1076          }          }
1077          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1078            $self->{s_kwd} = '';
1079          ## reconsume          ## reconsume
1080    
1081          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1048  sub _get_next_token ($) { Line 1128  sub _get_next_token ($) {
1128            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1129          }          }
1130          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1131            $self->{s_kwd} = '';
1132          ## reconsume          ## reconsume
1133    
1134          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1099  sub _get_next_token ($) { Line 1180  sub _get_next_token ($) {
1180            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1181          }          }
1182          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1183            $self->{s_kwd} = '';
1184          ## reconsume          ## reconsume
1185    
1186          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1149  sub _get_next_token ($) { Line 1231  sub _get_next_token ($) {
1231            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1232          }          }
1233          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1234            $self->{s_kwd} = '';
1235          !!!next-input-character;          !!!next-input-character;
1236    
1237          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1172  sub _get_next_token ($) { Line 1255  sub _get_next_token ($) {
1255            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1256          }          }
1257          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1258            $self->{s_kwd} = '';
1259          ## reconsume          ## reconsume
1260    
1261          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1220  sub _get_next_token ($) { Line 1304  sub _get_next_token ($) {
1304            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1305          }          }
1306          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1307            $self->{s_kwd} = '';
1308          !!!next-input-character;          !!!next-input-character;
1309    
1310          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1247  sub _get_next_token ($) { Line 1332  sub _get_next_token ($) {
1332            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1333          }          }
1334          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1335            $self->{s_kwd} = '';
1336          ## Reconsume.          ## Reconsume.
1337          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1338          redo A;          redo A;
# Line 1277  sub _get_next_token ($) { Line 1363  sub _get_next_token ($) {
1363          }          }
1364    
1365          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1366            $self->{s_kwd} = '';
1367          !!!next-input-character;          !!!next-input-character;
1368    
1369          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
# Line 1299  sub _get_next_token ($) { Line 1386  sub _get_next_token ($) {
1386            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1387          }          }
1388          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1389            $self->{s_kwd} = '';
1390          ## Reconsume.          ## Reconsume.
1391          !!!emit ($self->{ct}); # start tag or end tag          !!!emit ($self->{ct}); # start tag or end tag
1392          redo A;          redo A;
# Line 1319  sub _get_next_token ($) { Line 1407  sub _get_next_token ($) {
1407        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1408          !!!cp (124);          !!!cp (124);
1409          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1410            $self->{s_kwd} = '';
1411          !!!next-input-character;          !!!next-input-character;
1412    
1413          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1326  sub _get_next_token ($) { Line 1415  sub _get_next_token ($) {
1415        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1416          !!!cp (125);          !!!cp (125);
1417          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1418            $self->{s_kwd} = '';
1419          ## reconsume          ## reconsume
1420    
1421          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1357  sub _get_next_token ($) { Line 1447  sub _get_next_token ($) {
1447          $self->{s_kwd} = chr $self->{nc};          $self->{s_kwd} = chr $self->{nc};
1448          !!!next-input-character;          !!!next-input-character;
1449          redo A;          redo A;
1450        } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1451                 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and                   $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1452                    $self->{is_xml}) and
1453                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1454          !!!cp (135.4);                          !!!cp (135.4);                
1455          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
# Line 1468  sub _get_next_token ($) { Line 1559  sub _get_next_token ($) {
1559        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{s_kwd} eq '[CDATA' and
1560                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1561          !!!cp (135.2);          !!!cp (135.2);
1562    
1563            if ($self->{is_xml} and
1564                not $self->{tainted} and
1565                @{$self->{open_elements} or []} == 0) {
1566              !!!parse-error (type => 'cdata outside of root element',
1567                              line => $self->{line_prev},
1568                              column => $self->{column_prev} - 7);
1569              $self->{tainted} = 1;
1570            }
1571    
1572          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
1573                                    data => '',                                    data => '',
1574                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 1499  sub _get_next_token ($) { Line 1600  sub _get_next_token ($) {
1600          !!!cp (138);          !!!cp (138);
1601          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1602          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1603            $self->{s_kwd} = '';
1604          !!!next-input-character;          !!!next-input-character;
1605    
1606          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1508  sub _get_next_token ($) { Line 1610  sub _get_next_token ($) {
1610          !!!cp (139);          !!!cp (139);
1611          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1612          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1613            $self->{s_kwd} = '';
1614          ## reconsume          ## reconsume
1615    
1616          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1531  sub _get_next_token ($) { Line 1634  sub _get_next_token ($) {
1634          !!!cp (142);          !!!cp (142);
1635          !!!parse-error (type => 'bogus comment');          !!!parse-error (type => 'bogus comment');
1636          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1637            $self->{s_kwd} = '';
1638          !!!next-input-character;          !!!next-input-character;
1639    
1640          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1540  sub _get_next_token ($) { Line 1644  sub _get_next_token ($) {
1644          !!!cp (143);          !!!cp (143);
1645          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1646          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1647            $self->{s_kwd} = '';
1648          ## reconsume          ## reconsume
1649    
1650          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1563  sub _get_next_token ($) { Line 1668  sub _get_next_token ($) {
1668          !!!cp (146);          !!!cp (146);
1669          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1670          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1671            $self->{s_kwd} = '';
1672          ## reconsume          ## reconsume
1673    
1674          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1588  sub _get_next_token ($) { Line 1694  sub _get_next_token ($) {
1694        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1695          !!!cp (149);          !!!cp (149);
1696          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1697            $self->{s_kwd} = '';
1698          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1699            $self->{s_kwd} = '';
1700          ## reconsume          ## reconsume
1701    
1702          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1605  sub _get_next_token ($) { Line 1713  sub _get_next_token ($) {
1713        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
1714          !!!cp (151);          !!!cp (151);
1715          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1716            $self->{s_kwd} = '';
1717          !!!next-input-character;          !!!next-input-character;
1718    
1719          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1623  sub _get_next_token ($) { Line 1732  sub _get_next_token ($) {
1732          !!!cp (153);          !!!cp (153);
1733          !!!parse-error (type => 'unclosed comment');          !!!parse-error (type => 'unclosed comment');
1734          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1735            $self->{s_kwd} = '';
1736          ## reconsume          ## reconsume
1737    
1738          !!!emit ($self->{ct}); # comment          !!!emit ($self->{ct}); # comment
# Line 1661  sub _get_next_token ($) { Line 1771  sub _get_next_token ($) {
1771          !!!cp (158);          !!!cp (158);
1772          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
1773          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1774            $self->{s_kwd} = '';
1775          !!!next-input-character;          !!!next-input-character;
1776    
1777          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
# Line 1670  sub _get_next_token ($) { Line 1781  sub _get_next_token ($) {
1781          !!!cp (159);          !!!cp (159);
1782          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
1783          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1784            $self->{s_kwd} = '';
1785          ## reconsume          ## reconsume
1786    
1787          !!!emit ($self->{ct}); # DOCTYPE (quirks)          !!!emit ($self->{ct}); # DOCTYPE (quirks)
# Line 1693  sub _get_next_token ($) { Line 1805  sub _get_next_token ($) {
1805        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1806          !!!cp (162);          !!!cp (162);
1807          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1808            $self->{s_kwd} = '';
1809          !!!next-input-character;          !!!next-input-character;
1810    
1811          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 1702  sub _get_next_token ($) { Line 1815  sub _get_next_token ($) {
1815          !!!cp (163);          !!!cp (163);
1816          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1817          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1818            $self->{s_kwd} = '';
1819          ## reconsume          ## reconsume
1820    
1821          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1725  sub _get_next_token ($) { Line 1839  sub _get_next_token ($) {
1839        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1840          !!!cp (166);          !!!cp (166);
1841          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1842            $self->{s_kwd} = '';
1843          !!!next-input-character;          !!!next-input-character;
1844    
1845          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 1734  sub _get_next_token ($) { Line 1849  sub _get_next_token ($) {
1849          !!!cp (167);          !!!cp (167);
1850          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1851          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1852            $self->{s_kwd} = '';
1853          ## reconsume          ## reconsume
1854    
1855          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1862  sub _get_next_token ($) { Line 1978  sub _get_next_token ($) {
1978          !!!parse-error (type => 'no PUBLIC literal');          !!!parse-error (type => 'no PUBLIC literal');
1979    
1980          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1981            $self->{s_kwd} = '';
1982          !!!next-input-character;          !!!next-input-character;
1983    
1984          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1873  sub _get_next_token ($) { Line 1990  sub _get_next_token ($) {
1990          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
1991    
1992          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1993            $self->{s_kwd} = '';
1994          ## reconsume          ## reconsume
1995    
1996          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1899  sub _get_next_token ($) { Line 2017  sub _get_next_token ($) {
2017          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2018    
2019          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2020            $self->{s_kwd} = '';
2021          !!!next-input-character;          !!!next-input-character;
2022    
2023          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1910  sub _get_next_token ($) { Line 2029  sub _get_next_token ($) {
2029          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2030    
2031          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2032            $self->{s_kwd} = '';
2033          ## reconsume          ## reconsume
2034    
2035          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1938  sub _get_next_token ($) { Line 2058  sub _get_next_token ($) {
2058          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2059    
2060          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2061            $self->{s_kwd} = '';
2062          !!!next-input-character;          !!!next-input-character;
2063    
2064          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1949  sub _get_next_token ($) { Line 2070  sub _get_next_token ($) {
2070          !!!parse-error (type => 'unclosed PUBLIC literal');          !!!parse-error (type => 'unclosed PUBLIC literal');
2071    
2072          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2073            $self->{s_kwd} = '';
2074          ## reconsume          ## reconsume
2075    
2076          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 1987  sub _get_next_token ($) { Line 2109  sub _get_next_token ($) {
2109        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2110          !!!cp (198);          !!!cp (198);
2111          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2112            $self->{s_kwd} = '';
2113          !!!next-input-character;          !!!next-input-character;
2114    
2115          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 1997  sub _get_next_token ($) { Line 2120  sub _get_next_token ($) {
2120          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2121    
2122          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2123            $self->{s_kwd} = '';
2124          ## reconsume          ## reconsume
2125    
2126          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2034  sub _get_next_token ($) { Line 2158  sub _get_next_token ($) {
2158          !!!cp (204);          !!!cp (204);
2159          !!!parse-error (type => 'no SYSTEM literal');          !!!parse-error (type => 'no SYSTEM literal');
2160          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2161            $self->{s_kwd} = '';
2162          !!!next-input-character;          !!!next-input-character;
2163    
2164          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2045  sub _get_next_token ($) { Line 2170  sub _get_next_token ($) {
2170          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2171    
2172          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2173            $self->{s_kwd} = '';
2174          ## reconsume          ## reconsume
2175    
2176          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2071  sub _get_next_token ($) { Line 2197  sub _get_next_token ($) {
2197          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2198    
2199          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2200            $self->{s_kwd} = '';
2201          !!!next-input-character;          !!!next-input-character;
2202    
2203          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2082  sub _get_next_token ($) { Line 2209  sub _get_next_token ($) {
2209          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2210    
2211          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2212            $self->{s_kwd} = '';
2213          ## reconsume          ## reconsume
2214    
2215          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2110  sub _get_next_token ($) { Line 2238  sub _get_next_token ($) {
2238          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2239    
2240          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2241            $self->{s_kwd} = '';
2242          !!!next-input-character;          !!!next-input-character;
2243    
2244          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2121  sub _get_next_token ($) { Line 2250  sub _get_next_token ($) {
2250          !!!parse-error (type => 'unclosed SYSTEM literal');          !!!parse-error (type => 'unclosed SYSTEM literal');
2251    
2252          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2253            $self->{s_kwd} = '';
2254          ## reconsume          ## reconsume
2255    
2256          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2147  sub _get_next_token ($) { Line 2277  sub _get_next_token ($) {
2277        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2278          !!!cp (216);          !!!cp (216);
2279          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2280            $self->{s_kwd} = '';
2281          !!!next-input-character;          !!!next-input-character;
2282    
2283          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2156  sub _get_next_token ($) { Line 2287  sub _get_next_token ($) {
2287          !!!cp (217);          !!!cp (217);
2288          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
2289          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2290            $self->{s_kwd} = '';
2291          ## reconsume          ## reconsume
2292    
2293          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2175  sub _get_next_token ($) { Line 2307  sub _get_next_token ($) {
2307        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2308          !!!cp (219);          !!!cp (219);
2309          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2310            $self->{s_kwd} = '';
2311          !!!next-input-character;          !!!next-input-character;
2312    
2313          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2183  sub _get_next_token ($) { Line 2316  sub _get_next_token ($) {
2316        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2317          !!!cp (220);          !!!cp (220);
2318          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2319            $self->{s_kwd} = '';
2320          ## reconsume          ## reconsume
2321    
2322          !!!emit ($self->{ct}); # DOCTYPE          !!!emit ($self->{ct}); # DOCTYPE
# Line 2208  sub _get_next_token ($) { Line 2342  sub _get_next_token ($) {
2342          !!!next-input-character;          !!!next-input-character;
2343          redo A;          redo A;
2344        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2345            if ($self->{is_xml}) {
2346              !!!parse-error (type => 'no mse'); ## TODO: type
2347            }
2348    
2349          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2350            $self->{s_kwd} = '';
2351          !!!next-input-character;          !!!next-input-character;
2352          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
2353            !!!cp (221.2);            !!!cp (221.2);
# Line 2247  sub _get_next_token ($) { Line 2386  sub _get_next_token ($) {
2386      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2387        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2388          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2389            $self->{s_kwd} = '';
2390          !!!next-input-character;          !!!next-input-character;
2391          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
2392            !!!cp (221.7);            !!!cp (221.7);
# Line 2314  sub _get_next_token ($) { Line 2454  sub _get_next_token ($) {
2454        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2455          !!!cp (997);          !!!cp (997);
2456          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2457            $self->{s_kwd} = '';
2458          ## Reconsume.          ## Reconsume.
2459          !!!emit ({type => CHARACTER_TOKEN, data => '&',          !!!emit ({type => CHARACTER_TOKEN, data => '&',
2460                    line => $self->{line_prev},                    line => $self->{line_prev},
# Line 2324  sub _get_next_token ($) { Line 2465  sub _get_next_token ($) {
2465          !!!cp (996);          !!!cp (996);
2466          $self->{ca}->{value} .= '&';          $self->{ca}->{value} .= '&';
2467          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2468            $self->{s_kwd} = '';
2469          ## Reconsume.          ## Reconsume.
2470          redo A;          redo A;
2471        }        }
# Line 2354  sub _get_next_token ($) { Line 2496  sub _get_next_token ($) {
2496          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
2497            !!!cp (1019);            !!!cp (1019);
2498            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2499              $self->{s_kwd} = '';
2500            ## Reconsume.            ## Reconsume.
2501            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
2502                      data => '&#',                      data => '&#',
# Line 2365  sub _get_next_token ($) { Line 2508  sub _get_next_token ($) {
2508            !!!cp (993);            !!!cp (993);
2509            $self->{ca}->{value} .= '&#';            $self->{ca}->{value} .= '&#';
2510            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2511              $self->{s_kwd} = '';
2512            ## Reconsume.            ## Reconsume.
2513            redo A;            redo A;
2514          }          }
# Line 2410  sub _get_next_token ($) { Line 2554  sub _get_next_token ($) {
2554        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2555          !!!cp (992);          !!!cp (992);
2556          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2557            $self->{s_kwd} = '';
2558          ## Reconsume.          ## Reconsume.
2559          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2560                      has_reference => 1,
2561                    line => $l, column => $c,                    line => $l, column => $c,
2562                   });                   });
2563          redo A;          redo A;
# Line 2420  sub _get_next_token ($) { Line 2566  sub _get_next_token ($) {
2566          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
2567          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
2568          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2569            $self->{s_kwd} = '';
2570          ## Reconsume.          ## Reconsume.
2571          redo A;          redo A;
2572        }        }
# Line 2445  sub _get_next_token ($) { Line 2592  sub _get_next_token ($) {
2592          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
2593            !!!cp (1005);            !!!cp (1005);
2594            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2595              $self->{s_kwd} = '';
2596            ## Reconsume.            ## Reconsume.
2597            !!!emit ({type => CHARACTER_TOKEN,            !!!emit ({type => CHARACTER_TOKEN,
2598                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{s_kwd},
# Line 2456  sub _get_next_token ($) { Line 2604  sub _get_next_token ($) {
2604            !!!cp (989);            !!!cp (989);
2605            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{s_kwd};
2606            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
2607              $self->{s_kwd} = '';
2608            ## Reconsume.            ## Reconsume.
2609            redo A;            redo A;
2610          }          }
# Line 2518  sub _get_next_token ($) { Line 2667  sub _get_next_token ($) {
2667        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2668          !!!cp (988);          !!!cp (988);
2669          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2670            $self->{s_kwd} = '';
2671          ## Reconsume.          ## Reconsume.
2672          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,          !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2673                      has_reference => 1,
2674                    line => $l, column => $c,                    line => $l, column => $c,
2675                   });                   });
2676          redo A;          redo A;
# Line 2528  sub _get_next_token ($) { Line 2679  sub _get_next_token ($) {
2679          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
2680          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
2681          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2682            $self->{s_kwd} = '';
2683          ## Reconsume.          ## Reconsume.
2684          redo A;          redo A;
2685        }        }
# Line 2610  sub _get_next_token ($) { Line 2762  sub _get_next_token ($) {
2762        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
2763          !!!cp (986);          !!!cp (986);
2764          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2765            $self->{s_kwd} = '';
2766          ## Reconsume.          ## Reconsume.
2767          !!!emit ({type => CHARACTER_TOKEN,          !!!emit ({type => CHARACTER_TOKEN,
2768                    data => $data,                    data => $data,
2769                      has_reference => $has_ref,
2770                    line => $self->{line_prev},                    line => $self->{line_prev},
2771                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{s_kwd},
2772                   });                   });
# Line 2622  sub _get_next_token ($) { Line 2776  sub _get_next_token ($) {
2776          $self->{ca}->{value} .= $data;          $self->{ca}->{value} .= $data;
2777          $self->{ca}->{has_reference} = 1 if $has_ref;          $self->{ca}->{has_reference} = 1 if $has_ref;
2778          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
2779            $self->{s_kwd} = '';
2780          ## Reconsume.          ## Reconsume.
2781          redo A;          redo A;
2782        }        }

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.7

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24