/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2 by wakaba, Tue Oct 14 04:32:49 2008 UTC revision 1.12 by wakaba, Wed Oct 15 12:49:49 2008 UTC
# Line 31  BEGIN { Line 31  BEGIN {
31    );    );
32  }  }
33    
34    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
35    
36  ## Token types  ## Token types
37    
38  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
39  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
40  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
41  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
42  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
43  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
44  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
45  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
46    
47    ## XML5: XML5 has "empty tag token".  In this implementation, it is
48    ## represented as a start tag token with $self->{self_closing} flag
49    ## set to true.
50    
51    ## XML5: XML5 has "short end tag token".  In this implementation, it
52    ## is represented as an end tag token with $token->{tag_name} flag set
53    ## to an empty string.
54    
55  package Whatpm::HTML;  package Whatpm::HTML;
56    
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 124  sub HEXREF_HEX_STATE () { 48 }
124  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
125  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
126    
127    ## XML-only states
128    sub PI_STATE () { 51 }
129    sub PI_TARGET_STATE () { 52 }
130    sub PI_TARGET_AFTER_STATE () { 53 }
131    sub PI_DATA_STATE () { 54 }
132    sub PI_AFTER_STATE () { 55 }
133    sub PI_DATA_AFTER_STATE () { 56 }
134    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
135    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
136    
137  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
138  ## list and descriptions)  ## list and descriptions)
139    
# Line 175  sub _initialize_tokenizer ($) { Line 195  sub _initialize_tokenizer ($) {
195    #$self->{level}    #$self->{level}
196    #$self->{set_nc}    #$self->{set_nc}
197    #$self->{parse_error}    #$self->{parse_error}
198      #$self->{is_xml} (if XML)
199    
200    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
201    #$self->{s_kwd}; # state keyword - initialized when used    $self->{s_kwd} = ''; # Data state keyword
202      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
203    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
204    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
205    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 207  sub _initialize_tokenizer ($) { Line 229  sub _initialize_tokenizer ($) {
229    
230  ## A token has:  ## A token has:
231  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
232  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
233  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
234  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
235    ##   ->{target} (PI_TOKEN)
236  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
237  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
238  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 217  sub _initialize_tokenizer ($) { Line 240  sub _initialize_tokenizer ($) {
240  ##        ->{name}  ##        ->{name}
241  ##        ->{value}  ##        ->{value}
242  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
243  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
244    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
245    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
246    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
247    ##   ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
248    
249  ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
250  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
251  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 237  my $is_space = { Line 265  my $is_space = {
265    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
266    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
267    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
268    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
269    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
270    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
271  };  };
# Line 361  sub _get_next_token ($) { Line 389  sub _get_next_token ($) {
389          }          }
390        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
391          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
392            $self->{s_kwd} .= '-';            if ($self->{s_kwd} eq '<!-') {
             
           if ($self->{s_kwd} eq '<!--') {  
393                            
394              $self->{escape} = 1; # unless $self->{escape};              $self->{escape} = 1; # unless $self->{escape};
395              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
396              #              #
397            } elsif ($self->{s_kwd} eq '---') {            } elsif ($self->{s_kwd} eq '-') {
398                            
399              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
400              #              #
401              } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
402                
403                $self->{s_kwd} .= '-';
404                #
405            } else {            } else {
406                            
407                $self->{s_kwd} = '-';
408              #              #
409            }            }
410          }          }
# Line 419  sub _get_next_token ($) { Line 450  sub _get_next_token ($) {
450            if ($self->{s_kwd} eq '--') {            if ($self->{s_kwd} eq '--') {
451                            
452              delete $self->{escape};              delete $self->{escape};
453                #
454            } else {            } else {
455                            
456                #
457            }            }
458            } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
459              
460              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
461                              line => $self->{line_prev},
462                              column => $self->{column_prev} - 1);
463              #
464          } else {          } else {
465                        
466              #
467          }          }
468                    
469          $self->{s_kwd} = '';          $self->{s_kwd} = '';
470          #          #
471          } elsif ($self->{nc} == 0x005D) { # ]
472            if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
473              
474              $self->{s_kwd} .= ']';
475            } elsif ($self->{s_kwd} eq ']]') {
476              
477              #
478            } else {
479              
480              $self->{s_kwd} = '';
481            }
482            #
483        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
484                    
485          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 445  sub _get_next_token ($) { Line 497  sub _get_next_token ($) {
497                     data => chr $self->{nc},                     data => chr $self->{nc},
498                     line => $self->{line}, column => $self->{column},                     line => $self->{line}, column => $self->{column},
499                    };                    };
500        if ($self->{read_until}->($token->{data}, q[-!<>&],        if ($self->{read_until}->($token->{data}, q{-!<>&\]},
501                                  length $token->{data})) {                                  length $token->{data})) {
502          $self->{s_kwd} = '';          $self->{s_kwd} = '';
503        }        }
504    
505        ## Stay in the data state.        ## Stay in the data state.
506        if ($self->{content_model} == PCDATA_CONTENT_MODEL) {        if (not $self->{is_xml} and
507              $self->{content_model} == PCDATA_CONTENT_MODEL) {
508                    
509          $self->{state} = PCDATA_STATE;          $self->{state} = PCDATA_STATE;
510        } else {        } else {
# Line 472  sub _get_next_token ($) { Line 525  sub _get_next_token ($) {
525        return  ($token);        return  ($token);
526        redo A;        redo A;
527      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
528          ## XML5: "tag state".
529    
530        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
531          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
532                        
# Line 490  sub _get_next_token ($) { Line 545  sub _get_next_token ($) {
545            redo A;            redo A;
546          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
547                        
548            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
549            #            #
550          } else {          } else {
551                        
552              $self->{s_kwd} = '';
553            #            #
554          }          }
555    
# Line 540  sub _get_next_token ($) { Line 596  sub _get_next_token ($) {
596                        
597            $self->{ct}            $self->{ct}
598              = {type => START_TAG_TOKEN,              = {type => START_TAG_TOKEN,
599                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
600                 line => $self->{line_prev},                 line => $self->{line_prev},
601                 column => $self->{column_prev}};                 column => $self->{column_prev}};
602            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
# Line 582  sub _get_next_token ($) { Line 638  sub _get_next_token ($) {
638                            line => $self->{line_prev},                            line => $self->{line_prev},
639                            column => $self->{column_prev});                            column => $self->{column_prev});
640            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
641              $self->{s_kwd} = '';
642                        
643      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
644        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 601  sub _get_next_token ($) { Line 658  sub _get_next_token ($) {
658    
659            redo A;            redo A;
660          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
661                        if ($self->{is_xml}) {
662            $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',              
663                            line => $self->{line_prev},              $self->{state} = PI_STATE;
664                            column => $self->{column_prev});              
665            $self->{state} = BOGUS_COMMENT_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666            $self->{ct} = {type => COMMENT_TOKEN, data => '',        $self->{line_prev} = $self->{line};
667                                      line => $self->{line_prev},        $self->{column_prev} = $self->{column};
668                                      column => $self->{column_prev},        $self->{column}++;
669                                     };        $self->{nc}
670            ## $self->{nc} is intentionally left as is            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671            redo A;      } else {
672          } else {        $self->{set_nc}->($self);
673        }
674      
675                redo A;
676              } else {
677                
678                $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
679                                line => $self->{line_prev},
680                                column => $self->{column_prev});
681                $self->{state} = BOGUS_COMMENT_STATE;
682                $self->{ct} = {type => COMMENT_TOKEN, data => '',
683                               line => $self->{line_prev},
684                               column => $self->{column_prev},
685                              };
686                ## $self->{nc} is intentionally left as is
687                redo A;
688              }
689            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
690                        
691            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
692                            line => $self->{line_prev},                            line => $self->{line_prev},
693                            column => $self->{column_prev});                            column => $self->{column_prev});
694            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
695              $self->{s_kwd} = '';
696            ## reconsume            ## reconsume
697    
698            return  ({type => CHARACTER_TOKEN, data => '<',            return  ({type => CHARACTER_TOKEN, data => '<',
# Line 626  sub _get_next_token ($) { Line 701  sub _get_next_token ($) {
701                     });                     });
702    
703            redo A;            redo A;
704            } else {
705              ## XML5: "<:" is a parse error.
706              
707              $self->{ct} = {type => START_TAG_TOKEN,
708                                        tag_name => chr ($self->{nc}),
709                                        line => $self->{line_prev},
710                                        column => $self->{column_prev}};
711              $self->{state} = TAG_NAME_STATE;
712              
713        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
714          $self->{line_prev} = $self->{line};
715          $self->{column_prev} = $self->{column};
716          $self->{column}++;
717          $self->{nc}
718              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
719        } else {
720          $self->{set_nc}->($self);
721        }
722      
723              redo A;
724          }          }
725        } else {        } else {
726          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 634  sub _get_next_token ($) { Line 729  sub _get_next_token ($) {
729        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
730        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
731    
732          ## XML5: "end tag state".
733    
734        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
735        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
736          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
737            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
738            $self->{s_kwd} = '';            $self->{kwd} = '';
739            ## Reconsume.            ## Reconsume.
740            redo A;            redo A;
741          } else {          } else {
# Line 646  sub _get_next_token ($) { Line 743  sub _get_next_token ($) {
743            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
744                        
745            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
746              $self->{s_kwd} = '';
747            ## Reconsume.            ## Reconsume.
748            return  ({type => CHARACTER_TOKEN, data => '</',            return  ({type => CHARACTER_TOKEN, data => '</',
749                      line => $l, column => $c,                      line => $l, column => $c,
# Line 659  sub _get_next_token ($) { Line 757  sub _get_next_token ($) {
757                    
758          $self->{ct}          $self->{ct}
759              = {type => END_TAG_TOKEN,              = {type => END_TAG_TOKEN,
760                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
761                 line => $l, column => $c};                 line => $l, column => $c};
762          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
763                    
# Line 694  sub _get_next_token ($) { Line 792  sub _get_next_token ($) {
792        
793          redo A;          redo A;
794        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
795          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
796                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
797                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
798          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
799                    $self->{s_kwd} = '';
800            if ($self->{is_xml}) {
801              
802              ## XML5: No parse error.
803              
804              ## NOTE: This parser raises a parse error, since it supports
805              ## XML1, not XML5.
806    
807              ## NOTE: A short end tag token.
808              my $ct = {type => END_TAG_TOKEN,
809                        tag_name => '',
810                        line => $self->{line_prev},
811                        column => $self->{column_prev} - 1,
812                       };
813              
814        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
815          $self->{line_prev} = $self->{line};
816          $self->{column_prev} = $self->{column};
817          $self->{column}++;
818          $self->{nc}
819              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
820        } else {
821          $self->{set_nc}->($self);
822        }
823      
824              return  ($ct);
825            } else {
826              
827              
828      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
829        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
830        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 710  sub _get_next_token ($) { Line 835  sub _get_next_token ($) {
835        $self->{set_nc}->($self);        $self->{set_nc}->($self);
836      }      }
837        
838            }
839          redo A;          redo A;
840        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
841                    
842          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
843            $self->{s_kwd} = '';
844          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
845          # reconsume          # reconsume
846    
# Line 722  sub _get_next_token ($) { Line 849  sub _get_next_token ($) {
849                   });                   });
850    
851          redo A;          redo A;
852        } else {        } elsif (not $self->{is_xml} or
853                   $is_space->{$self->{nc}}) {
854                    
855          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
856                            line => $self->{line_prev}, # "<" of "</"
857                            column => $self->{column_prev} - 1);
858          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
859          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
860                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 737  sub _get_next_token ($) { Line 867  sub _get_next_token ($) {
867          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
868          ## "bogus comment state" entry.          ## "bogus comment state" entry.
869          redo A;          redo A;
870          } else {
871            ## XML5: "</:" is a parse error.
872            
873            $self->{ct} = {type => END_TAG_TOKEN,
874                           tag_name => chr ($self->{nc}),
875                           line => $l, column => $c};
876            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
877            
878        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
879          $self->{line_prev} = $self->{line};
880          $self->{column_prev} = $self->{column};
881          $self->{column}++;
882          $self->{nc}
883              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
884        } else {
885          $self->{set_nc}->($self);
886        }
887      
888            redo A;
889        }        }
890      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
891        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
892        if (length $ch) {        if (length $ch) {
893          my $CH = $ch;          my $CH = $ch;
894          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 747  sub _get_next_token ($) { Line 896  sub _get_next_token ($) {
896          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
897                        
898            ## Stay in the state.            ## Stay in the state.
899            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
900                        
901      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
902        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 763  sub _get_next_token ($) { Line 912  sub _get_next_token ($) {
912          } else {          } else {
913                        
914            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
915              $self->{s_kwd} = '';
916            ## Reconsume.            ## Reconsume.
917            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
918                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
919                      line => $self->{line_prev},                      line => $self->{line_prev},
920                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
921                     });                     });
922            redo A;            redo A;
923          }          }
# Line 781  sub _get_next_token ($) { Line 931  sub _get_next_token ($) {
931                        
932            ## Reconsume.            ## Reconsume.
933            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
934              $self->{s_kwd} = '';
935            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
936                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
937                      line => $self->{line_prev},                      line => $self->{line_prev},
938                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
939                     });                     });
940            redo A;            redo A;
941          } else {          } else {
# Line 793  sub _get_next_token ($) { Line 944  sub _get_next_token ($) {
944                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
945                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
946                   line => $self->{line_prev},                   line => $self->{line_prev},
947                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
948            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
949            ## Reconsume.            ## Reconsume.
950            redo A;            redo A;
# Line 832  sub _get_next_token ($) { Line 983  sub _get_next_token ($) {
983            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
984          }          }
985          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
986            $self->{s_kwd} = '';
987                    
988      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
989        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 850  sub _get_next_token ($) { Line 1002  sub _get_next_token ($) {
1002        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
1003                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1004                    
1005          $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);          $self->{ct}->{tag_name}
1006                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1007            # start tag or end tag            # start tag or end tag
1008          ## Stay in this state          ## Stay in this state
1009                    
# Line 883  sub _get_next_token ($) { Line 1036  sub _get_next_token ($) {
1036            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1037          }          }
1038          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1039            $self->{s_kwd} = '';
1040          # reconsume          # reconsume
1041    
1042          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 922  sub _get_next_token ($) { Line 1076  sub _get_next_token ($) {
1076          redo A;          redo A;
1077        }        }
1078      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1079          ## XML5: "Tag attribute name before state".
1080    
1081        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1082                    
1083          ## Stay in the state          ## Stay in the state
# Line 953  sub _get_next_token ($) { Line 1109  sub _get_next_token ($) {
1109            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1110          }          }
1111          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1112            $self->{s_kwd} = '';
1113                    
1114      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1115        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 972  sub _get_next_token ($) { Line 1129  sub _get_next_token ($) {
1129                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1130                    
1131          $self->{ca}          $self->{ca}
1132              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1133                 value => '',                 value => '',
1134                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1135          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 1020  sub _get_next_token ($) { Line 1177  sub _get_next_token ($) {
1177            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1178          }          }
1179          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1180            $self->{s_kwd} = '';
1181          # reconsume          # reconsume
1182    
1183          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1032  sub _get_next_token ($) { Line 1190  sub _get_next_token ($) {
1190               0x003D => 1, # =               0x003D => 1, # =
1191              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1192                        
1193              ## XML5: Not a parse error.
1194            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1195          } else {          } else {
1196                        
1197              ## XML5: ":" raises a parse error and is ignored.
1198          }          }
1199          $self->{ca}          $self->{ca}
1200              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 1055  sub _get_next_token ($) { Line 1215  sub _get_next_token ($) {
1215          redo A;          redo A;
1216        }        }
1217      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1218          ## XML5: "Tag attribute name state".
1219    
1220        my $before_leave = sub {        my $before_leave = sub {
1221          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
1222              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 1065  sub _get_next_token ($) { Line 1227  sub _get_next_token ($) {
1227                        
1228            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
1229              = $self->{ca};              = $self->{ca};
1230              $self->{ca}->{index} = ++$self->{ct}->{last_index};
1231          }          }
1232        }; # $before_leave        }; # $before_leave
1233    
# Line 1101  sub _get_next_token ($) { Line 1264  sub _get_next_token ($) {
1264        
1265          redo A;          redo A;
1266        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1267            if ($self->{is_xml}) {
1268              
1269              ## XML5: Not a parse error.
1270              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1271            } else {
1272              
1273            }
1274    
1275          $before_leave->();          $before_leave->();
1276          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1277                        
# Line 1115  sub _get_next_token ($) { Line 1286  sub _get_next_token ($) {
1286            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1287          }          }
1288          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1289            $self->{s_kwd} = '';
1290                    
1291      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1292        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1133  sub _get_next_token ($) { Line 1305  sub _get_next_token ($) {
1305        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
1306                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1307                    
1308          $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);          $self->{ca}->{name}
1309                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1310          ## Stay in the state          ## Stay in the state
1311                    
1312      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1148  sub _get_next_token ($) { Line 1321  sub _get_next_token ($) {
1321        
1322          redo A;          redo A;
1323        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1324            if ($self->{is_xml}) {
1325              
1326              ## XML5: Not a parse error.
1327              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1328            } else {
1329              
1330            }
1331                    
1332          $before_leave->();          $before_leave->();
1333          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
# Line 1182  sub _get_next_token ($) { Line 1362  sub _get_next_token ($) {
1362            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1363          }          }
1364          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1365            $self->{s_kwd} = '';
1366          # reconsume          # reconsume
1367    
1368          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1191  sub _get_next_token ($) { Line 1372  sub _get_next_token ($) {
1372          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1373              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1374                        
1375              ## XML5: Not a parse error.
1376            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1377          } else {          } else {
1378                        
# Line 1211  sub _get_next_token ($) { Line 1393  sub _get_next_token ($) {
1393          redo A;          redo A;
1394        }        }
1395      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1396          ## XML5: "Tag attribute name after state".
1397          
1398        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1399                    
1400          ## Stay in the state          ## Stay in the state
# Line 1242  sub _get_next_token ($) { Line 1426  sub _get_next_token ($) {
1426        
1427          redo A;          redo A;
1428        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1429            if ($self->{is_xml}) {
1430              
1431              ## XML5: Not a parse error.
1432              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1433            } else {
1434              
1435            }
1436    
1437          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1438                        
1439            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1258  sub _get_next_token ($) { Line 1450  sub _get_next_token ($) {
1450            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1451          }          }
1452          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1453            $self->{s_kwd} = '';
1454                    
1455      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1456        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1277  sub _get_next_token ($) { Line 1470  sub _get_next_token ($) {
1470                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1471                    
1472          $self->{ca}          $self->{ca}
1473              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1474                 value => '',                 value => '',
1475                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1476          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 1294  sub _get_next_token ($) { Line 1487  sub _get_next_token ($) {
1487        
1488          redo A;          redo A;
1489        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1490            if ($self->{is_xml}) {
1491              
1492              ## XML5: Not a parse error.
1493              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494            } else {
1495              
1496            }
1497                    
1498          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1499                    
# Line 1325  sub _get_next_token ($) { Line 1525  sub _get_next_token ($) {
1525          } else {          } else {
1526            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1527          }          }
1528            $self->{s_kwd} = '';
1529          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1530          # reconsume          # reconsume
1531    
# Line 1332  sub _get_next_token ($) { Line 1533  sub _get_next_token ($) {
1533    
1534          redo A;          redo A;
1535        } else {        } else {
1536            if ($self->{is_xml}) {
1537              
1538              ## XML5: Not a parse error.
1539              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1540            } else {
1541              
1542            }
1543    
1544          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1545              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1546                        
1547              ## XML5: Not a parse error.
1548            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1549          } else {          } else {
1550                        
# Line 1358  sub _get_next_token ($) { Line 1568  sub _get_next_token ($) {
1568          redo A;                  redo A;        
1569        }        }
1570      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1571          ## XML5: "Tag attribute value before state".
1572    
1573        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1574                    
1575          ## Stay in the state          ## Stay in the state
# Line 1426  sub _get_next_token ($) { Line 1638  sub _get_next_token ($) {
1638            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1639          }          }
1640          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1641            $self->{s_kwd} = '';
1642                    
1643      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1644        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1459  sub _get_next_token ($) { Line 1672  sub _get_next_token ($) {
1672            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1673          }          }
1674          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1675            $self->{s_kwd} = '';
1676          ## reconsume          ## reconsume
1677    
1678          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1467  sub _get_next_token ($) { Line 1681  sub _get_next_token ($) {
1681        } else {        } else {
1682          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1683                        
1684              ## XML5: Not a parse error.
1685            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1686            } elsif ($self->{is_xml}) {
1687              
1688              ## XML5: No parse error.
1689              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1690          } else {          } else {
1691                        
1692          }          }
# Line 1487  sub _get_next_token ($) { Line 1706  sub _get_next_token ($) {
1706          redo A;          redo A;
1707        }        }
1708      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1709          ## XML5: "Tag attribute value double quoted state".
1710          
1711        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1712                    
1713            ## XML5: "Tag attribute name before state".
1714          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1715                    
1716      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1504  sub _get_next_token ($) { Line 1726  sub _get_next_token ($) {
1726          redo A;          redo A;
1727        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1728                    
1729            ## XML5: Not defined yet.
1730    
1731          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1732          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1733          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1541  sub _get_next_token ($) { Line 1765  sub _get_next_token ($) {
1765            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1766          }          }
1767          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1768            $self->{s_kwd} = '';
1769          ## reconsume          ## reconsume
1770    
1771          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
1772    
1773          redo A;          redo A;
1774        } else {        } else {
1775                    if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1776              
1777              ## XML5: Not a parse error.
1778              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1779            } else {
1780              
1781            }
1782          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1783          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1784                                q["&],                                q["&<],
1785                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1786    
1787          ## Stay in the state          ## Stay in the state
# Line 1568  sub _get_next_token ($) { Line 1799  sub _get_next_token ($) {
1799          redo A;          redo A;
1800        }        }
1801      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1802          ## XML5: "Tag attribute value single quoted state".
1803    
1804        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1805                    
1806            ## XML5: "Before attribute name state" (sic).
1807          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1808                    
1809      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1585  sub _get_next_token ($) { Line 1819  sub _get_next_token ($) {
1819          redo A;          redo A;
1820        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1821                    
1822            ## XML5: Not defined yet.
1823    
1824          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1825          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1826          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1622  sub _get_next_token ($) { Line 1858  sub _get_next_token ($) {
1858            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1859          }          }
1860          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1861            $self->{s_kwd} = '';
1862          ## reconsume          ## reconsume
1863    
1864          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
1865    
1866          redo A;          redo A;
1867        } else {        } else {
1868                    if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1869              
1870              ## XML5: Not a parse error.
1871              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1872            } else {
1873              
1874            }
1875          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1876          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1877                                q['&],                                q['&<],
1878                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1879    
1880          ## Stay in the state          ## Stay in the state
# Line 1649  sub _get_next_token ($) { Line 1892  sub _get_next_token ($) {
1892          redo A;          redo A;
1893        }        }
1894      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1895          ## XML5: "Tag attribute value unquoted state".
1896    
1897        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1898                    
1899            ## XML5: "Tag attribute name before state".
1900          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1901                    
1902      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1666  sub _get_next_token ($) { Line 1912  sub _get_next_token ($) {
1912          redo A;          redo A;
1913        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1914                    
1915    
1916            ## XML5: Not defined yet.
1917    
1918          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1919          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1920          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1702  sub _get_next_token ($) { Line 1951  sub _get_next_token ($) {
1951            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1952          }          }
1953          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1954            $self->{s_kwd} = '';
1955                    
1956      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1957        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1735  sub _get_next_token ($) { Line 1985  sub _get_next_token ($) {
1985            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1986          }          }
1987          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1988            $self->{s_kwd} = '';
1989          ## reconsume          ## reconsume
1990    
1991          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1747  sub _get_next_token ($) { Line 1998  sub _get_next_token ($) {
1998               0x003D => 1, # =               0x003D => 1, # =
1999              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2000                        
2001              ## XML5: Not a parse error.
2002            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2003          } else {          } else {
2004                        
# Line 1803  sub _get_next_token ($) { Line 2055  sub _get_next_token ($) {
2055            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2056          }          }
2057          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2058            $self->{s_kwd} = '';
2059                    
2060      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2061        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1850  sub _get_next_token ($) { Line 2103  sub _get_next_token ($) {
2103            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2104          }          }
2105          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2106            $self->{s_kwd} = '';
2107          ## Reconsume.          ## Reconsume.
2108          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
2109          redo A;          redo A;
# Line 1861  sub _get_next_token ($) { Line 2115  sub _get_next_token ($) {
2115          redo A;          redo A;
2116        }        }
2117      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2118          ## XML5: "Empty tag state".
2119    
2120        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2121          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
2122                        
# Line 1880  sub _get_next_token ($) { Line 2136  sub _get_next_token ($) {
2136          }          }
2137    
2138          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2139            $self->{s_kwd} = '';
2140                    
2141      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2142        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1911  sub _get_next_token ($) { Line 2168  sub _get_next_token ($) {
2168          } else {          } else {
2169            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2170          }          }
2171            ## XML5: "Tag attribute name before state".
2172          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2173            $self->{s_kwd} = '';
2174          ## Reconsume.          ## Reconsume.
2175          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
2176          redo A;          redo A;
# Line 1932  sub _get_next_token ($) { Line 2191  sub _get_next_token ($) {
2191        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2192                    
2193          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2194            $self->{s_kwd} = '';
2195                    
2196      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2197        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1949  sub _get_next_token ($) { Line 2209  sub _get_next_token ($) {
2209        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2210                    
2211          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2212            $self->{s_kwd} = '';
2213          ## reconsume          ## reconsume
2214    
2215          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 1997  sub _get_next_token ($) { Line 2258  sub _get_next_token ($) {
2258          ## ASCII case-insensitive.          ## ASCII case-insensitive.
2259                    
2260          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
2261          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2262                    
2263      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2264        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2010  sub _get_next_token ($) { Line 2271  sub _get_next_token ($) {
2271      }      }
2272        
2273          redo A;          redo A;
2274        } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2275                 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and                   $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2276                    $self->{is_xml}) and
2277                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2278                                                    
2279          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
2280          $self->{s_kwd} = '[';          $self->{kwd} = '[';
2281                    
2282      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2283        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2049  sub _get_next_token ($) { Line 2311  sub _get_next_token ($) {
2311                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2312                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
2313                                   };                                   };
2314          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2315                    
2316      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2317        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2085  sub _get_next_token ($) { Line 2347  sub _get_next_token ($) {
2347              0x0054, # T              0x0054, # T
2348              0x0059, # Y              0x0059, # Y
2349              0x0050, # P              0x0050, # P
2350            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2351            $self->{nc} == [            $self->{nc} == [
2352              undef,              undef,
2353              0x006F, # o              0x006F, # o
# Line 2093  sub _get_next_token ($) { Line 2355  sub _get_next_token ($) {
2355              0x0074, # t              0x0074, # t
2356              0x0079, # y              0x0079, # y
2357              0x0070, # p              0x0070, # p
2358            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2359                    
2360          ## Stay in the state.          ## Stay in the state.
2361          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2362                    
2363      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2364        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2109  sub _get_next_token ($) { Line 2371  sub _get_next_token ($) {
2371      }      }
2372        
2373          redo A;          redo A;
2374        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
2375                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2376                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2377                    if ($self->{is_xml} and
2378                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2379              
2380              ## XML5: case-sensitive.
2381              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2382                              text => 'DOCTYPE',
2383                              line => $self->{line_prev},
2384                              column => $self->{column_prev} - 5);
2385            } else {
2386              
2387            }
2388          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
2389          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
2390                                    quirks => 1,                                    quirks => 1,
# Line 2135  sub _get_next_token ($) { Line 2407  sub _get_next_token ($) {
2407                                    
2408          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2409                          line => $self->{line_prev},                          line => $self->{line_prev},
2410                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2411          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2412          ## Reconsume.          ## Reconsume.
2413          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2414                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2415                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2416                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2417                                   };                                   };
2418          redo A;          redo A;
2419        }        }
# Line 2152  sub _get_next_token ($) { Line 2424  sub _get_next_token ($) {
2424              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
2425              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
2426              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
2427            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
2428                    
2429          ## Stay in the state.          ## Stay in the state.
2430          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2431                    
2432      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2433        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2168  sub _get_next_token ($) { Line 2440  sub _get_next_token ($) {
2440      }      }
2441        
2442          redo A;          redo A;
2443        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
2444                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2445                    if ($self->{is_xml} and
2446                not $self->{tainted} and
2447                @{$self->{open_elements} or []} == 0) {
2448              
2449              $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2450                              line => $self->{line_prev},
2451                              column => $self->{column_prev} - 7);
2452              $self->{tainted} = 1;
2453            } else {
2454              
2455            }
2456    
2457          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
2458                                    data => '',                                    data => '',
2459                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 2192  sub _get_next_token ($) { Line 2475  sub _get_next_token ($) {
2475                    
2476          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2477                          line => $self->{line_prev},                          line => $self->{line_prev},
2478                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2479          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2480          ## Reconsume.          ## Reconsume.
2481          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2482                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2483                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2484                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2485                                   };                                   };
2486          redo A;          redo A;
2487        }        }
# Line 2222  sub _get_next_token ($) { Line 2505  sub _get_next_token ($) {
2505                    
2506          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2507          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2508            $self->{s_kwd} = '';
2509                    
2510      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2511        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2241  sub _get_next_token ($) { Line 2525  sub _get_next_token ($) {
2525                    
2526          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2527          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2528            $self->{s_kwd} = '';
2529          ## reconsume          ## reconsume
2530    
2531          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2284  sub _get_next_token ($) { Line 2569  sub _get_next_token ($) {
2569                    
2570          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2571          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2572            $self->{s_kwd} = '';
2573                    
2574      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2575        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2303  sub _get_next_token ($) { Line 2589  sub _get_next_token ($) {
2589                    
2590          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2591          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2592            $self->{s_kwd} = '';
2593          ## reconsume          ## reconsume
2594    
2595          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2346  sub _get_next_token ($) { Line 2633  sub _get_next_token ($) {
2633                    
2634          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2635          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2636            $self->{s_kwd} = '';
2637          ## reconsume          ## reconsume
2638    
2639          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2373  sub _get_next_token ($) { Line 2661  sub _get_next_token ($) {
2661          redo A;          redo A;
2662        }        }
2663      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2664          ## XML5: "comment dash state".
2665    
2666        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2667                    
2668          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 2392  sub _get_next_token ($) { Line 2682  sub _get_next_token ($) {
2682                    
2683          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2684          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2685            $self->{s_kwd} = '';
2686          ## reconsume          ## reconsume
2687    
2688          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2418  sub _get_next_token ($) { Line 2709  sub _get_next_token ($) {
2709        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2710                    
2711          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2712            $self->{s_kwd} = '';
2713                    
2714      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2715        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2435  sub _get_next_token ($) { Line 2727  sub _get_next_token ($) {
2727          redo A;          redo A;
2728        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2729                    
2730            ## XML5: Not a parse error.
2731          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2732                          line => $self->{line_prev},                          line => $self->{line_prev},
2733                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2456  sub _get_next_token ($) { Line 2749  sub _get_next_token ($) {
2749                    
2750          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2751          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2752            $self->{s_kwd} = '';
2753          ## reconsume          ## reconsume
2754    
2755          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2463  sub _get_next_token ($) { Line 2757  sub _get_next_token ($) {
2757          redo A;          redo A;
2758        } else {        } else {
2759                    
2760            ## XML5: Not a parse error.
2761          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2762                          line => $self->{line_prev},                          line => $self->{line_prev},
2763                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2499  sub _get_next_token ($) { Line 2794  sub _get_next_token ($) {
2794          redo A;          redo A;
2795        } else {        } else {
2796                    
2797            ## XML5: Unless EOF, swith to the bogus comment state.
2798          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2799          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2800          ## reconsume          ## reconsume
2801          redo A;          redo A;
2802        }        }
2803      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2804          ## XML5: "DOCTYPE root name before state".
2805    
2806        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2807                    
2808          ## Stay in the state          ## Stay in the state
# Line 2522  sub _get_next_token ($) { Line 2820  sub _get_next_token ($) {
2820          redo A;          redo A;
2821        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2822                    
2823            ## XML5: No parse error.
2824          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2825          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2826            $self->{s_kwd} = '';
2827                    
2828      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2829        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2543  sub _get_next_token ($) { Line 2843  sub _get_next_token ($) {
2843                    
2844          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2845          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2846            $self->{s_kwd} = '';
2847          ## reconsume          ## reconsume
2848    
2849          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
2850    
2851          redo A;          redo A;
2852          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2853            
2854            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2855            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2856            
2857        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2858          $self->{line_prev} = $self->{line};
2859          $self->{column_prev} = $self->{column};
2860          $self->{column}++;
2861          $self->{nc}
2862              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2863        } else {
2864          $self->{set_nc}->($self);
2865        }
2866      
2867            redo A;
2868        } else {        } else {
2869                    
2870          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 2567  sub _get_next_token ($) { Line 2884  sub _get_next_token ($) {
2884          redo A;          redo A;
2885        }        }
2886      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2887  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
2888    
2889          ## ISSUE: Redundant "First," in the spec.
2890    
2891        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2892                    
2893          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 2586  sub _get_next_token ($) { Line 2906  sub _get_next_token ($) {
2906        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2907                    
2908          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2909            $self->{s_kwd} = '';
2910                    
2911      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2912        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2605  sub _get_next_token ($) { Line 2926  sub _get_next_token ($) {
2926                    
2927          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2928          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2929            $self->{s_kwd} = '';
2930          ## reconsume          ## reconsume
2931    
2932          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
2933          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
2934    
2935          redo A;          redo A;
2936          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2937            
2938            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2939            
2940        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2941          $self->{line_prev} = $self->{line};
2942          $self->{column_prev} = $self->{column};
2943          $self->{column}++;
2944          $self->{nc}
2945              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2946        } else {
2947          $self->{set_nc}->($self);
2948        }
2949      
2950            redo A;
2951        } else {        } else {
2952                    
2953          $self->{ct}->{name}          $self->{ct}->{name}
# Line 2630  sub _get_next_token ($) { Line 2967  sub _get_next_token ($) {
2967          redo A;          redo A;
2968        }        }
2969      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2970          ## XML5: Corresponding to XML5's "DOCTYPE root name after
2971          ## state", but implemented differently.
2972    
2973        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2974                    
2975          ## Stay in the state          ## Stay in the state
# Line 2648  sub _get_next_token ($) { Line 2988  sub _get_next_token ($) {
2988        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2989                    
2990          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2991            $self->{s_kwd} = '';
2992                    
2993      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2994        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2667  sub _get_next_token ($) { Line 3008  sub _get_next_token ($) {
3008                    
3009          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3010          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3011            $self->{s_kwd} = '';
3012          ## reconsume          ## reconsume
3013    
3014          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 2675  sub _get_next_token ($) { Line 3017  sub _get_next_token ($) {
3017          redo A;          redo A;
3018        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
3019                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
3020            
3021          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
3022          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3023                    
3024      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3025        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2691  sub _get_next_token ($) { Line 3034  sub _get_next_token ($) {
3034          redo A;          redo A;
3035        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
3036                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
3037            
3038          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
3039          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3040            
3041        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3042          $self->{line_prev} = $self->{line};
3043          $self->{column_prev} = $self->{column};
3044          $self->{column}++;
3045          $self->{nc}
3046              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3047        } else {
3048          $self->{set_nc}->($self);
3049        }
3050      
3051            redo A;
3052          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3053            
3054            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3055            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3056                    
3057      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3058        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2732  sub _get_next_token ($) { Line 3092  sub _get_next_token ($) {
3092              0x0042, # B              0x0042, # B
3093              0x004C, # L              0x004C, # L
3094              0x0049, # I              0x0049, # I
3095            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3096            $self->{nc} == [            $self->{nc} == [
3097              undef,              undef,
3098              0x0075, # u              0x0075, # u
3099              0x0062, # b              0x0062, # b
3100              0x006C, # l              0x006C, # l
3101              0x0069, # i              0x0069, # i
3102            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3103                    
3104          ## Stay in the state.          ## Stay in the state.
3105          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3106                    
3107      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3108        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2755  sub _get_next_token ($) { Line 3115  sub _get_next_token ($) {
3115      }      }
3116        
3117          redo A;          redo A;
3118        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3119                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
3120                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
3121                    if ($self->{is_xml} and
3122                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3123              
3124              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3125                              text => 'PUBLIC',
3126                              line => $self->{line_prev},
3127                              column => $self->{column_prev} - 4);
3128            } else {
3129              
3130            }
3131          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3132                    
3133      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2776  sub _get_next_token ($) { Line 3145  sub _get_next_token ($) {
3145                    
3146          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3147                          line => $self->{line_prev},                          line => $self->{line_prev},
3148                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3149          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3150    
3151          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 2791  sub _get_next_token ($) { Line 3160  sub _get_next_token ($) {
3160              0x0053, # S              0x0053, # S
3161              0x0054, # T              0x0054, # T
3162              0x0045, # E              0x0045, # E
3163            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3164            $self->{nc} == [            $self->{nc} == [
3165              undef,              undef,
3166              0x0079, # y              0x0079, # y
3167              0x0073, # s              0x0073, # s
3168              0x0074, # t              0x0074, # t
3169              0x0065, # e              0x0065, # e
3170            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3171                    
3172          ## Stay in the state.          ## Stay in the state.
3173          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3174                    
3175      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3176        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2814  sub _get_next_token ($) { Line 3183  sub _get_next_token ($) {
3183      }      }
3184        
3185          redo A;          redo A;
3186        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3187                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
3188                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
3189                    if ($self->{is_xml} and
3190                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3191              
3192              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3193                              text => 'SYSTEM',
3194                              line => $self->{line_prev},
3195                              column => $self->{column_prev} - 4);
3196            } else {
3197              
3198            }
3199          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3200                    
3201      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2835  sub _get_next_token ($) { Line 3213  sub _get_next_token ($) {
3213                    
3214          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3215                          line => $self->{line_prev},                          line => $self->{line_prev},
3216                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3217          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3218    
3219          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 2895  sub _get_next_token ($) { Line 3273  sub _get_next_token ($) {
3273          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3274    
3275          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3276            $self->{s_kwd} = '';
3277                    
3278      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3279        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2916  sub _get_next_token ($) { Line 3295  sub _get_next_token ($) {
3295          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3296    
3297          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3298            $self->{s_kwd} = '';
3299          ## reconsume          ## reconsume
3300    
3301          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3302          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3303    
3304          redo A;          redo A;
3305          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3306            
3307            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3308            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3309            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3310            
3311        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3312          $self->{line_prev} = $self->{line};
3313          $self->{column_prev} = $self->{column};
3314          $self->{column}++;
3315          $self->{nc}
3316              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3317        } else {
3318          $self->{set_nc}->($self);
3319        }
3320      
3321            redo A;
3322        } else {        } else {
3323                    
3324          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
# Line 2962  sub _get_next_token ($) { Line 3359  sub _get_next_token ($) {
3359          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3360    
3361          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3362            $self->{s_kwd} = '';
3363                    
3364      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3365        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2983  sub _get_next_token ($) { Line 3381  sub _get_next_token ($) {
3381          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3382    
3383          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3384            $self->{s_kwd} = '';
3385          ## reconsume          ## reconsume
3386    
3387          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3031  sub _get_next_token ($) { Line 3430  sub _get_next_token ($) {
3430          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3431    
3432          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3433            $self->{s_kwd} = '';
3434                    
3435      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3436        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3052  sub _get_next_token ($) { Line 3452  sub _get_next_token ($) {
3452          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3453    
3454          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3455            $self->{s_kwd} = '';
3456          ## reconsume          ## reconsume
3457    
3458          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3128  sub _get_next_token ($) { Line 3529  sub _get_next_token ($) {
3529        
3530          redo A;          redo A;
3531        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3532                    if ($self->{is_xml}) {
3533              
3534              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3535            } else {
3536              
3537            }
3538          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3539            $self->{s_kwd} = '';
3540                    
3541      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3542        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3150  sub _get_next_token ($) { Line 3557  sub _get_next_token ($) {
3557          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3558    
3559          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3560            $self->{s_kwd} = '';
3561          ## reconsume          ## reconsume
3562    
3563          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3564          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3565    
3566          redo A;          redo A;
3567          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3568            
3569            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3570            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3571            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3572            
3573        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3574          $self->{line_prev} = $self->{line};
3575          $self->{column_prev} = $self->{column};
3576          $self->{column}++;
3577          $self->{nc}
3578              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3579        } else {
3580          $self->{set_nc}->($self);
3581        }
3582      
3583            redo A;
3584        } else {        } else {
3585                    
3586          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
# Line 3227  sub _get_next_token ($) { Line 3652  sub _get_next_token ($) {
3652                    
3653          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3654          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3655            $self->{s_kwd} = '';
3656                    
3657      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3658        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3248  sub _get_next_token ($) { Line 3674  sub _get_next_token ($) {
3674          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3675    
3676          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3677            $self->{s_kwd} = '';
3678          ## reconsume          ## reconsume
3679    
3680          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3681          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3682    
3683          redo A;          redo A;
3684          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3685            
3686            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3687    
3688            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3689            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3690            
3691        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3692          $self->{line_prev} = $self->{line};
3693          $self->{column_prev} = $self->{column};
3694          $self->{column}++;
3695          $self->{nc}
3696              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3697        } else {
3698          $self->{set_nc}->($self);
3699        }
3700      
3701            redo A;
3702        } else {        } else {
3703                    
3704          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
# Line 3289  sub _get_next_token ($) { Line 3734  sub _get_next_token ($) {
3734      }      }
3735        
3736          redo A;          redo A;
3737        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
3738                    
3739          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3740    
3741          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3742            $self->{s_kwd} = '';
3743                    
3744      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3745        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3315  sub _get_next_token ($) { Line 3761  sub _get_next_token ($) {
3761          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3762    
3763          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3764            $self->{s_kwd} = '';
3765          ## reconsume          ## reconsume
3766    
3767          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3358  sub _get_next_token ($) { Line 3805  sub _get_next_token ($) {
3805      }      }
3806        
3807          redo A;          redo A;
3808        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
3809                    
3810          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3811    
3812          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3813            $self->{s_kwd} = '';
3814                    
3815      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3816        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3384  sub _get_next_token ($) { Line 3832  sub _get_next_token ($) {
3832          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3833    
3834          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3835            $self->{s_kwd} = '';
3836          ## reconsume          ## reconsume
3837    
3838          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
# Line 3430  sub _get_next_token ($) { Line 3879  sub _get_next_token ($) {
3879        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3880                    
3881          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3882            $self->{s_kwd} = '';
3883                    
3884      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3885        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3449  sub _get_next_token ($) { Line 3899  sub _get_next_token ($) {
3899                    
3900          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3901          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3902            $self->{s_kwd} = '';
3903          ## reconsume          ## reconsume
3904    
3905          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3906          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3907    
3908          redo A;          redo A;
3909          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3910            
3911            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3912            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3913            
3914        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3915          $self->{line_prev} = $self->{line};
3916          $self->{column_prev} = $self->{column};
3917          $self->{column}++;
3918          $self->{nc}
3919              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3920        } else {
3921          $self->{set_nc}->($self);
3922        }
3923      
3924            redo A;
3925        } else {        } else {
3926                    
3927          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
# Line 3478  sub _get_next_token ($) { Line 3945  sub _get_next_token ($) {
3945        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
3946                    
3947          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3948            $self->{s_kwd} = '';
3949                    
3950      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3951        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3493  sub _get_next_token ($) { Line 3961  sub _get_next_token ($) {
3961          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3962    
3963          redo A;          redo A;
3964          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3965            if ($self->{ct}->{has_internal_subset}) { # DOCTYPE
3966              
3967              ## Stay in the state.
3968              
3969        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3970          $self->{line_prev} = $self->{line};
3971          $self->{column_prev} = $self->{column};
3972          $self->{column}++;
3973          $self->{nc}
3974              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3975        } else {
3976          $self->{set_nc}->($self);
3977        }
3978      
3979              redo A;
3980            } else {
3981              
3982              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3983              $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3984              
3985        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3986          $self->{line_prev} = $self->{line};
3987          $self->{column_prev} = $self->{column};
3988          $self->{column}++;
3989          $self->{nc}
3990              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3991        } else {
3992          $self->{set_nc}->($self);
3993        }
3994      
3995              redo A;
3996            }
3997        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3998                    
3999          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4000            $self->{s_kwd} = '';
4001          ## reconsume          ## reconsume
4002    
4003          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
# Line 3504  sub _get_next_token ($) { Line 4006  sub _get_next_token ($) {
4006        } else {        } else {
4007                    
4008          my $s = '';          my $s = '';
4009          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
4010    
4011          ## Stay in the state          ## Stay in the state
4012                    
# Line 3524  sub _get_next_token ($) { Line 4026  sub _get_next_token ($) {
4026        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
4027        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4028        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
4029    
4030          ## XML5: "CDATA state".
4031                
4032        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4033                    
# Line 3541  sub _get_next_token ($) { Line 4045  sub _get_next_token ($) {
4045        
4046          redo A;          redo A;
4047        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4048            if ($self->{is_xml}) {
4049              
4050              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4051            } else {
4052              
4053            }
4054    
4055          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4056                    $self->{s_kwd} = '';
4057      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {          ## Reconsume.
       $self->{line_prev} = $self->{line};  
       $self->{column_prev} = $self->{column};  
       $self->{column}++;  
       $self->{nc}  
           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);  
     } else {  
       $self->{set_nc}->($self);  
     }  
     
4058          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
4059                        
4060            return  ($self->{ct}); # character            return  ($self->{ct}); # character
# Line 3585  sub _get_next_token ($) { Line 4087  sub _get_next_token ($) {
4087    
4088        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
4089      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4090          ## XML5: "CDATA bracket state".
4091    
4092        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4093                    
4094          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 3602  sub _get_next_token ($) { Line 4106  sub _get_next_token ($) {
4106          redo A;          redo A;
4107        } else {        } else {
4108                    
4109            ## XML5: If EOF, "]" is not appended and changed to the data state.
4110          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
4111          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4112          ## Reconsume.          ## Reconsume.
4113          redo A;          redo A;
4114        }        }
4115      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4116          ## XML5: "CDATA end state".
4117    
4118        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4119          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4120            $self->{s_kwd} = '';
4121                    
4122      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4123        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3649  sub _get_next_token ($) { Line 4157  sub _get_next_token ($) {
4157                    
4158          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
4159          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
4160          ## Reconsume.          ## Reconsume. ## XML5: Emit.
4161          redo A;          redo A;
4162        }        }
4163      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 3666  sub _get_next_token ($) { Line 4174  sub _get_next_token ($) {
4174        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
4175                    
4176          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
4177          $self->{s_kwd} = '#';          $self->{kwd} = '#';
4178                    
4179      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4180        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3686  sub _get_next_token ($) { Line 4194  sub _get_next_token ($) {
4194                    
4195          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
4196          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
4197          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
4198          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
4199          $self->{entity__match} = 0;          $self->{entity__match} = 0;
4200                    
4201      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3717  sub _get_next_token ($) { Line 4225  sub _get_next_token ($) {
4225        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4226                    
4227          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4228            $self->{s_kwd} = '';
4229          ## Reconsume.          ## Reconsume.
4230          return  ({type => CHARACTER_TOKEN, data => '&',          return  ({type => CHARACTER_TOKEN, data => '&',
4231                    line => $self->{line_prev},                    line => $self->{line_prev},
# Line 3727  sub _get_next_token ($) { Line 4236  sub _get_next_token ($) {
4236                    
4237          $self->{ca}->{value} .= '&';          $self->{ca}->{value} .= '&';
4238          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4239            $self->{s_kwd} = '';
4240          ## Reconsume.          ## Reconsume.
4241          redo A;          redo A;
4242        }        }
# Line 3735  sub _get_next_token ($) { Line 4245  sub _get_next_token ($) {
4245            $self->{nc} == 0x0058) { # X            $self->{nc} == 0x0058) { # X
4246                    
4247          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
4248          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4249                    
4250      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4251        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3752  sub _get_next_token ($) { Line 4262  sub _get_next_token ($) {
4262                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
4263                    
4264          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
4265          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
4266                    
4267      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4268        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3777  sub _get_next_token ($) { Line 4287  sub _get_next_token ($) {
4287          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
4288                        
4289            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4290              $self->{s_kwd} = '';
4291            ## Reconsume.            ## Reconsume.
4292            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4293                      data => '&#',                      data => '&#',
# Line 3788  sub _get_next_token ($) { Line 4299  sub _get_next_token ($) {
4299                        
4300            $self->{ca}->{value} .= '&#';            $self->{ca}->{value} .= '&#';
4301            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4302              $self->{s_kwd} = '';
4303            ## Reconsume.            ## Reconsume.
4304            redo A;            redo A;
4305          }          }
# Line 3796  sub _get_next_token ($) { Line 4308  sub _get_next_token ($) {
4308        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
4309            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
4310                    
4311          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
4312          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4313                    
4314          ## Stay in the state.          ## Stay in the state.
4315                    
# Line 3833  sub _get_next_token ($) { Line 4345  sub _get_next_token ($) {
4345          #          #
4346        }        }
4347    
4348        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4349        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4350        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4351        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 3853  sub _get_next_token ($) { Line 4365  sub _get_next_token ($) {
4365        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4366                    
4367          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4368            $self->{s_kwd} = '';
4369          ## Reconsume.          ## Reconsume.
4370          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4371                      has_reference => 1,
4372                    line => $l, column => $c,                    line => $l, column => $c,
4373                   });                   });
4374          redo A;          redo A;
# Line 3863  sub _get_next_token ($) { Line 4377  sub _get_next_token ($) {
4377          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
4378          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
4379          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4380            $self->{s_kwd} = '';
4381          ## Reconsume.          ## Reconsume.
4382          redo A;          redo A;
4383        }        }
# Line 3873  sub _get_next_token ($) { Line 4388  sub _get_next_token ($) {
4388          # 0..9, A..F, a..f          # 0..9, A..F, a..f
4389                    
4390          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
4391          $self->{s_kwd} = 0;          $self->{kwd} = 0;
4392          ## Reconsume.          ## Reconsume.
4393          redo A;          redo A;
4394        } else {        } else {
# Line 3888  sub _get_next_token ($) { Line 4403  sub _get_next_token ($) {
4403          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
4404                        
4405            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4406              $self->{s_kwd} = '';
4407            ## Reconsume.            ## Reconsume.
4408            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4409                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
4410                      line => $self->{line_prev},                      line => $self->{line_prev},
4411                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
4412                     });                     });
4413            redo A;            redo A;
4414          } else {          } else {
4415                        
4416            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
4417            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4418              $self->{s_kwd} = '';
4419            ## Reconsume.            ## Reconsume.
4420            redo A;            redo A;
4421          }          }
# Line 3907  sub _get_next_token ($) { Line 4424  sub _get_next_token ($) {
4424        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4425          # 0..9          # 0..9
4426                    
4427          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4428          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4429          ## Stay in the state.          ## Stay in the state.
4430                    
4431      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3925  sub _get_next_token ($) { Line 4442  sub _get_next_token ($) {
4442        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
4443                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
4444                    
4445          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4446          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
4447          ## Stay in the state.          ## Stay in the state.
4448                    
4449      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3943  sub _get_next_token ($) { Line 4460  sub _get_next_token ($) {
4460        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
4461                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
4462                    
4463          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4464          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
4465          ## Stay in the state.          ## Stay in the state.
4466                    
4467      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3981  sub _get_next_token ($) { Line 4498  sub _get_next_token ($) {
4498          #          #
4499        }        }
4500    
4501        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4502        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4503        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4504        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 4001  sub _get_next_token ($) { Line 4518  sub _get_next_token ($) {
4518        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4519                    
4520          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4521            $self->{s_kwd} = '';
4522          ## Reconsume.          ## Reconsume.
4523          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4524                      has_reference => 1,
4525                    line => $l, column => $c,                    line => $l, column => $c,
4526                   });                   });
4527          redo A;          redo A;
# Line 4011  sub _get_next_token ($) { Line 4530  sub _get_next_token ($) {
4530          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
4531          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
4532          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4533            $self->{s_kwd} = '';
4534          ## Reconsume.          ## Reconsume.
4535          redo A;          redo A;
4536        }        }
4537      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
4538        if (length $self->{s_kwd} < 30 and        if (length $self->{kwd} < 30 and
4539            ## NOTE: Some number greater than the maximum length of entity name            ## NOTE: Some number greater than the maximum length of entity name
4540            ((0x0041 <= $self->{nc} and # a            ((0x0041 <= $self->{nc} and # a
4541              $self->{nc} <= 0x005A) or # x              $self->{nc} <= 0x005A) or # x
# Line 4025  sub _get_next_token ($) { Line 4545  sub _get_next_token ($) {
4545              $self->{nc} <= 0x0039) or # 9              $self->{nc} <= 0x0039) or # 9
4546             $self->{nc} == 0x003B)) { # ;             $self->{nc} == 0x003B)) { # ;
4547          our $EntityChar;          our $EntityChar;
4548          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4549          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}}) {
4550            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
4551                            
4552              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
4553              $self->{entity__match} = 1;              $self->{entity__match} = 1;
4554                            
4555      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4045  sub _get_next_token ($) { Line 4565  sub _get_next_token ($) {
4565              #              #
4566            } else {            } else {
4567                            
4568              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
4569              $self->{entity__match} = -1;              $self->{entity__match} = -1;
4570              ## Stay in the state.              ## Stay in the state.
4571                            
# Line 4093  sub _get_next_token ($) { Line 4613  sub _get_next_token ($) {
4613          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
4614              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
4615                        
4616            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
4617            #            #
4618          } else {          } else {
4619                        
# Line 4105  sub _get_next_token ($) { Line 4625  sub _get_next_token ($) {
4625                    
4626          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4627                          line => $self->{line_prev},                          line => $self->{line_prev},
4628                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
4629          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
4630          #          #
4631        }        }
4632        
# Line 4123  sub _get_next_token ($) { Line 4643  sub _get_next_token ($) {
4643        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4644                    
4645          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4646            $self->{s_kwd} = '';
4647          ## Reconsume.          ## Reconsume.
4648          return  ({type => CHARACTER_TOKEN,          return  ({type => CHARACTER_TOKEN,
4649                    data => $data,                    data => $data,
4650                      has_reference => $has_ref,
4651                    line => $self->{line_prev},                    line => $self->{line_prev},
4652                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
4653                   });                   });
4654          redo A;          redo A;
4655        } else {        } else {
# Line 4135  sub _get_next_token ($) { Line 4657  sub _get_next_token ($) {
4657          $self->{ca}->{value} .= $data;          $self->{ca}->{value} .= $data;
4658          $self->{ca}->{has_reference} = 1 if $has_ref;          $self->{ca}->{has_reference} = 1 if $has_ref;
4659          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4660            $self->{s_kwd} = '';
4661            ## Reconsume.
4662            redo A;
4663          }
4664    
4665        ## XML-only states
4666    
4667        } elsif ($self->{state} == PI_STATE) {
4668          if ($is_space->{$self->{nc}} or
4669              $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
4670              $self->{nc} == -1) {
4671            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4672                            line => $self->{line_prev},
4673                            column => $self->{column_prev}
4674                                - 1 * ($self->{nc} != -1));
4675            $self->{state} = BOGUS_COMMENT_STATE;
4676            ## Reconsume.
4677            $self->{ct} = {type => COMMENT_TOKEN,
4678                           data => '?',
4679                           line => $self->{line_prev},
4680                           column => $self->{column_prev}
4681                               - 1 * ($self->{nc} != -1),
4682                          };
4683            redo A;
4684          } else {
4685            $self->{ct} = {type => PI_TOKEN,
4686                           target => chr $self->{nc},
4687                           data => '',
4688                           line => $self->{line_prev},
4689                           column => $self->{column_prev} - 1,
4690                          };
4691            $self->{state} = PI_TARGET_STATE;
4692            
4693        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4694          $self->{line_prev} = $self->{line};
4695          $self->{column_prev} = $self->{column};
4696          $self->{column}++;
4697          $self->{nc}
4698              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4699        } else {
4700          $self->{set_nc}->($self);
4701        }
4702      
4703            redo A;
4704          }
4705        } elsif ($self->{state} == PI_TARGET_STATE) {
4706          if ($is_space->{$self->{nc}}) {
4707            $self->{state} = PI_TARGET_AFTER_STATE;
4708            
4709        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4710          $self->{line_prev} = $self->{line};
4711          $self->{column_prev} = $self->{column};
4712          $self->{column}++;
4713          $self->{nc}
4714              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4715        } else {
4716          $self->{set_nc}->($self);
4717        }
4718      
4719            redo A;
4720          } elsif ($self->{nc} == -1) {
4721            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4722            $self->{state} = DATA_STATE;
4723            $self->{s_kwd} = '';
4724            ## Reconsume.
4725            return  ($self->{ct}); # pi
4726            redo A;
4727          } elsif ($self->{nc} == 0x003F) { # ?
4728            $self->{state} = PI_AFTER_STATE;
4729            
4730        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4731          $self->{line_prev} = $self->{line};
4732          $self->{column_prev} = $self->{column};
4733          $self->{column}++;
4734          $self->{nc}
4735              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4736        } else {
4737          $self->{set_nc}->($self);
4738        }
4739      
4740            redo A;
4741          } else {
4742            ## XML5: typo ("tag name" -> "target")
4743            $self->{ct}->{target} .= chr $self->{nc}; # pi
4744            
4745        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4746          $self->{line_prev} = $self->{line};
4747          $self->{column_prev} = $self->{column};
4748          $self->{column}++;
4749          $self->{nc}
4750              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4751        } else {
4752          $self->{set_nc}->($self);
4753        }
4754      
4755            redo A;
4756          }
4757        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
4758          if ($is_space->{$self->{nc}}) {
4759            ## Stay in the state.
4760            
4761        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4762          $self->{line_prev} = $self->{line};
4763          $self->{column_prev} = $self->{column};
4764          $self->{column}++;
4765          $self->{nc}
4766              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4767        } else {
4768          $self->{set_nc}->($self);
4769        }
4770      
4771            redo A;
4772          } else {
4773            $self->{state} = PI_DATA_STATE;
4774            ## Reprocess.
4775            redo A;
4776          }
4777        } elsif ($self->{state} == PI_DATA_STATE) {
4778          if ($self->{nc} == 0x003F) { # ?
4779            $self->{state} = PI_DATA_AFTER_STATE;
4780            
4781        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4782          $self->{line_prev} = $self->{line};
4783          $self->{column_prev} = $self->{column};
4784          $self->{column}++;
4785          $self->{nc}
4786              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4787        } else {
4788          $self->{set_nc}->($self);
4789        }
4790      
4791            redo A;
4792          } elsif ($self->{nc} == -1) {
4793            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4794            $self->{state} = DATA_STATE;
4795            $self->{s_kwd} = '';
4796            ## Reprocess.
4797            return  ($self->{ct}); # pi
4798            redo A;
4799          } else {
4800            $self->{ct}->{data} .= chr $self->{nc}; # pi
4801            $self->{read_until}->($self->{ct}->{data}, q[?],
4802                                  length $self->{ct}->{data});
4803            ## Stay in the state.
4804            
4805        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4806          $self->{line_prev} = $self->{line};
4807          $self->{column_prev} = $self->{column};
4808          $self->{column}++;
4809          $self->{nc}
4810              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4811        } else {
4812          $self->{set_nc}->($self);
4813        }
4814      
4815            ## Reprocess.
4816            redo A;
4817          }
4818        } elsif ($self->{state} == PI_AFTER_STATE) {
4819          if ($self->{nc} == 0x003E) { # >
4820            $self->{state} = DATA_STATE;
4821            $self->{s_kwd} = '';
4822            
4823        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4824          $self->{line_prev} = $self->{line};
4825          $self->{column_prev} = $self->{column};
4826          $self->{column}++;
4827          $self->{nc}
4828              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4829        } else {
4830          $self->{set_nc}->($self);
4831        }
4832      
4833            return  ($self->{ct}); # pi
4834            redo A;
4835          } elsif ($self->{nc} == 0x003F) { # ?
4836            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4837                            line => $self->{line_prev},
4838                            column => $self->{column_prev}); ## XML5: no error
4839            $self->{ct}->{data} .= '?';
4840            $self->{state} = PI_DATA_AFTER_STATE;
4841            
4842        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4843          $self->{line_prev} = $self->{line};
4844          $self->{column_prev} = $self->{column};
4845          $self->{column}++;
4846          $self->{nc}
4847              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4848        } else {
4849          $self->{set_nc}->($self);
4850        }
4851      
4852            redo A;
4853          } else {
4854            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4855                            line => $self->{line_prev},
4856                            column => $self->{column_prev}
4857                                + 1 * ($self->{nc} == -1)); ## XML5: no error
4858            $self->{ct}->{data} .= '?'; ## XML5: not appended
4859            $self->{state} = PI_DATA_STATE;
4860            ## Reprocess.
4861            redo A;
4862          }
4863        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4864          ## XML5: Same as "pi after state" in XML5
4865          if ($self->{nc} == 0x003E) { # >
4866            $self->{state} = DATA_STATE;
4867            $self->{s_kwd} = '';
4868            
4869        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4870          $self->{line_prev} = $self->{line};
4871          $self->{column_prev} = $self->{column};
4872          $self->{column}++;
4873          $self->{nc}
4874              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4875        } else {
4876          $self->{set_nc}->($self);
4877        }
4878      
4879            return  ($self->{ct}); # pi
4880            redo A;
4881          } elsif ($self->{nc} == 0x003F) { # ?
4882            $self->{ct}->{data} .= '?';
4883            ## Stay in the state.
4884            
4885        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4886          $self->{line_prev} = $self->{line};
4887          $self->{column_prev} = $self->{column};
4888          $self->{column}++;
4889          $self->{nc}
4890              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4891        } else {
4892          $self->{set_nc}->($self);
4893        }
4894      
4895            redo A;
4896          } else {
4897            $self->{ct}->{data} .= '?'; ## XML5: not appended
4898            $self->{state} = PI_DATA_STATE;
4899            ## Reprocess.
4900            redo A;
4901          }
4902    
4903        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
4904          if ($self->{nc} == 0x003C) { # <
4905            ## TODO:
4906            
4907        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4908          $self->{line_prev} = $self->{line};
4909          $self->{column_prev} = $self->{column};
4910          $self->{column}++;
4911          $self->{nc}
4912              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4913        } else {
4914          $self->{set_nc}->($self);
4915        }
4916      
4917            redo A;
4918          } elsif ($self->{nc} == 0x0025) { # %
4919            ## XML5: Not defined yet.
4920    
4921            ## TODO:
4922            
4923        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4924          $self->{line_prev} = $self->{line};
4925          $self->{column_prev} = $self->{column};
4926          $self->{column}++;
4927          $self->{nc}
4928              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4929        } else {
4930          $self->{set_nc}->($self);
4931        }
4932      
4933            redo A;
4934          } elsif ($self->{nc} == 0x005D) { # ]
4935            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
4936            
4937        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4938          $self->{line_prev} = $self->{line};
4939          $self->{column_prev} = $self->{column};
4940          $self->{column}++;
4941          $self->{nc}
4942              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4943        } else {
4944          $self->{set_nc}->($self);
4945        }
4946      
4947            redo A;
4948          } elsif ($is_space->{$self->{nc}}) {
4949            ## Stay in the state.
4950            
4951        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4952          $self->{line_prev} = $self->{line};
4953          $self->{column_prev} = $self->{column};
4954          $self->{column}++;
4955          $self->{nc}
4956              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4957        } else {
4958          $self->{set_nc}->($self);
4959        }
4960      
4961            redo A;
4962          } elsif ($self->{nc} == -1) {
4963            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
4964            $self->{state} = DATA_STATE;
4965            $self->{s_kwd} = '';
4966            ## Reconsume.
4967            return  ($self->{ct}); # DOCTYPE
4968            redo A;
4969          } else {
4970            unless ($self->{internal_subset_tainted}) {
4971              ## XML5: No parse error.
4972              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
4973              $self->{internal_subset_tainted} = 1;
4974            }
4975            ## Stay in the state.
4976            
4977        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4978          $self->{line_prev} = $self->{line};
4979          $self->{column_prev} = $self->{column};
4980          $self->{column}++;
4981          $self->{nc}
4982              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4983        } else {
4984          $self->{set_nc}->($self);
4985        }
4986      
4987            redo A;
4988          }
4989        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
4990          if ($self->{nc} == 0x003E) { # >
4991            $self->{state} = DATA_STATE;
4992            $self->{s_kwd} = '';
4993            
4994        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4995          $self->{line_prev} = $self->{line};
4996          $self->{column_prev} = $self->{column};
4997          $self->{column}++;
4998          $self->{nc}
4999              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5000        } else {
5001          $self->{set_nc}->($self);
5002        }
5003      
5004            return  ($self->{ct}); # DOCTYPE
5005            redo A;
5006          } elsif ($self->{nc} == -1) {
5007            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5008            $self->{state} = DATA_STATE;
5009            $self->{s_kwd} = '';
5010          ## Reconsume.          ## Reconsume.
5011            return  ($self->{ct}); # DOCTYPE
5012            redo A;
5013          } else {
5014            ## XML5: No parse error and stay in the state.
5015            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5016    
5017            $self->{state} = BOGUS_DOCTYPE_STATE;
5018            
5019        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5020          $self->{line_prev} = $self->{line};
5021          $self->{column_prev} = $self->{column};
5022          $self->{column}++;
5023          $self->{nc}
5024              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5025        } else {
5026          $self->{set_nc}->($self);
5027        }
5028      
5029          redo A;          redo A;
5030        }        }
5031            
5032      } else {      } else {
5033        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
5034      }      }

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.12

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24