/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.5 by wakaba, Tue Oct 14 14:38:59 2008 UTC revision 1.12 by wakaba, Wed Oct 15 12:49:49 2008 UTC
# Line 31  BEGIN { Line 31  BEGIN {
31    );    );
32  }  }
33    
34    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
35    
36  ## Token types  ## Token types
37    
38  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
39  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
40  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
41  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
42  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
43  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
44  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
45  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
46    
47    ## XML5: XML5 has "empty tag token".  In this implementation, it is
48    ## represented as a start tag token with $self->{self_closing} flag
49    ## set to true.
50    
51    ## XML5: XML5 has "short end tag token".  In this implementation, it
52    ## is represented as an end tag token with $token->{tag_name} flag set
53    ## to an empty string.
54    
55  package Whatpm::HTML;  package Whatpm::HTML;
56    
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 124  sub HEXREF_HEX_STATE () { 48 }
124  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
125  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
126    
127    ## XML-only states
128    sub PI_STATE () { 51 }
129    sub PI_TARGET_STATE () { 52 }
130    sub PI_TARGET_AFTER_STATE () { 53 }
131    sub PI_DATA_STATE () { 54 }
132    sub PI_AFTER_STATE () { 55 }
133    sub PI_DATA_AFTER_STATE () { 56 }
134    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
135    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
136    
137  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
138  ## list and descriptions)  ## list and descriptions)
139    
# Line 178  sub _initialize_tokenizer ($) { Line 198  sub _initialize_tokenizer ($) {
198    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
199    
200    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
201    $self->{s_kwd} = ''; # state keyword    $self->{s_kwd} = ''; # Data state keyword
202      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
203    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
204    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
205    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 208  sub _initialize_tokenizer ($) { Line 229  sub _initialize_tokenizer ($) {
229    
230  ## A token has:  ## A token has:
231  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
232  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
233  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
234  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
235    ##   ->{target} (PI_TOKEN)
236  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
237  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
238  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 218  sub _initialize_tokenizer ($) { Line 240  sub _initialize_tokenizer ($) {
240  ##        ->{name}  ##        ->{name}
241  ##        ->{value}  ##        ->{value}
242  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
243  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
244    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
245    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
246    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
247    ##   ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
248    
249  ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
250  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
251  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 238  my $is_space = { Line 265  my $is_space = {
265    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
266    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
267    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
268    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
269    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
270    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
271  };  };
# Line 498  sub _get_next_token ($) { Line 525  sub _get_next_token ($) {
525        return  ($token);        return  ($token);
526        redo A;        redo A;
527      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
528          ## XML5: "tag state".
529    
530        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
531          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
532                        
# Line 516  sub _get_next_token ($) { Line 545  sub _get_next_token ($) {
545            redo A;            redo A;
546          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
547                        
548            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
549            #            #
550          } else {          } else {
551                        
552              $self->{s_kwd} = '';
553            #            #
554          }          }
555    
556          ## reconsume          ## reconsume
557          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
         $self->{s_kwd} = '';  
558          return  ({type => CHARACTER_TOKEN, data => '<',          return  ({type => CHARACTER_TOKEN, data => '<',
559                    line => $self->{line_prev},                    line => $self->{line_prev},
560                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 629  sub _get_next_token ($) { Line 658  sub _get_next_token ($) {
658    
659            redo A;            redo A;
660          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
661                        if ($self->{is_xml}) {
662            $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',              
663                            line => $self->{line_prev},              $self->{state} = PI_STATE;
664                            column => $self->{column_prev});              
665            $self->{state} = BOGUS_COMMENT_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
666            $self->{ct} = {type => COMMENT_TOKEN, data => '',        $self->{line_prev} = $self->{line};
667                                      line => $self->{line_prev},        $self->{column_prev} = $self->{column};
668                                      column => $self->{column_prev},        $self->{column}++;
669                                     };        $self->{nc}
670            ## $self->{nc} is intentionally left as is            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
671            redo A;      } else {
672          } else {        $self->{set_nc}->($self);
673        }
674      
675                redo A;
676              } else {
677                
678                $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
679                                line => $self->{line_prev},
680                                column => $self->{column_prev});
681                $self->{state} = BOGUS_COMMENT_STATE;
682                $self->{ct} = {type => COMMENT_TOKEN, data => '',
683                               line => $self->{line_prev},
684                               column => $self->{column_prev},
685                              };
686                ## $self->{nc} is intentionally left as is
687                redo A;
688              }
689            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
690                        
691            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
692                            line => $self->{line_prev},                            line => $self->{line_prev},
# Line 655  sub _get_next_token ($) { Line 701  sub _get_next_token ($) {
701                     });                     });
702    
703            redo A;            redo A;
704            } else {
705              ## XML5: "<:" is a parse error.
706              
707              $self->{ct} = {type => START_TAG_TOKEN,
708                                        tag_name => chr ($self->{nc}),
709                                        line => $self->{line_prev},
710                                        column => $self->{column_prev}};
711              $self->{state} = TAG_NAME_STATE;
712              
713        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
714          $self->{line_prev} = $self->{line};
715          $self->{column_prev} = $self->{column};
716          $self->{column}++;
717          $self->{nc}
718              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
719        } else {
720          $self->{set_nc}->($self);
721        }
722      
723              redo A;
724          }          }
725        } else {        } else {
726          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 663  sub _get_next_token ($) { Line 729  sub _get_next_token ($) {
729        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
730        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
731    
732          ## XML5: "end tag state".
733    
734        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
735        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
736          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
737            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
738            $self->{s_kwd} = '';            $self->{kwd} = '';
739            ## Reconsume.            ## Reconsume.
740            redo A;            redo A;
741          } else {          } else {
# Line 724  sub _get_next_token ($) { Line 792  sub _get_next_token ($) {
792        
793          redo A;          redo A;
794        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
795          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
796                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
797                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
798          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
799          $self->{s_kwd} = '';          $self->{s_kwd} = '';
800                    if ($self->{is_xml}) {
801              
802              ## XML5: No parse error.
803              
804              ## NOTE: This parser raises a parse error, since it supports
805              ## XML1, not XML5.
806    
807              ## NOTE: A short end tag token.
808              my $ct = {type => END_TAG_TOKEN,
809                        tag_name => '',
810                        line => $self->{line_prev},
811                        column => $self->{column_prev} - 1,
812                       };
813              
814      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
815        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
816        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 741  sub _get_next_token ($) { Line 821  sub _get_next_token ($) {
821        $self->{set_nc}->($self);        $self->{set_nc}->($self);
822      }      }
823        
824              return  ($ct);
825            } else {
826              
827              
828        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
829          $self->{line_prev} = $self->{line};
830          $self->{column_prev} = $self->{column};
831          $self->{column}++;
832          $self->{nc}
833              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
834        } else {
835          $self->{set_nc}->($self);
836        }
837      
838            }
839          redo A;          redo A;
840        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
841                    
# Line 754  sub _get_next_token ($) { Line 849  sub _get_next_token ($) {
849                   });                   });
850    
851          redo A;          redo A;
852        } else {        } elsif (not $self->{is_xml} or
853                   $is_space->{$self->{nc}}) {
854                    
855          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
856                            line => $self->{line_prev}, # "<" of "</"
857                            column => $self->{column_prev} - 1);
858          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
859          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
860                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 769  sub _get_next_token ($) { Line 867  sub _get_next_token ($) {
867          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
868          ## "bogus comment state" entry.          ## "bogus comment state" entry.
869          redo A;          redo A;
870          } else {
871            ## XML5: "</:" is a parse error.
872            
873            $self->{ct} = {type => END_TAG_TOKEN,
874                           tag_name => chr ($self->{nc}),
875                           line => $l, column => $c};
876            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
877            
878        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
879          $self->{line_prev} = $self->{line};
880          $self->{column_prev} = $self->{column};
881          $self->{column}++;
882          $self->{nc}
883              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
884        } else {
885          $self->{set_nc}->($self);
886        }
887      
888            redo A;
889        }        }
890      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
891        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
892        if (length $ch) {        if (length $ch) {
893          my $CH = $ch;          my $CH = $ch;
894          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 779  sub _get_next_token ($) { Line 896  sub _get_next_token ($) {
896          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
897                        
898            ## Stay in the state.            ## Stay in the state.
899            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
900                        
901      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
902        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 798  sub _get_next_token ($) { Line 915  sub _get_next_token ($) {
915            $self->{s_kwd} = '';            $self->{s_kwd} = '';
916            ## Reconsume.            ## Reconsume.
917            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
918                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
919                      line => $self->{line_prev},                      line => $self->{line_prev},
920                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
921                     });                     });
922            redo A;            redo A;
923          }          }
# Line 816  sub _get_next_token ($) { Line 933  sub _get_next_token ($) {
933            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
934            $self->{s_kwd} = '';            $self->{s_kwd} = '';
935            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
936                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
937                      line => $self->{line_prev},                      line => $self->{line_prev},
938                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
939                     });                     });
940            redo A;            redo A;
941          } else {          } else {
# Line 827  sub _get_next_token ($) { Line 944  sub _get_next_token ($) {
944                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
945                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
946                   line => $self->{line_prev},                   line => $self->{line_prev},
947                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
948            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
949            ## Reconsume.            ## Reconsume.
950            redo A;            redo A;
# Line 959  sub _get_next_token ($) { Line 1076  sub _get_next_token ($) {
1076          redo A;          redo A;
1077        }        }
1078      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1079          ## XML5: "Tag attribute name before state".
1080    
1081        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1082                    
1083          ## Stay in the state          ## Stay in the state
# Line 1071  sub _get_next_token ($) { Line 1190  sub _get_next_token ($) {
1190               0x003D => 1, # =               0x003D => 1, # =
1191              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1192                        
1193              ## XML5: Not a parse error.
1194            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1195          } else {          } else {
1196                        
1197              ## XML5: ":" raises a parse error and is ignored.
1198          }          }
1199          $self->{ca}          $self->{ca}
1200              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 1094  sub _get_next_token ($) { Line 1215  sub _get_next_token ($) {
1215          redo A;          redo A;
1216        }        }
1217      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1218          ## XML5: "Tag attribute name state".
1219    
1220        my $before_leave = sub {        my $before_leave = sub {
1221          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
1222              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 1104  sub _get_next_token ($) { Line 1227  sub _get_next_token ($) {
1227                        
1228            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
1229              = $self->{ca};              = $self->{ca};
1230              $self->{ca}->{index} = ++$self->{ct}->{last_index};
1231          }          }
1232        }; # $before_leave        }; # $before_leave
1233    
# Line 1140  sub _get_next_token ($) { Line 1264  sub _get_next_token ($) {
1264        
1265          redo A;          redo A;
1266        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1267            if ($self->{is_xml}) {
1268              
1269              ## XML5: Not a parse error.
1270              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1271            } else {
1272              
1273            }
1274    
1275          $before_leave->();          $before_leave->();
1276          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1277                        
# Line 1189  sub _get_next_token ($) { Line 1321  sub _get_next_token ($) {
1321        
1322          redo A;          redo A;
1323        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1324            if ($self->{is_xml}) {
1325              
1326              ## XML5: Not a parse error.
1327              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1328            } else {
1329              
1330            }
1331                    
1332          $before_leave->();          $before_leave->();
1333          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
# Line 1233  sub _get_next_token ($) { Line 1372  sub _get_next_token ($) {
1372          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1373              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1374                        
1375              ## XML5: Not a parse error.
1376            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1377          } else {          } else {
1378                        
# Line 1253  sub _get_next_token ($) { Line 1393  sub _get_next_token ($) {
1393          redo A;          redo A;
1394        }        }
1395      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1396          ## XML5: "Tag attribute name after state".
1397          
1398        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1399                    
1400          ## Stay in the state          ## Stay in the state
# Line 1284  sub _get_next_token ($) { Line 1426  sub _get_next_token ($) {
1426        
1427          redo A;          redo A;
1428        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1429            if ($self->{is_xml}) {
1430              
1431              ## XML5: Not a parse error.
1432              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1433            } else {
1434              
1435            }
1436    
1437          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1438                        
1439            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1337  sub _get_next_token ($) { Line 1487  sub _get_next_token ($) {
1487        
1488          redo A;          redo A;
1489        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1490            if ($self->{is_xml}) {
1491              
1492              ## XML5: Not a parse error.
1493              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494            } else {
1495              
1496            }
1497                    
1498          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1499                    
# Line 1376  sub _get_next_token ($) { Line 1533  sub _get_next_token ($) {
1533    
1534          redo A;          redo A;
1535        } else {        } else {
1536            if ($self->{is_xml}) {
1537              
1538              ## XML5: Not a parse error.
1539              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1540            } else {
1541              
1542            }
1543    
1544          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1545              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1546                        
1547              ## XML5: Not a parse error.
1548            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1549          } else {          } else {
1550                        
# Line 1402  sub _get_next_token ($) { Line 1568  sub _get_next_token ($) {
1568          redo A;                  redo A;        
1569        }        }
1570      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1571          ## XML5: "Tag attribute value before state".
1572    
1573        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1574                    
1575          ## Stay in the state          ## Stay in the state
# Line 1513  sub _get_next_token ($) { Line 1681  sub _get_next_token ($) {
1681        } else {        } else {
1682          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1683                        
1684              ## XML5: Not a parse error.
1685            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1686            } elsif ($self->{is_xml}) {
1687              
1688              ## XML5: No parse error.
1689              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1690          } else {          } else {
1691                        
1692          }          }
# Line 1533  sub _get_next_token ($) { Line 1706  sub _get_next_token ($) {
1706          redo A;          redo A;
1707        }        }
1708      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1709          ## XML5: "Tag attribute value double quoted state".
1710          
1711        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1712                    
1713            ## XML5: "Tag attribute name before state".
1714          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1715                    
1716      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1550  sub _get_next_token ($) { Line 1726  sub _get_next_token ($) {
1726          redo A;          redo A;
1727        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1728                    
1729            ## XML5: Not defined yet.
1730    
1731          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1732          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1733          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1594  sub _get_next_token ($) { Line 1772  sub _get_next_token ($) {
1772    
1773          redo A;          redo A;
1774        } else {        } else {
1775                    if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1776              
1777              ## XML5: Not a parse error.
1778              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1779            } else {
1780              
1781            }
1782          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1783          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1784                                q["&],                                q["&<],
1785                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1786    
1787          ## Stay in the state          ## Stay in the state
# Line 1615  sub _get_next_token ($) { Line 1799  sub _get_next_token ($) {
1799          redo A;          redo A;
1800        }        }
1801      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1802          ## XML5: "Tag attribute value single quoted state".
1803    
1804        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1805                    
1806            ## XML5: "Before attribute name state" (sic).
1807          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1808                    
1809      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1632  sub _get_next_token ($) { Line 1819  sub _get_next_token ($) {
1819          redo A;          redo A;
1820        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1821                    
1822            ## XML5: Not defined yet.
1823    
1824          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1825          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1826          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1676  sub _get_next_token ($) { Line 1865  sub _get_next_token ($) {
1865    
1866          redo A;          redo A;
1867        } else {        } else {
1868                    if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1869              
1870              ## XML5: Not a parse error.
1871              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1872            } else {
1873              
1874            }
1875          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1876          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1877                                q['&],                                q['&<],
1878                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1879    
1880          ## Stay in the state          ## Stay in the state
# Line 1697  sub _get_next_token ($) { Line 1892  sub _get_next_token ($) {
1892          redo A;          redo A;
1893        }        }
1894      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1895          ## XML5: "Tag attribute value unquoted state".
1896    
1897        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1898                    
1899            ## XML5: "Tag attribute name before state".
1900          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1901                    
1902      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1714  sub _get_next_token ($) { Line 1912  sub _get_next_token ($) {
1912          redo A;          redo A;
1913        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1914                    
1915    
1916            ## XML5: Not defined yet.
1917    
1918          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1919          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1920          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1797  sub _get_next_token ($) { Line 1998  sub _get_next_token ($) {
1998               0x003D => 1, # =               0x003D => 1, # =
1999              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2000                        
2001              ## XML5: Not a parse error.
2002            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2003          } else {          } else {
2004                        
# Line 1913  sub _get_next_token ($) { Line 2115  sub _get_next_token ($) {
2115          redo A;          redo A;
2116        }        }
2117      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2118          ## XML5: "Empty tag state".
2119    
2120        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2121          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
2122                        
# Line 1964  sub _get_next_token ($) { Line 2168  sub _get_next_token ($) {
2168          } else {          } else {
2169            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2170          }          }
2171            ## XML5: "Tag attribute name before state".
2172          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2173          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2174          ## Reconsume.          ## Reconsume.
# Line 2053  sub _get_next_token ($) { Line 2258  sub _get_next_token ($) {
2258          ## ASCII case-insensitive.          ## ASCII case-insensitive.
2259                    
2260          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
2261          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2262                    
2263      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2264        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2072  sub _get_next_token ($) { Line 2277  sub _get_next_token ($) {
2277                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2278                                                    
2279          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
2280          $self->{s_kwd} = '[';          $self->{kwd} = '[';
2281                    
2282      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2283        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2106  sub _get_next_token ($) { Line 2311  sub _get_next_token ($) {
2311                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2312                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
2313                                   };                                   };
2314          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2315                    
2316      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2317        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2142  sub _get_next_token ($) { Line 2347  sub _get_next_token ($) {
2347              0x0054, # T              0x0054, # T
2348              0x0059, # Y              0x0059, # Y
2349              0x0050, # P              0x0050, # P
2350            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2351            $self->{nc} == [            $self->{nc} == [
2352              undef,              undef,
2353              0x006F, # o              0x006F, # o
# Line 2150  sub _get_next_token ($) { Line 2355  sub _get_next_token ($) {
2355              0x0074, # t              0x0074, # t
2356              0x0079, # y              0x0079, # y
2357              0x0070, # p              0x0070, # p
2358            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2359                    
2360          ## Stay in the state.          ## Stay in the state.
2361          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2362                    
2363      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2364        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2166  sub _get_next_token ($) { Line 2371  sub _get_next_token ($) {
2371      }      }
2372        
2373          redo A;          redo A;
2374        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
2375                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2376                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2377                    if ($self->{is_xml} and
2378                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2379              
2380              ## XML5: case-sensitive.
2381              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2382                              text => 'DOCTYPE',
2383                              line => $self->{line_prev},
2384                              column => $self->{column_prev} - 5);
2385            } else {
2386              
2387            }
2388          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
2389          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
2390                                    quirks => 1,                                    quirks => 1,
# Line 2192  sub _get_next_token ($) { Line 2407  sub _get_next_token ($) {
2407                                    
2408          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2409                          line => $self->{line_prev},                          line => $self->{line_prev},
2410                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2411          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2412          ## Reconsume.          ## Reconsume.
2413          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2414                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2415                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2416                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2417                                   };                                   };
2418          redo A;          redo A;
2419        }        }
# Line 2209  sub _get_next_token ($) { Line 2424  sub _get_next_token ($) {
2424              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
2425              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
2426              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
2427            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
2428                    
2429          ## Stay in the state.          ## Stay in the state.
2430          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2431                    
2432      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2433        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2225  sub _get_next_token ($) { Line 2440  sub _get_next_token ($) {
2440      }      }
2441        
2442          redo A;          redo A;
2443        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
2444                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2445                    if ($self->{is_xml} and
2446                not $self->{tainted} and
2447                @{$self->{open_elements} or []} == 0) {
2448              
2449              $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2450                              line => $self->{line_prev},
2451                              column => $self->{column_prev} - 7);
2452              $self->{tainted} = 1;
2453            } else {
2454              
2455            }
2456    
2457          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
2458                                    data => '',                                    data => '',
2459                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 2249  sub _get_next_token ($) { Line 2475  sub _get_next_token ($) {
2475                    
2476          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2477                          line => $self->{line_prev},                          line => $self->{line_prev},
2478                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2479          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2480          ## Reconsume.          ## Reconsume.
2481          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2482                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2483                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2484                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2485                                   };                                   };
2486          redo A;          redo A;
2487        }        }
# Line 2435  sub _get_next_token ($) { Line 2661  sub _get_next_token ($) {
2661          redo A;          redo A;
2662        }        }
2663      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2664          ## XML5: "comment dash state".
2665    
2666        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2667                    
2668          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 2453  sub _get_next_token ($) { Line 2681  sub _get_next_token ($) {
2681        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2682                    
2683          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
         $self->{s_kwd} = '';  
2684          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2685          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2686          ## reconsume          ## reconsume
# Line 2500  sub _get_next_token ($) { Line 2727  sub _get_next_token ($) {
2727          redo A;          redo A;
2728        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2729                    
2730            ## XML5: Not a parse error.
2731          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2732                          line => $self->{line_prev},                          line => $self->{line_prev},
2733                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2529  sub _get_next_token ($) { Line 2757  sub _get_next_token ($) {
2757          redo A;          redo A;
2758        } else {        } else {
2759                    
2760            ## XML5: Not a parse error.
2761          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2762                          line => $self->{line_prev},                          line => $self->{line_prev},
2763                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2565  sub _get_next_token ($) { Line 2794  sub _get_next_token ($) {
2794          redo A;          redo A;
2795        } else {        } else {
2796                    
2797            ## XML5: Unless EOF, swith to the bogus comment state.
2798          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2799          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2800          ## reconsume          ## reconsume
2801          redo A;          redo A;
2802        }        }
2803      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2804          ## XML5: "DOCTYPE root name before state".
2805    
2806        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2807                    
2808          ## Stay in the state          ## Stay in the state
# Line 2588  sub _get_next_token ($) { Line 2820  sub _get_next_token ($) {
2820          redo A;          redo A;
2821        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
2822                    
2823            ## XML5: No parse error.
2824          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2825          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2826          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 2616  sub _get_next_token ($) { Line 2849  sub _get_next_token ($) {
2849          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
2850    
2851          redo A;          redo A;
2852          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2853            
2854            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2855            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2856            
2857        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2858          $self->{line_prev} = $self->{line};
2859          $self->{column_prev} = $self->{column};
2860          $self->{column}++;
2861          $self->{nc}
2862              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2863        } else {
2864          $self->{set_nc}->($self);
2865        }
2866      
2867            redo A;
2868        } else {        } else {
2869                    
2870          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 2635  sub _get_next_token ($) { Line 2884  sub _get_next_token ($) {
2884          redo A;          redo A;
2885        }        }
2886      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2887  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
2888    
2889          ## ISSUE: Redundant "First," in the spec.
2890    
2891        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2892                    
2893          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 2681  sub _get_next_token ($) { Line 2933  sub _get_next_token ($) {
2933          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
2934    
2935          redo A;          redo A;
2936          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2937            
2938            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2939            
2940        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2941          $self->{line_prev} = $self->{line};
2942          $self->{column_prev} = $self->{column};
2943          $self->{column}++;
2944          $self->{nc}
2945              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2946        } else {
2947          $self->{set_nc}->($self);
2948        }
2949      
2950            redo A;
2951        } else {        } else {
2952                    
2953          $self->{ct}->{name}          $self->{ct}->{name}
# Line 2700  sub _get_next_token ($) { Line 2967  sub _get_next_token ($) {
2967          redo A;          redo A;
2968        }        }
2969      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2970          ## XML5: Corresponding to XML5's "DOCTYPE root name after
2971          ## state", but implemented differently.
2972    
2973        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2974                    
2975          ## Stay in the state          ## Stay in the state
# Line 2747  sub _get_next_token ($) { Line 3017  sub _get_next_token ($) {
3017          redo A;          redo A;
3018        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
3019                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
3020            
3021          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
3022          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3023                    
3024      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3025        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2763  sub _get_next_token ($) { Line 3034  sub _get_next_token ($) {
3034          redo A;          redo A;
3035        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
3036                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
3037            
3038          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
3039          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3040            
3041        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3042          $self->{line_prev} = $self->{line};
3043          $self->{column_prev} = $self->{column};
3044          $self->{column}++;
3045          $self->{nc}
3046              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3047        } else {
3048          $self->{set_nc}->($self);
3049        }
3050      
3051            redo A;
3052          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3053            
3054            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3055            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3056                    
3057      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3058        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2804  sub _get_next_token ($) { Line 3092  sub _get_next_token ($) {
3092              0x0042, # B              0x0042, # B
3093              0x004C, # L              0x004C, # L
3094              0x0049, # I              0x0049, # I
3095            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3096            $self->{nc} == [            $self->{nc} == [
3097              undef,              undef,
3098              0x0075, # u              0x0075, # u
3099              0x0062, # b              0x0062, # b
3100              0x006C, # l              0x006C, # l
3101              0x0069, # i              0x0069, # i
3102            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3103                    
3104          ## Stay in the state.          ## Stay in the state.
3105          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3106                    
3107      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3108        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2827  sub _get_next_token ($) { Line 3115  sub _get_next_token ($) {
3115      }      }
3116        
3117          redo A;          redo A;
3118        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3119                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
3120                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
3121                    if ($self->{is_xml} and
3122                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3123              
3124              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3125                              text => 'PUBLIC',
3126                              line => $self->{line_prev},
3127                              column => $self->{column_prev} - 4);
3128            } else {
3129              
3130            }
3131          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3132                    
3133      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2848  sub _get_next_token ($) { Line 3145  sub _get_next_token ($) {
3145                    
3146          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3147                          line => $self->{line_prev},                          line => $self->{line_prev},
3148                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3149          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3150    
3151          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 2863  sub _get_next_token ($) { Line 3160  sub _get_next_token ($) {
3160              0x0053, # S              0x0053, # S
3161              0x0054, # T              0x0054, # T
3162              0x0045, # E              0x0045, # E
3163            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3164            $self->{nc} == [            $self->{nc} == [
3165              undef,              undef,
3166              0x0079, # y              0x0079, # y
3167              0x0073, # s              0x0073, # s
3168              0x0074, # t              0x0074, # t
3169              0x0065, # e              0x0065, # e
3170            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3171                    
3172          ## Stay in the state.          ## Stay in the state.
3173          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3174                    
3175      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3176        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2886  sub _get_next_token ($) { Line 3183  sub _get_next_token ($) {
3183      }      }
3184        
3185          redo A;          redo A;
3186        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3187                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
3188                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
3189                    if ($self->{is_xml} and
3190                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3191              
3192              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3193                              text => 'SYSTEM',
3194                              line => $self->{line_prev},
3195                              column => $self->{column_prev} - 4);
3196            } else {
3197              
3198            }
3199          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3200                    
3201      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2907  sub _get_next_token ($) { Line 3213  sub _get_next_token ($) {
3213                    
3214          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3215                          line => $self->{line_prev},                          line => $self->{line_prev},
3216                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3217          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3218    
3219          $self->{state} = BOGUS_DOCTYPE_STATE;          $self->{state} = BOGUS_DOCTYPE_STATE;
# Line 2996  sub _get_next_token ($) { Line 3302  sub _get_next_token ($) {
3302          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3303    
3304          redo A;          redo A;
3305          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3306            
3307            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3308            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3309            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3310            
3311        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3312          $self->{line_prev} = $self->{line};
3313          $self->{column_prev} = $self->{column};
3314          $self->{column}++;
3315          $self->{nc}
3316              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3317        } else {
3318          $self->{set_nc}->($self);
3319        }
3320      
3321            redo A;
3322        } else {        } else {
3323                    
3324          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
# Line 3206  sub _get_next_token ($) { Line 3529  sub _get_next_token ($) {
3529        
3530          redo A;          redo A;
3531        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3532                    if ($self->{is_xml}) {
3533              
3534              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3535            } else {
3536              
3537            }
3538          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3539          $self->{s_kwd} = '';          $self->{s_kwd} = '';
3540                    
# Line 3236  sub _get_next_token ($) { Line 3564  sub _get_next_token ($) {
3564          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3565    
3566          redo A;          redo A;
3567          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3568            
3569            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3570            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3571            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3572            
3573        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3574          $self->{line_prev} = $self->{line};
3575          $self->{column_prev} = $self->{column};
3576          $self->{column}++;
3577          $self->{nc}
3578              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3579        } else {
3580          $self->{set_nc}->($self);
3581        }
3582      
3583            redo A;
3584        } else {        } else {
3585                    
3586          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
# Line 3336  sub _get_next_token ($) { Line 3681  sub _get_next_token ($) {
3681          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3682    
3683          redo A;          redo A;
3684          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3685            
3686            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3687    
3688            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3689            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3690            
3691        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3692          $self->{line_prev} = $self->{line};
3693          $self->{column_prev} = $self->{column};
3694          $self->{column}++;
3695          $self->{nc}
3696              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3697        } else {
3698          $self->{set_nc}->($self);
3699        }
3700      
3701            redo A;
3702        } else {        } else {
3703                    
3704          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
# Line 3371  sub _get_next_token ($) { Line 3734  sub _get_next_token ($) {
3734      }      }
3735        
3736          redo A;          redo A;
3737        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
3738                    
3739          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3740    
# Line 3442  sub _get_next_token ($) { Line 3805  sub _get_next_token ($) {
3805      }      }
3806        
3807          redo A;          redo A;
3808        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
3809                    
3810          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3811    
# Line 3543  sub _get_next_token ($) { Line 3906  sub _get_next_token ($) {
3906          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3907    
3908          redo A;          redo A;
3909          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3910            
3911            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3912            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3913            
3914        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3915          $self->{line_prev} = $self->{line};
3916          $self->{column_prev} = $self->{column};
3917          $self->{column}++;
3918          $self->{nc}
3919              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3920        } else {
3921          $self->{set_nc}->($self);
3922        }
3923      
3924            redo A;
3925        } else {        } else {
3926                    
3927          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
# Line 3582  sub _get_next_token ($) { Line 3961  sub _get_next_token ($) {
3961          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3962    
3963          redo A;          redo A;
3964          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3965            if ($self->{ct}->{has_internal_subset}) { # DOCTYPE
3966              
3967              ## Stay in the state.
3968              
3969        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3970          $self->{line_prev} = $self->{line};
3971          $self->{column_prev} = $self->{column};
3972          $self->{column}++;
3973          $self->{nc}
3974              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3975        } else {
3976          $self->{set_nc}->($self);
3977        }
3978      
3979              redo A;
3980            } else {
3981              
3982              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3983              $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3984              
3985        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3986          $self->{line_prev} = $self->{line};
3987          $self->{column_prev} = $self->{column};
3988          $self->{column}++;
3989          $self->{nc}
3990              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3991        } else {
3992          $self->{set_nc}->($self);
3993        }
3994      
3995              redo A;
3996            }
3997        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3998                    
3999          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 3594  sub _get_next_token ($) { Line 4006  sub _get_next_token ($) {
4006        } else {        } else {
4007                    
4008          my $s = '';          my $s = '';
4009          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
4010    
4011          ## Stay in the state          ## Stay in the state
4012                    
# Line 3614  sub _get_next_token ($) { Line 4026  sub _get_next_token ($) {
4026        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
4027        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4028        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
4029    
4030          ## XML5: "CDATA state".
4031                
4032        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4033                    
# Line 3631  sub _get_next_token ($) { Line 4045  sub _get_next_token ($) {
4045        
4046          redo A;          redo A;
4047        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4048            if ($self->{is_xml}) {
4049              
4050              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4051            } else {
4052              
4053            }
4054    
4055          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4056          $self->{s_kwd} = '';          $self->{s_kwd} = '';
4057                    ## Reconsume.
     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {  
       $self->{line_prev} = $self->{line};  
       $self->{column_prev} = $self->{column};  
       $self->{column}++;  
       $self->{nc}  
           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);  
     } else {  
       $self->{set_nc}->($self);  
     }  
     
4058          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
4059                        
4060            return  ($self->{ct}); # character            return  ($self->{ct}); # character
# Line 3676  sub _get_next_token ($) { Line 4087  sub _get_next_token ($) {
4087    
4088        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
4089      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4090          ## XML5: "CDATA bracket state".
4091    
4092        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4093                    
4094          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 3693  sub _get_next_token ($) { Line 4106  sub _get_next_token ($) {
4106          redo A;          redo A;
4107        } else {        } else {
4108                    
4109            ## XML5: If EOF, "]" is not appended and changed to the data state.
4110          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
4111          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4112          ## Reconsume.          ## Reconsume.
4113          redo A;          redo A;
4114        }        }
4115      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4116          ## XML5: "CDATA end state".
4117    
4118        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4119          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4120          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 3741  sub _get_next_token ($) { Line 4157  sub _get_next_token ($) {
4157                    
4158          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
4159          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
4160          ## Reconsume.          ## Reconsume. ## XML5: Emit.
4161          redo A;          redo A;
4162        }        }
4163      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 3758  sub _get_next_token ($) { Line 4174  sub _get_next_token ($) {
4174        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
4175                    
4176          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
4177          $self->{s_kwd} = '#';          $self->{kwd} = '#';
4178                    
4179      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4180        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3778  sub _get_next_token ($) { Line 4194  sub _get_next_token ($) {
4194                    
4195          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
4196          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
4197          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
4198          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
4199          $self->{entity__match} = 0;          $self->{entity__match} = 0;
4200                    
4201      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3829  sub _get_next_token ($) { Line 4245  sub _get_next_token ($) {
4245            $self->{nc} == 0x0058) { # X            $self->{nc} == 0x0058) { # X
4246                    
4247          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
4248          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4249                    
4250      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4251        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3846  sub _get_next_token ($) { Line 4262  sub _get_next_token ($) {
4262                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
4263                    
4264          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
4265          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
4266                    
4267      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4268        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3892  sub _get_next_token ($) { Line 4308  sub _get_next_token ($) {
4308        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
4309            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
4310                    
4311          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
4312          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4313                    
4314          ## Stay in the state.          ## Stay in the state.
4315                    
# Line 3929  sub _get_next_token ($) { Line 4345  sub _get_next_token ($) {
4345          #          #
4346        }        }
4347    
4348        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4349        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4350        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4351        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 3952  sub _get_next_token ($) { Line 4368  sub _get_next_token ($) {
4368          $self->{s_kwd} = '';          $self->{s_kwd} = '';
4369          ## Reconsume.          ## Reconsume.
4370          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4371                      has_reference => 1,
4372                    line => $l, column => $c,                    line => $l, column => $c,
4373                   });                   });
4374          redo A;          redo A;
# Line 3971  sub _get_next_token ($) { Line 4388  sub _get_next_token ($) {
4388          # 0..9, A..F, a..f          # 0..9, A..F, a..f
4389                    
4390          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
4391          $self->{s_kwd} = 0;          $self->{kwd} = 0;
4392          ## Reconsume.          ## Reconsume.
4393          redo A;          redo A;
4394        } else {        } else {
# Line 3989  sub _get_next_token ($) { Line 4406  sub _get_next_token ($) {
4406            $self->{s_kwd} = '';            $self->{s_kwd} = '';
4407            ## Reconsume.            ## Reconsume.
4408            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4409                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
4410                      line => $self->{line_prev},                      line => $self->{line_prev},
4411                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
4412                     });                     });
4413            redo A;            redo A;
4414          } else {          } else {
4415                        
4416            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
4417            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4418            $self->{s_kwd} = '';            $self->{s_kwd} = '';
4419            ## Reconsume.            ## Reconsume.
# Line 4007  sub _get_next_token ($) { Line 4424  sub _get_next_token ($) {
4424        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4425          # 0..9          # 0..9
4426                    
4427          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4428          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4429          ## Stay in the state.          ## Stay in the state.
4430                    
4431      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4025  sub _get_next_token ($) { Line 4442  sub _get_next_token ($) {
4442        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
4443                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
4444                    
4445          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4446          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
4447          ## Stay in the state.          ## Stay in the state.
4448                    
4449      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4043  sub _get_next_token ($) { Line 4460  sub _get_next_token ($) {
4460        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
4461                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
4462                    
4463          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4464          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
4465          ## Stay in the state.          ## Stay in the state.
4466                    
4467      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4081  sub _get_next_token ($) { Line 4498  sub _get_next_token ($) {
4498          #          #
4499        }        }
4500    
4501        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4502        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4503        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4504        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 4104  sub _get_next_token ($) { Line 4521  sub _get_next_token ($) {
4521          $self->{s_kwd} = '';          $self->{s_kwd} = '';
4522          ## Reconsume.          ## Reconsume.
4523          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4524                      has_reference => 1,
4525                    line => $l, column => $c,                    line => $l, column => $c,
4526                   });                   });
4527          redo A;          redo A;
# Line 4117  sub _get_next_token ($) { Line 4535  sub _get_next_token ($) {
4535          redo A;          redo A;
4536        }        }
4537      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
4538        if (length $self->{s_kwd} < 30 and        if (length $self->{kwd} < 30 and
4539            ## NOTE: Some number greater than the maximum length of entity name            ## NOTE: Some number greater than the maximum length of entity name
4540            ((0x0041 <= $self->{nc} and # a            ((0x0041 <= $self->{nc} and # a
4541              $self->{nc} <= 0x005A) or # x              $self->{nc} <= 0x005A) or # x
# Line 4127  sub _get_next_token ($) { Line 4545  sub _get_next_token ($) {
4545              $self->{nc} <= 0x0039) or # 9              $self->{nc} <= 0x0039) or # 9
4546             $self->{nc} == 0x003B)) { # ;             $self->{nc} == 0x003B)) { # ;
4547          our $EntityChar;          our $EntityChar;
4548          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4549          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}}) {
4550            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
4551                            
4552              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
4553              $self->{entity__match} = 1;              $self->{entity__match} = 1;
4554                            
4555      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4147  sub _get_next_token ($) { Line 4565  sub _get_next_token ($) {
4565              #              #
4566            } else {            } else {
4567                            
4568              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
4569              $self->{entity__match} = -1;              $self->{entity__match} = -1;
4570              ## Stay in the state.              ## Stay in the state.
4571                            
# Line 4195  sub _get_next_token ($) { Line 4613  sub _get_next_token ($) {
4613          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
4614              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
4615                        
4616            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
4617            #            #
4618          } else {          } else {
4619                        
# Line 4207  sub _get_next_token ($) { Line 4625  sub _get_next_token ($) {
4625                    
4626          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4627                          line => $self->{line_prev},                          line => $self->{line_prev},
4628                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
4629          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
4630          #          #
4631        }        }
4632        
# Line 4229  sub _get_next_token ($) { Line 4647  sub _get_next_token ($) {
4647          ## Reconsume.          ## Reconsume.
4648          return  ({type => CHARACTER_TOKEN,          return  ({type => CHARACTER_TOKEN,
4649                    data => $data,                    data => $data,
4650                      has_reference => $has_ref,
4651                    line => $self->{line_prev},                    line => $self->{line_prev},
4652                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
4653                   });                   });
4654          redo A;          redo A;
4655        } else {        } else {
# Line 4242  sub _get_next_token ($) { Line 4661  sub _get_next_token ($) {
4661          ## Reconsume.          ## Reconsume.
4662          redo A;          redo A;
4663        }        }
4664    
4665        ## XML-only states
4666    
4667        } elsif ($self->{state} == PI_STATE) {
4668          if ($is_space->{$self->{nc}} or
4669              $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
4670              $self->{nc} == -1) {
4671            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4672                            line => $self->{line_prev},
4673                            column => $self->{column_prev}
4674                                - 1 * ($self->{nc} != -1));
4675            $self->{state} = BOGUS_COMMENT_STATE;
4676            ## Reconsume.
4677            $self->{ct} = {type => COMMENT_TOKEN,
4678                           data => '?',
4679                           line => $self->{line_prev},
4680                           column => $self->{column_prev}
4681                               - 1 * ($self->{nc} != -1),
4682                          };
4683            redo A;
4684          } else {
4685            $self->{ct} = {type => PI_TOKEN,
4686                           target => chr $self->{nc},
4687                           data => '',
4688                           line => $self->{line_prev},
4689                           column => $self->{column_prev} - 1,
4690                          };
4691            $self->{state} = PI_TARGET_STATE;
4692            
4693        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4694          $self->{line_prev} = $self->{line};
4695          $self->{column_prev} = $self->{column};
4696          $self->{column}++;
4697          $self->{nc}
4698              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4699        } else {
4700          $self->{set_nc}->($self);
4701        }
4702      
4703            redo A;
4704          }
4705        } elsif ($self->{state} == PI_TARGET_STATE) {
4706          if ($is_space->{$self->{nc}}) {
4707            $self->{state} = PI_TARGET_AFTER_STATE;
4708            
4709        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4710          $self->{line_prev} = $self->{line};
4711          $self->{column_prev} = $self->{column};
4712          $self->{column}++;
4713          $self->{nc}
4714              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4715        } else {
4716          $self->{set_nc}->($self);
4717        }
4718      
4719            redo A;
4720          } elsif ($self->{nc} == -1) {
4721            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4722            $self->{state} = DATA_STATE;
4723            $self->{s_kwd} = '';
4724            ## Reconsume.
4725            return  ($self->{ct}); # pi
4726            redo A;
4727          } elsif ($self->{nc} == 0x003F) { # ?
4728            $self->{state} = PI_AFTER_STATE;
4729            
4730        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4731          $self->{line_prev} = $self->{line};
4732          $self->{column_prev} = $self->{column};
4733          $self->{column}++;
4734          $self->{nc}
4735              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4736        } else {
4737          $self->{set_nc}->($self);
4738        }
4739      
4740            redo A;
4741          } else {
4742            ## XML5: typo ("tag name" -> "target")
4743            $self->{ct}->{target} .= chr $self->{nc}; # pi
4744            
4745        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4746          $self->{line_prev} = $self->{line};
4747          $self->{column_prev} = $self->{column};
4748          $self->{column}++;
4749          $self->{nc}
4750              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4751        } else {
4752          $self->{set_nc}->($self);
4753        }
4754      
4755            redo A;
4756          }
4757        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
4758          if ($is_space->{$self->{nc}}) {
4759            ## Stay in the state.
4760            
4761        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4762          $self->{line_prev} = $self->{line};
4763          $self->{column_prev} = $self->{column};
4764          $self->{column}++;
4765          $self->{nc}
4766              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4767        } else {
4768          $self->{set_nc}->($self);
4769        }
4770      
4771            redo A;
4772          } else {
4773            $self->{state} = PI_DATA_STATE;
4774            ## Reprocess.
4775            redo A;
4776          }
4777        } elsif ($self->{state} == PI_DATA_STATE) {
4778          if ($self->{nc} == 0x003F) { # ?
4779            $self->{state} = PI_DATA_AFTER_STATE;
4780            
4781        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4782          $self->{line_prev} = $self->{line};
4783          $self->{column_prev} = $self->{column};
4784          $self->{column}++;
4785          $self->{nc}
4786              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4787        } else {
4788          $self->{set_nc}->($self);
4789        }
4790      
4791            redo A;
4792          } elsif ($self->{nc} == -1) {
4793            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4794            $self->{state} = DATA_STATE;
4795            $self->{s_kwd} = '';
4796            ## Reprocess.
4797            return  ($self->{ct}); # pi
4798            redo A;
4799          } else {
4800            $self->{ct}->{data} .= chr $self->{nc}; # pi
4801            $self->{read_until}->($self->{ct}->{data}, q[?],
4802                                  length $self->{ct}->{data});
4803            ## Stay in the state.
4804            
4805        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4806          $self->{line_prev} = $self->{line};
4807          $self->{column_prev} = $self->{column};
4808          $self->{column}++;
4809          $self->{nc}
4810              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4811        } else {
4812          $self->{set_nc}->($self);
4813        }
4814      
4815            ## Reprocess.
4816            redo A;
4817          }
4818        } elsif ($self->{state} == PI_AFTER_STATE) {
4819          if ($self->{nc} == 0x003E) { # >
4820            $self->{state} = DATA_STATE;
4821            $self->{s_kwd} = '';
4822            
4823        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4824          $self->{line_prev} = $self->{line};
4825          $self->{column_prev} = $self->{column};
4826          $self->{column}++;
4827          $self->{nc}
4828              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4829        } else {
4830          $self->{set_nc}->($self);
4831        }
4832      
4833            return  ($self->{ct}); # pi
4834            redo A;
4835          } elsif ($self->{nc} == 0x003F) { # ?
4836            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4837                            line => $self->{line_prev},
4838                            column => $self->{column_prev}); ## XML5: no error
4839            $self->{ct}->{data} .= '?';
4840            $self->{state} = PI_DATA_AFTER_STATE;
4841            
4842        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4843          $self->{line_prev} = $self->{line};
4844          $self->{column_prev} = $self->{column};
4845          $self->{column}++;
4846          $self->{nc}
4847              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4848        } else {
4849          $self->{set_nc}->($self);
4850        }
4851      
4852            redo A;
4853          } else {
4854            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4855                            line => $self->{line_prev},
4856                            column => $self->{column_prev}
4857                                + 1 * ($self->{nc} == -1)); ## XML5: no error
4858            $self->{ct}->{data} .= '?'; ## XML5: not appended
4859            $self->{state} = PI_DATA_STATE;
4860            ## Reprocess.
4861            redo A;
4862          }
4863        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4864          ## XML5: Same as "pi after state" in XML5
4865          if ($self->{nc} == 0x003E) { # >
4866            $self->{state} = DATA_STATE;
4867            $self->{s_kwd} = '';
4868            
4869        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4870          $self->{line_prev} = $self->{line};
4871          $self->{column_prev} = $self->{column};
4872          $self->{column}++;
4873          $self->{nc}
4874              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4875        } else {
4876          $self->{set_nc}->($self);
4877        }
4878      
4879            return  ($self->{ct}); # pi
4880            redo A;
4881          } elsif ($self->{nc} == 0x003F) { # ?
4882            $self->{ct}->{data} .= '?';
4883            ## Stay in the state.
4884            
4885        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4886          $self->{line_prev} = $self->{line};
4887          $self->{column_prev} = $self->{column};
4888          $self->{column}++;
4889          $self->{nc}
4890              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4891        } else {
4892          $self->{set_nc}->($self);
4893        }
4894      
4895            redo A;
4896          } else {
4897            $self->{ct}->{data} .= '?'; ## XML5: not appended
4898            $self->{state} = PI_DATA_STATE;
4899            ## Reprocess.
4900            redo A;
4901          }
4902    
4903        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
4904          if ($self->{nc} == 0x003C) { # <
4905            ## TODO:
4906            
4907        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4908          $self->{line_prev} = $self->{line};
4909          $self->{column_prev} = $self->{column};
4910          $self->{column}++;
4911          $self->{nc}
4912              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4913        } else {
4914          $self->{set_nc}->($self);
4915        }
4916      
4917            redo A;
4918          } elsif ($self->{nc} == 0x0025) { # %
4919            ## XML5: Not defined yet.
4920    
4921            ## TODO:
4922            
4923        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4924          $self->{line_prev} = $self->{line};
4925          $self->{column_prev} = $self->{column};
4926          $self->{column}++;
4927          $self->{nc}
4928              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4929        } else {
4930          $self->{set_nc}->($self);
4931        }
4932      
4933            redo A;
4934          } elsif ($self->{nc} == 0x005D) { # ]
4935            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
4936            
4937        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4938          $self->{line_prev} = $self->{line};
4939          $self->{column_prev} = $self->{column};
4940          $self->{column}++;
4941          $self->{nc}
4942              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4943        } else {
4944          $self->{set_nc}->($self);
4945        }
4946      
4947            redo A;
4948          } elsif ($is_space->{$self->{nc}}) {
4949            ## Stay in the state.
4950            
4951        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4952          $self->{line_prev} = $self->{line};
4953          $self->{column_prev} = $self->{column};
4954          $self->{column}++;
4955          $self->{nc}
4956              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4957        } else {
4958          $self->{set_nc}->($self);
4959        }
4960      
4961            redo A;
4962          } elsif ($self->{nc} == -1) {
4963            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
4964            $self->{state} = DATA_STATE;
4965            $self->{s_kwd} = '';
4966            ## Reconsume.
4967            return  ($self->{ct}); # DOCTYPE
4968            redo A;
4969          } else {
4970            unless ($self->{internal_subset_tainted}) {
4971              ## XML5: No parse error.
4972              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
4973              $self->{internal_subset_tainted} = 1;
4974            }
4975            ## Stay in the state.
4976            
4977        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4978          $self->{line_prev} = $self->{line};
4979          $self->{column_prev} = $self->{column};
4980          $self->{column}++;
4981          $self->{nc}
4982              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4983        } else {
4984          $self->{set_nc}->($self);
4985        }
4986      
4987            redo A;
4988          }
4989        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
4990          if ($self->{nc} == 0x003E) { # >
4991            $self->{state} = DATA_STATE;
4992            $self->{s_kwd} = '';
4993            
4994        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4995          $self->{line_prev} = $self->{line};
4996          $self->{column_prev} = $self->{column};
4997          $self->{column}++;
4998          $self->{nc}
4999              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5000        } else {
5001          $self->{set_nc}->($self);
5002        }
5003      
5004            return  ($self->{ct}); # DOCTYPE
5005            redo A;
5006          } elsif ($self->{nc} == -1) {
5007            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5008            $self->{state} = DATA_STATE;
5009            $self->{s_kwd} = '';
5010            ## Reconsume.
5011            return  ($self->{ct}); # DOCTYPE
5012            redo A;
5013          } else {
5014            ## XML5: No parse error and stay in the state.
5015            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5016    
5017            $self->{state} = BOGUS_DOCTYPE_STATE;
5018            
5019        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5020          $self->{line_prev} = $self->{line};
5021          $self->{column_prev} = $self->{column};
5022          $self->{column}++;
5023          $self->{nc}
5024              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5025        } else {
5026          $self->{set_nc}->($self);
5027        }
5028      
5029            redo A;
5030          }
5031            
5032      } else {      } else {
5033        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
5034      }      }

Legend:
Removed from v.1.5  
changed lines
  Added in v.1.12

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24