/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.4 by wakaba, Tue Oct 14 11:46:57 2008 UTC revision 1.17 by wakaba, Sun Oct 19 04:39:25 2008 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24    );    );
25        
26    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 33  BEGIN {
33        CHARACTER_TOKEN        CHARACTER_TOKEN
34        PI_TOKEN        PI_TOKEN
35        ABORT_TOKEN        ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42      )],      )],
43    );    );
44  }  }
45    
46    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48  ## Token types  ## Token types
49    
50  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
52  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
53  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
54  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
55  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
56  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
57  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65    ## XML5: XML5 has "empty tag token".  In this implementation, it is
66    ## represented as a start tag token with $self->{self_closing} flag
67    ## set to true.
68    
69    ## XML5: XML5 has "short end tag token".  In this implementation, it
70    ## is represented as an end tag token with $token->{tag_name} flag set
71    ## to an empty string.
72    
73  package Whatpm::HTML;  package Whatpm::HTML;
74    
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 142  sub HEXREF_HEX_STATE () { 48 }
142  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
143  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145    ## XML-only states
146    sub PI_STATE () { 51 }
147    sub PI_TARGET_STATE () { 52 }
148    sub PI_TARGET_AFTER_STATE () { 53 }
149    sub PI_DATA_STATE () { 54 }
150    sub PI_AFTER_STATE () { 55 }
151    sub PI_DATA_AFTER_STATE () { 56 }
152    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155    sub DOCTYPE_TAG_STATE () { 60 }
156    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157    sub MD_ATTLIST_STATE () { 62 }
158    sub MD_E_STATE () { 63 }
159    sub MD_ELEMENT_STATE () { 64 }
160    sub MD_ENTITY_STATE () { 65 }
161    sub MD_NOTATION_STATE () { 66 }
162    sub DOCTYPE_MD_STATE () { 67 }
163    sub BEFORE_MD_NAME_STATE () { 68 }
164    sub MD_NAME_STATE () { 69 }
165    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171    sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172    sub ALLOWED_TOKEN_STATE () { 77 }
173    sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174    sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175    sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179    sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180    sub BOGUS_MD_STATE () { 85 }
181    
182  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
183  ## list and descriptions)  ## list and descriptions)
184    
# Line 178  sub _initialize_tokenizer ($) { Line 243  sub _initialize_tokenizer ($) {
243    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
244    
245    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
246    #$self->{s_kwd}; # state keyword - initialized when used    $self->{s_kwd} = ''; # Data state keyword
247      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
248    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
249    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
250    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 208  sub _initialize_tokenizer ($) { Line 274  sub _initialize_tokenizer ($) {
274    
275  ## A token has:  ## A token has:
276  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
277  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
278  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
279  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
280    ##   ->{target} (PI_TOKEN)
281  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
282  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
283  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 218  sub _initialize_tokenizer ($) { Line 285  sub _initialize_tokenizer ($) {
285  ##        ->{name}  ##        ->{name}
286  ##        ->{value}  ##        ->{value}
287  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
288  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
289    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
290    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
291    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
292    ##   ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
293    
294  ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
295  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
296  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 238  my $is_space = { Line 310  my $is_space = {
310    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
311    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
312    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
313    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
314    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
315    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
316  };  };
# Line 362  sub _get_next_token ($) { Line 434  sub _get_next_token ($) {
434          }          }
435        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
436          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
437            $self->{s_kwd} .= '-';            if ($self->{s_kwd} eq '<!-') {
             
           if ($self->{s_kwd} eq '<!--') {  
438                            
439              $self->{escape} = 1; # unless $self->{escape};              $self->{escape} = 1; # unless $self->{escape};
440              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
441              #              #
442            } elsif ($self->{s_kwd} eq '---') {            } elsif ($self->{s_kwd} eq '-') {
443                            
444              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
445              #              #
446              } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
447                
448                $self->{s_kwd} .= '-';
449                #
450            } else {            } else {
451                            
452                $self->{s_kwd} = '-';
453              #              #
454            }            }
455          }          }
# Line 420  sub _get_next_token ($) { Line 495  sub _get_next_token ($) {
495            if ($self->{s_kwd} eq '--') {            if ($self->{s_kwd} eq '--') {
496                            
497              delete $self->{escape};              delete $self->{escape};
498                #
499            } else {            } else {
500                            
501                #
502            }            }
503            } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
504              
505              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
506                              line => $self->{line_prev},
507                              column => $self->{column_prev} - 1);
508              #
509          } else {          } else {
510                        
511              #
512          }          }
513                    
514          $self->{s_kwd} = '';          $self->{s_kwd} = '';
515          #          #
516          } elsif ($self->{nc} == 0x005D) { # ]
517            if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
518              
519              $self->{s_kwd} .= ']';
520            } elsif ($self->{s_kwd} eq ']]') {
521              
522              #
523            } else {
524              
525              $self->{s_kwd} = '';
526            }
527            #
528        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
529                    
530          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 446  sub _get_next_token ($) { Line 542  sub _get_next_token ($) {
542                     data => chr $self->{nc},                     data => chr $self->{nc},
543                     line => $self->{line}, column => $self->{column},                     line => $self->{line}, column => $self->{column},
544                    };                    };
545        if ($self->{read_until}->($token->{data}, q[-!<>&],        if ($self->{read_until}->($token->{data}, q{-!<>&\]},
546                                  length $token->{data})) {                                  length $token->{data})) {
547          $self->{s_kwd} = '';          $self->{s_kwd} = '';
548        }        }
549    
550        ## Stay in the data state.        ## Stay in the data state.
551        if ($self->{content_model} == PCDATA_CONTENT_MODEL) {        if (not $self->{is_xml} and
552              $self->{content_model} == PCDATA_CONTENT_MODEL) {
553                    
554          $self->{state} = PCDATA_STATE;          $self->{state} = PCDATA_STATE;
555        } else {        } else {
# Line 473  sub _get_next_token ($) { Line 570  sub _get_next_token ($) {
570        return  ($token);        return  ($token);
571        redo A;        redo A;
572      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
573          ## XML5: "tag state".
574    
575        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
576          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
577                        
# Line 491  sub _get_next_token ($) { Line 590  sub _get_next_token ($) {
590            redo A;            redo A;
591          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
592                        
593            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
594            #            #
595          } else {          } else {
596                        
597              $self->{s_kwd} = '';
598            #            #
599          }          }
600    
# Line 583  sub _get_next_token ($) { Line 683  sub _get_next_token ($) {
683                            line => $self->{line_prev},                            line => $self->{line_prev},
684                            column => $self->{column_prev});                            column => $self->{column_prev});
685            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
686              $self->{s_kwd} = '';
687                        
688      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
689        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 602  sub _get_next_token ($) { Line 703  sub _get_next_token ($) {
703    
704            redo A;            redo A;
705          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
706                        if ($self->{is_xml}) {
707            $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',              
708                            line => $self->{line_prev},              $self->{state} = PI_STATE;
709                            column => $self->{column_prev});              
710            $self->{state} = BOGUS_COMMENT_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
711            $self->{ct} = {type => COMMENT_TOKEN, data => '',        $self->{line_prev} = $self->{line};
712                                      line => $self->{line_prev},        $self->{column_prev} = $self->{column};
713                                      column => $self->{column_prev},        $self->{column}++;
714                                     };        $self->{nc}
715            ## $self->{nc} is intentionally left as is            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
716            redo A;      } else {
717          } else {        $self->{set_nc}->($self);
718        }
719      
720                redo A;
721              } else {
722                
723                $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
724                                line => $self->{line_prev},
725                                column => $self->{column_prev});
726                $self->{state} = BOGUS_COMMENT_STATE;
727                $self->{ct} = {type => COMMENT_TOKEN, data => '',
728                               line => $self->{line_prev},
729                               column => $self->{column_prev},
730                              };
731                ## $self->{nc} is intentionally left as is
732                redo A;
733              }
734            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
735                        
736            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
737                            line => $self->{line_prev},                            line => $self->{line_prev},
738                            column => $self->{column_prev});                            column => $self->{column_prev});
739            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
740              $self->{s_kwd} = '';
741            ## reconsume            ## reconsume
742    
743            return  ({type => CHARACTER_TOKEN, data => '<',            return  ({type => CHARACTER_TOKEN, data => '<',
# Line 627  sub _get_next_token ($) { Line 746  sub _get_next_token ($) {
746                     });                     });
747    
748            redo A;            redo A;
749            } else {
750              ## XML5: "<:" is a parse error.
751              
752              $self->{ct} = {type => START_TAG_TOKEN,
753                                        tag_name => chr ($self->{nc}),
754                                        line => $self->{line_prev},
755                                        column => $self->{column_prev}};
756              $self->{state} = TAG_NAME_STATE;
757              
758        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
759          $self->{line_prev} = $self->{line};
760          $self->{column_prev} = $self->{column};
761          $self->{column}++;
762          $self->{nc}
763              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
764        } else {
765          $self->{set_nc}->($self);
766        }
767      
768              redo A;
769          }          }
770        } else {        } else {
771          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 635  sub _get_next_token ($) { Line 774  sub _get_next_token ($) {
774        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
775        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
776    
777          ## XML5: "end tag state".
778    
779        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
780        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
781          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
782            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
783            $self->{s_kwd} = '';            $self->{kwd} = '';
784            ## Reconsume.            ## Reconsume.
785            redo A;            redo A;
786          } else {          } else {
# Line 647  sub _get_next_token ($) { Line 788  sub _get_next_token ($) {
788            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
789                        
790            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
791              $self->{s_kwd} = '';
792            ## Reconsume.            ## Reconsume.
793            return  ({type => CHARACTER_TOKEN, data => '</',            return  ({type => CHARACTER_TOKEN, data => '</',
794                      line => $l, column => $c,                      line => $l, column => $c,
# Line 695  sub _get_next_token ($) { Line 837  sub _get_next_token ($) {
837        
838          redo A;          redo A;
839        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
840          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
841                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
842                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
843          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
844                    $self->{s_kwd} = '';
845            if ($self->{is_xml}) {
846              
847              ## XML5: No parse error.
848              
849              ## NOTE: This parser raises a parse error, since it supports
850              ## XML1, not XML5.
851    
852              ## NOTE: A short end tag token.
853              my $ct = {type => END_TAG_TOKEN,
854                        tag_name => '',
855                        line => $self->{line_prev},
856                        column => $self->{column_prev} - 1,
857                       };
858              
859      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
860        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
861        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 711  sub _get_next_token ($) { Line 866  sub _get_next_token ($) {
866        $self->{set_nc}->($self);        $self->{set_nc}->($self);
867      }      }
868        
869              return  ($ct);
870            } else {
871              
872              
873        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
874          $self->{line_prev} = $self->{line};
875          $self->{column_prev} = $self->{column};
876          $self->{column}++;
877          $self->{nc}
878              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
879        } else {
880          $self->{set_nc}->($self);
881        }
882      
883            }
884          redo A;          redo A;
885        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
886                    
887          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
888            $self->{s_kwd} = '';
889          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
890          # reconsume          # reconsume
891    
# Line 723  sub _get_next_token ($) { Line 894  sub _get_next_token ($) {
894                   });                   });
895    
896          redo A;          redo A;
897        } else {        } elsif (not $self->{is_xml} or
898                   $is_space->{$self->{nc}}) {
899                    
900          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
901                            line => $self->{line_prev}, # "<" of "</"
902                            column => $self->{column_prev} - 1);
903          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
904          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
905                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 738  sub _get_next_token ($) { Line 912  sub _get_next_token ($) {
912          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
913          ## "bogus comment state" entry.          ## "bogus comment state" entry.
914          redo A;          redo A;
915          } else {
916            ## XML5: "</:" is a parse error.
917            
918            $self->{ct} = {type => END_TAG_TOKEN,
919                           tag_name => chr ($self->{nc}),
920                           line => $l, column => $c};
921            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
922            
923        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
924          $self->{line_prev} = $self->{line};
925          $self->{column_prev} = $self->{column};
926          $self->{column}++;
927          $self->{nc}
928              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
929        } else {
930          $self->{set_nc}->($self);
931        }
932      
933            redo A;
934        }        }
935      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
936        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
937        if (length $ch) {        if (length $ch) {
938          my $CH = $ch;          my $CH = $ch;
939          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 748  sub _get_next_token ($) { Line 941  sub _get_next_token ($) {
941          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
942                        
943            ## Stay in the state.            ## Stay in the state.
944            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
945                        
946      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
947        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 764  sub _get_next_token ($) { Line 957  sub _get_next_token ($) {
957          } else {          } else {
958                        
959            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
960              $self->{s_kwd} = '';
961            ## Reconsume.            ## Reconsume.
962            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
963                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
964                      line => $self->{line_prev},                      line => $self->{line_prev},
965                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
966                     });                     });
967            redo A;            redo A;
968          }          }
# Line 782  sub _get_next_token ($) { Line 976  sub _get_next_token ($) {
976                        
977            ## Reconsume.            ## Reconsume.
978            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
979              $self->{s_kwd} = '';
980            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
981                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
982                      line => $self->{line_prev},                      line => $self->{line_prev},
983                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
984                     });                     });
985            redo A;            redo A;
986          } else {          } else {
# Line 794  sub _get_next_token ($) { Line 989  sub _get_next_token ($) {
989                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
990                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
991                   line => $self->{line_prev},                   line => $self->{line_prev},
992                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
993            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
994            ## Reconsume.            ## Reconsume.
995            redo A;            redo A;
# Line 833  sub _get_next_token ($) { Line 1028  sub _get_next_token ($) {
1028            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1029          }          }
1030          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1031            $self->{s_kwd} = '';
1032                    
1033      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1034        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 885  sub _get_next_token ($) { Line 1081  sub _get_next_token ($) {
1081            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1082          }          }
1083          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1084            $self->{s_kwd} = '';
1085          # reconsume          # reconsume
1086    
1087          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 924  sub _get_next_token ($) { Line 1121  sub _get_next_token ($) {
1121          redo A;          redo A;
1122        }        }
1123      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1124          ## XML5: "Tag attribute name before state".
1125    
1126        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1127                    
1128          ## Stay in the state          ## Stay in the state
# Line 955  sub _get_next_token ($) { Line 1154  sub _get_next_token ($) {
1154            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1155          }          }
1156          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1157            $self->{s_kwd} = '';
1158                    
1159      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1160        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1022  sub _get_next_token ($) { Line 1222  sub _get_next_token ($) {
1222            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1223          }          }
1224          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1225            $self->{s_kwd} = '';
1226          # reconsume          # reconsume
1227    
1228          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1034  sub _get_next_token ($) { Line 1235  sub _get_next_token ($) {
1235               0x003D => 1, # =               0x003D => 1, # =
1236              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1237                        
1238              ## XML5: Not a parse error.
1239            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1240          } else {          } else {
1241                        
1242              ## XML5: ":" raises a parse error and is ignored.
1243          }          }
1244          $self->{ca}          $self->{ca}
1245              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 1057  sub _get_next_token ($) { Line 1260  sub _get_next_token ($) {
1260          redo A;          redo A;
1261        }        }
1262      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1263          ## XML5: "Tag attribute name state".
1264    
1265        my $before_leave = sub {        my $before_leave = sub {
1266          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
1267              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 1067  sub _get_next_token ($) { Line 1272  sub _get_next_token ($) {
1272                        
1273            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
1274              = $self->{ca};              = $self->{ca};
1275              $self->{ca}->{index} = ++$self->{ct}->{last_index};
1276          }          }
1277        }; # $before_leave        }; # $before_leave
1278    
# Line 1103  sub _get_next_token ($) { Line 1309  sub _get_next_token ($) {
1309        
1310          redo A;          redo A;
1311        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1312            if ($self->{is_xml}) {
1313              
1314              ## XML5: Not a parse error.
1315              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1316            } else {
1317              
1318            }
1319    
1320          $before_leave->();          $before_leave->();
1321          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1322                        
# Line 1117  sub _get_next_token ($) { Line 1331  sub _get_next_token ($) {
1331            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1332          }          }
1333          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1334            $self->{s_kwd} = '';
1335                    
1336      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1337        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1151  sub _get_next_token ($) { Line 1366  sub _get_next_token ($) {
1366        
1367          redo A;          redo A;
1368        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1369            if ($self->{is_xml}) {
1370              
1371              ## XML5: Not a parse error.
1372              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1373            } else {
1374              
1375            }
1376                    
1377          $before_leave->();          $before_leave->();
1378          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
# Line 1185  sub _get_next_token ($) { Line 1407  sub _get_next_token ($) {
1407            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1408          }          }
1409          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1410            $self->{s_kwd} = '';
1411          # reconsume          # reconsume
1412    
1413          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1194  sub _get_next_token ($) { Line 1417  sub _get_next_token ($) {
1417          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1418              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1419                        
1420              ## XML5: Not a parse error.
1421            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1422          } else {          } else {
1423                        
# Line 1214  sub _get_next_token ($) { Line 1438  sub _get_next_token ($) {
1438          redo A;          redo A;
1439        }        }
1440      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1441          ## XML5: "Tag attribute name after state".
1442          
1443        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1444                    
1445          ## Stay in the state          ## Stay in the state
# Line 1245  sub _get_next_token ($) { Line 1471  sub _get_next_token ($) {
1471        
1472          redo A;          redo A;
1473        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1474            if ($self->{is_xml}) {
1475              
1476              ## XML5: Not a parse error.
1477              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1478            } else {
1479              
1480            }
1481    
1482          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1483                        
1484            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1261  sub _get_next_token ($) { Line 1495  sub _get_next_token ($) {
1495            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1496          }          }
1497          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1498            $self->{s_kwd} = '';
1499                    
1500      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1501        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1297  sub _get_next_token ($) { Line 1532  sub _get_next_token ($) {
1532        
1533          redo A;          redo A;
1534        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1535            if ($self->{is_xml}) {
1536              
1537              ## XML5: Not a parse error.
1538              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1539            } else {
1540              
1541            }
1542                    
1543          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1544                    
# Line 1328  sub _get_next_token ($) { Line 1570  sub _get_next_token ($) {
1570          } else {          } else {
1571            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1572          }          }
1573            $self->{s_kwd} = '';
1574          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1575          # reconsume          # reconsume
1576    
# Line 1335  sub _get_next_token ($) { Line 1578  sub _get_next_token ($) {
1578    
1579          redo A;          redo A;
1580        } else {        } else {
1581            if ($self->{is_xml}) {
1582              
1583              ## XML5: Not a parse error.
1584              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1585            } else {
1586              
1587            }
1588    
1589          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1590              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1591                        
1592              ## XML5: Not a parse error.
1593            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1594          } else {          } else {
1595                        
# Line 1361  sub _get_next_token ($) { Line 1613  sub _get_next_token ($) {
1613          redo A;                  redo A;        
1614        }        }
1615      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1616          ## XML5: "Tag attribute value before state".
1617    
1618        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1619                    
1620          ## Stay in the state          ## Stay in the state
# Line 1429  sub _get_next_token ($) { Line 1683  sub _get_next_token ($) {
1683            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1684          }          }
1685          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1686            $self->{s_kwd} = '';
1687                    
1688      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1689        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1462  sub _get_next_token ($) { Line 1717  sub _get_next_token ($) {
1717            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1718          }          }
1719          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1720            $self->{s_kwd} = '';
1721          ## reconsume          ## reconsume
1722    
1723          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1470  sub _get_next_token ($) { Line 1726  sub _get_next_token ($) {
1726        } else {        } else {
1727          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1728                        
1729              ## XML5: Not a parse error.
1730            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1731            } elsif ($self->{is_xml}) {
1732              
1733              ## XML5: No parse error.
1734              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1735          } else {          } else {
1736                        
1737          }          }
# Line 1490  sub _get_next_token ($) { Line 1751  sub _get_next_token ($) {
1751          redo A;          redo A;
1752        }        }
1753      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1754          ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1755          ## ATTLIST attribute value double quoted state".
1756          
1757        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1758                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1759          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1760              ## XML5: "DOCTYPE ATTLIST name after state".
1761              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1762              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1763            } else {
1764              
1765              ## XML5: "Tag attribute name before state".
1766              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1767            }
1768                    
1769      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1770        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1507  sub _get_next_token ($) { Line 1779  sub _get_next_token ($) {
1779          redo A;          redo A;
1780        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1781                    
1782            ## XML5: Not defined yet.
1783    
1784          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1785          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1786          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1531  sub _get_next_token ($) { Line 1805  sub _get_next_token ($) {
1805          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1806                        
1807            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1808    
1809              $self->{state} = DATA_STATE;
1810              $self->{s_kwd} = '';
1811              ## reconsume
1812              return  ($self->{ct}); # start tag
1813              redo A;
1814          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1815            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1816            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1540  sub _get_next_token ($) { Line 1820  sub _get_next_token ($) {
1820              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1821                            
1822            }            }
1823    
1824              $self->{state} = DATA_STATE;
1825              $self->{s_kwd} = '';
1826              ## reconsume
1827              return  ($self->{ct}); # end tag
1828              redo A;
1829            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1830              ## XML5: No parse error above; not defined yet.
1831              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1832              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1833              ## Reconsume.
1834              return  ($self->{ct}); # ATTLIST
1835              redo A;
1836          } else {          } else {
1837            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1838          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
1839        } else {        } else {
1840                    ## XML5 [ATTLIST]: Not defined yet.
1841            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1842              
1843              ## XML5: Not a parse error.
1844              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1845            } else {
1846              
1847            }
1848          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1849          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1850                                q["&],                                q["&<],
1851                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1852    
1853          ## Stay in the state          ## Stay in the state
# Line 1571  sub _get_next_token ($) { Line 1865  sub _get_next_token ($) {
1865          redo A;          redo A;
1866        }        }
1867      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1868          ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1869          ## ATTLIST attribute value single quoted state".
1870    
1871        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1872                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1873          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1874              ## XML5: "DOCTYPE ATTLIST name after state".
1875              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1876              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1877            } else {
1878              
1879              ## XML5: "Before attribute name state" (sic).
1880              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1881            }
1882                    
1883      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1884        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1588  sub _get_next_token ($) { Line 1893  sub _get_next_token ($) {
1893          redo A;          redo A;
1894        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1895                    
1896            ## XML5: Not defined yet.
1897    
1898          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1899          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1900          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1612  sub _get_next_token ($) { Line 1919  sub _get_next_token ($) {
1919          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1920                        
1921            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1922    
1923              $self->{state} = DATA_STATE;
1924              $self->{s_kwd} = '';
1925              ## reconsume
1926              return  ($self->{ct}); # start tag
1927              redo A;
1928          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1929            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1930            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1621  sub _get_next_token ($) { Line 1934  sub _get_next_token ($) {
1934              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1935                            
1936            }            }
1937    
1938              $self->{state} = DATA_STATE;
1939              $self->{s_kwd} = '';
1940              ## reconsume
1941              return  ($self->{ct}); # end tag
1942              redo A;
1943            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1944              ## XML5: No parse error above; not defined yet.
1945              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1946              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1947              ## Reconsume.
1948              return  ($self->{ct}); # ATTLIST
1949              redo A;
1950          } else {          } else {
1951            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1952          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
1953        } else {        } else {
1954                    ## XML5 [ATTLIST]: Not defined yet.
1955            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1956              
1957              ## XML5: Not a parse error.
1958              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1959            } else {
1960              
1961            }
1962          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1963          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1964                                q['&],                                q['&<],
1965                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1966    
1967          ## Stay in the state          ## Stay in the state
# Line 1652  sub _get_next_token ($) { Line 1979  sub _get_next_token ($) {
1979          redo A;          redo A;
1980        }        }
1981      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1982          ## XML5: "Tag attribute value unquoted state".
1983    
1984        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1985                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1986          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;            
1987              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1988              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1989            } else {
1990              
1991              ## XML5: "Tag attribute name before state".
1992              $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1993            }
1994                    
1995      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1996        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1669  sub _get_next_token ($) { Line 2005  sub _get_next_token ($) {
2005          redo A;          redo A;
2006        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
2007                    
2008    
2009            ## XML5: Not defined yet.
2010    
2011          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
2012          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
2013          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1692  sub _get_next_token ($) { Line 2031  sub _get_next_token ($) {
2031          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2032                        
2033            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2034    
2035              $self->{state} = DATA_STATE;
2036              $self->{s_kwd} = '';
2037              
2038        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2039          $self->{line_prev} = $self->{line};
2040          $self->{column_prev} = $self->{column};
2041          $self->{column}++;
2042          $self->{nc}
2043              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2044        } else {
2045          $self->{set_nc}->($self);
2046        }
2047      
2048              return  ($self->{ct}); # start tag
2049              redo A;
2050          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2051            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2052            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1701  sub _get_next_token ($) { Line 2056  sub _get_next_token ($) {
2056              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2057                            
2058            }            }
2059          } else {  
2060            die "$0: $self->{ct}->{type}: Unknown token type";            $self->{state} = DATA_STATE;
2061          }            $self->{s_kwd} = '';
2062          $self->{state} = DATA_STATE;            
           
2063      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2064        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
2065        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 1716  sub _get_next_token ($) { Line 2070  sub _get_next_token ($) {
2070        $self->{set_nc}->($self);        $self->{set_nc}->($self);
2071      }      }
2072        
2073              return  ($self->{ct}); # end tag
2074          return  ($self->{ct}); # start tag or end tag            redo A;
2075            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2076          redo A;            push @{$self->{ct}->{attrdefs}}, $self->{ca};
2077              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2078              
2079        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2080          $self->{line_prev} = $self->{line};
2081          $self->{column_prev} = $self->{column};
2082          $self->{column}++;
2083          $self->{nc}
2084              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2085        } else {
2086          $self->{set_nc}->($self);
2087        }
2088      
2089              return  ($self->{ct}); # ATTLIST
2090              redo A;
2091            } else {
2092              die "$0: $self->{ct}->{type}: Unknown token type";
2093            }
2094        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');  
2095          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2096                        
2097              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2098            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2099    
2100              $self->{state} = DATA_STATE;
2101              $self->{s_kwd} = '';
2102              ## reconsume
2103              return  ($self->{ct}); # start tag
2104              redo A;
2105          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2106              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2107            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2108            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
2109                            
# Line 1734  sub _get_next_token ($) { Line 2112  sub _get_next_token ($) {
2112              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2113                            
2114            }            }
2115    
2116              $self->{state} = DATA_STATE;
2117              $self->{s_kwd} = '';
2118              ## reconsume
2119              return  ($self->{ct}); # end tag
2120              redo A;
2121            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2122              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2123              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2124              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2125              ## Reconsume.
2126              return  ($self->{ct}); # ATTLIST
2127              redo A;
2128          } else {          } else {
2129            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2130          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
2131        } else {        } else {
2132          if ({          if ({
2133               0x0022 => 1, # "               0x0022 => 1, # "
# Line 1750  sub _get_next_token ($) { Line 2135  sub _get_next_token ($) {
2135               0x003D => 1, # =               0x003D => 1, # =
2136              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2137                        
2138              ## XML5: Not a parse error.
2139            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2140          } else {          } else {
2141                        
# Line 1806  sub _get_next_token ($) { Line 2192  sub _get_next_token ($) {
2192            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2193          }          }
2194          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2195            $self->{s_kwd} = '';
2196                    
2197      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2198        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1853  sub _get_next_token ($) { Line 2240  sub _get_next_token ($) {
2240            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2241          }          }
2242          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2243            $self->{s_kwd} = '';
2244          ## Reconsume.          ## Reconsume.
2245          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
2246          redo A;          redo A;
# Line 1864  sub _get_next_token ($) { Line 2252  sub _get_next_token ($) {
2252          redo A;          redo A;
2253        }        }
2254      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2255          ## XML5: "Empty tag state".
2256    
2257        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2258          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
2259                        
# Line 1883  sub _get_next_token ($) { Line 2273  sub _get_next_token ($) {
2273          }          }
2274    
2275          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2276            $self->{s_kwd} = '';
2277                    
2278      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2279        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1914  sub _get_next_token ($) { Line 2305  sub _get_next_token ($) {
2305          } else {          } else {
2306            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2307          }          }
2308            ## XML5: "Tag attribute name before state".
2309          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2310            $self->{s_kwd} = '';
2311          ## Reconsume.          ## Reconsume.
2312          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
2313          redo A;          redo A;
# Line 1927  sub _get_next_token ($) { Line 2320  sub _get_next_token ($) {
2320          redo A;          redo A;
2321        }        }
2322      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2323        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2324    
2325        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
2326        ## consumes characters on a one-by-one basis.        ## consumes characters on a one-by-one basis.
2327                
2328        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2329                    if ($self->{in_subset}) {
2330          $self->{state} = DATA_STATE;            
2331              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2332            } else {
2333              
2334              $self->{state} = DATA_STATE;
2335              $self->{s_kwd} = '';
2336            }
2337                    
2338      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2339        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1950  sub _get_next_token ($) { Line 2349  sub _get_next_token ($) {
2349          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
2350          redo A;          redo A;
2351        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2352                    if ($self->{in_subset}) {
2353          $self->{state} = DATA_STATE;            
2354              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2355            } else {
2356              
2357              $self->{state} = DATA_STATE;
2358              $self->{s_kwd} = '';
2359            }
2360          ## reconsume          ## reconsume
2361    
2362          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 1978  sub _get_next_token ($) { Line 2383  sub _get_next_token ($) {
2383          redo A;          redo A;
2384        }        }
2385      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2386        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
2387                
2388        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2389                    
# Line 2000  sub _get_next_token ($) { Line 2405  sub _get_next_token ($) {
2405          ## ASCII case-insensitive.          ## ASCII case-insensitive.
2406                    
2407          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
2408          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2409                    
2410      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2411        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2019  sub _get_next_token ($) { Line 2424  sub _get_next_token ($) {
2424                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2425                                                    
2426          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
2427          $self->{s_kwd} = '[';          $self->{kwd} = '[';
2428                    
2429      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2430        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2053  sub _get_next_token ($) { Line 2458  sub _get_next_token ($) {
2458                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2459                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
2460                                   };                                   };
2461          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2462                    
2463      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2464        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2089  sub _get_next_token ($) { Line 2494  sub _get_next_token ($) {
2494              0x0054, # T              0x0054, # T
2495              0x0059, # Y              0x0059, # Y
2496              0x0050, # P              0x0050, # P
2497            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2498            $self->{nc} == [            $self->{nc} == [
2499              undef,              undef,
2500              0x006F, # o              0x006F, # o
# Line 2097  sub _get_next_token ($) { Line 2502  sub _get_next_token ($) {
2502              0x0074, # t              0x0074, # t
2503              0x0079, # y              0x0079, # y
2504              0x0070, # p              0x0070, # p
2505            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2506                    
2507          ## Stay in the state.          ## Stay in the state.
2508          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2509                    
2510      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2511        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2113  sub _get_next_token ($) { Line 2518  sub _get_next_token ($) {
2518      }      }
2519        
2520          redo A;          redo A;
2521        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
2522                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2523                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2524                    if ($self->{is_xml} and
2525                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2526              
2527              ## XML5: case-sensitive.
2528              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2529                              text => 'DOCTYPE',
2530                              line => $self->{line_prev},
2531                              column => $self->{column_prev} - 5);
2532            } else {
2533              
2534            }
2535          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
2536          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
2537                                    quirks => 1,                                    quirks => 1,
# Line 2139  sub _get_next_token ($) { Line 2554  sub _get_next_token ($) {
2554                                    
2555          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2556                          line => $self->{line_prev},                          line => $self->{line_prev},
2557                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2558          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2559          ## Reconsume.          ## Reconsume.
2560          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2561                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2562                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2563                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2564                                   };                                   };
2565          redo A;          redo A;
2566        }        }
# Line 2156  sub _get_next_token ($) { Line 2571  sub _get_next_token ($) {
2571              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
2572              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
2573              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
2574            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
2575                    
2576          ## Stay in the state.          ## Stay in the state.
2577          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2578                    
2579      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2580        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2172  sub _get_next_token ($) { Line 2587  sub _get_next_token ($) {
2587      }      }
2588        
2589          redo A;          redo A;
2590        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
2591                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2592                    if ($self->{is_xml} and
2593                not $self->{tainted} and
2594                @{$self->{open_elements} or []} == 0) {
2595              
2596              $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2597                              line => $self->{line_prev},
2598                              column => $self->{column_prev} - 7);
2599              $self->{tainted} = 1;
2600            } else {
2601              
2602            }
2603    
2604          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
2605                                    data => '',                                    data => '',
2606                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 2196  sub _get_next_token ($) { Line 2622  sub _get_next_token ($) {
2622                    
2623          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2624                          line => $self->{line_prev},                          line => $self->{line_prev},
2625                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2626          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2627          ## Reconsume.          ## Reconsume.
2628          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2629                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2630                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2631                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2632                                   };                                   };
2633          redo A;          redo A;
2634        }        }
# Line 2223  sub _get_next_token ($) { Line 2649  sub _get_next_token ($) {
2649        
2650          redo A;          redo A;
2651        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2652          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2653          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2654              
2655              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2656            } else {
2657              
2658              $self->{state} = DATA_STATE;
2659              $self->{s_kwd} = '';
2660            }
2661                    
2662      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2663        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2242  sub _get_next_token ($) { Line 2674  sub _get_next_token ($) {
2674    
2675          redo A;          redo A;
2676        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2677          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2678          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2679              
2680              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2681            } else {
2682              
2683              $self->{state} = DATA_STATE;
2684              $self->{s_kwd} = '';
2685            }
2686          ## reconsume          ## reconsume
2687    
2688          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2285  sub _get_next_token ($) { Line 2723  sub _get_next_token ($) {
2723        
2724          redo A;          redo A;
2725        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2726          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2727          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2728              
2729              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2730            } else {
2731              
2732              $self->{state} = DATA_STATE;
2733              $self->{s_kwd} = '';
2734            }
2735                    
2736      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2737        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2304  sub _get_next_token ($) { Line 2748  sub _get_next_token ($) {
2748    
2749          redo A;          redo A;
2750        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2751          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2752          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2753              
2754              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2755            } else {
2756              
2757              $self->{state} = DATA_STATE;
2758              $self->{s_kwd} = '';
2759            }
2760          ## reconsume          ## reconsume
2761    
2762          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2331  sub _get_next_token ($) { Line 2781  sub _get_next_token ($) {
2781          redo A;          redo A;
2782        }        }
2783      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
2784          ## XML5: "Comment state" and "DOCTYPE comment state".
2785    
2786        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2787                    
2788          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
# Line 2347  sub _get_next_token ($) { Line 2799  sub _get_next_token ($) {
2799        
2800          redo A;          redo A;
2801        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2802          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2803          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2804              
2805              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2806            } else {
2807              
2808              $self->{state} = DATA_STATE;
2809              $self->{s_kwd} = '';
2810            }
2811          ## reconsume          ## reconsume
2812    
2813          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2377  sub _get_next_token ($) { Line 2835  sub _get_next_token ($) {
2835          redo A;          redo A;
2836        }        }
2837      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2838          ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2839    
2840        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2841                    
2842          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 2393  sub _get_next_token ($) { Line 2853  sub _get_next_token ($) {
2853        
2854          redo A;          redo A;
2855        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2856          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2857          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2858              
2859              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2860            } else {
2861              
2862              $self->{state} = DATA_STATE;
2863              $self->{s_kwd} = '';
2864            }
2865          ## reconsume          ## reconsume
2866    
2867          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2419  sub _get_next_token ($) { Line 2885  sub _get_next_token ($) {
2885          redo A;          redo A;
2886        }        }
2887      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
2888          ## XML5: "Comment end state" and "DOCTYPE comment end state".
2889    
2890        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2891                    if ($self->{in_subset}) {
2892          $self->{state} = DATA_STATE;            
2893              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2894            } else {
2895              
2896              $self->{state} = DATA_STATE;
2897              $self->{s_kwd} = '';
2898            }
2899                    
2900      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2901        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2439  sub _get_next_token ($) { Line 2913  sub _get_next_token ($) {
2913          redo A;          redo A;
2914        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2915                    
2916            ## XML5: Not a parse error.
2917          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2918                          line => $self->{line_prev},                          line => $self->{line_prev},
2919                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2457  sub _get_next_token ($) { Line 2932  sub _get_next_token ($) {
2932        
2933          redo A;          redo A;
2934        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2935          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2936          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2937              
2938              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2939            } else {
2940              
2941              $self->{state} = DATA_STATE;
2942              $self->{s_kwd} = '';
2943            }
2944          ## reconsume          ## reconsume
2945    
2946          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2467  sub _get_next_token ($) { Line 2948  sub _get_next_token ($) {
2948          redo A;          redo A;
2949        } else {        } else {
2950                    
2951            ## XML5: Not a parse error.
2952          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2953                          line => $self->{line_prev},                          line => $self->{line_prev},
2954                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2503  sub _get_next_token ($) { Line 2985  sub _get_next_token ($) {
2985          redo A;          redo A;
2986        } else {        } else {
2987                    
2988            ## XML5: Unless EOF, switch to the bogus comment state.
2989          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2990          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2991          ## reconsume          ## reconsume
2992          redo A;          redo A;
2993        }        }
2994      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2995          ## XML5: "DOCTYPE root name before state".
2996    
2997        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2998                    
2999          ## Stay in the state          ## Stay in the state
# Line 2526  sub _get_next_token ($) { Line 3011  sub _get_next_token ($) {
3011          redo A;          redo A;
3012        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3013                    
3014            ## XML5: No parse error.
3015          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3016          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3017            $self->{s_kwd} = '';
3018                    
3019      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3020        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2547  sub _get_next_token ($) { Line 3034  sub _get_next_token ($) {
3034                    
3035          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3036          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3037            $self->{s_kwd} = '';
3038          ## reconsume          ## reconsume
3039    
3040          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3041    
3042          redo A;          redo A;
3043          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3044            
3045            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3046            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3047            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3048            $self->{in_subset} = 1;
3049            
3050        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3051          $self->{line_prev} = $self->{line};
3052          $self->{column_prev} = $self->{column};
3053          $self->{column}++;
3054          $self->{nc}
3055              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3056        } else {
3057          $self->{set_nc}->($self);
3058        }
3059      
3060            return  ($self->{ct}); # DOCTYPE
3061            redo A;
3062        } else {        } else {
3063                    
3064          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 2571  sub _get_next_token ($) { Line 3078  sub _get_next_token ($) {
3078          redo A;          redo A;
3079        }        }
3080      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3081  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
3082    
3083          ## ISSUE: Redundant "First," in the spec.
3084    
3085        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3086                    
3087          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 2590  sub _get_next_token ($) { Line 3100  sub _get_next_token ($) {
3100        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3101                    
3102          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3103            $self->{s_kwd} = '';
3104                    
3105      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2609  sub _get_next_token ($) { Line 3120  sub _get_next_token ($) {
3120                    
3121          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3122          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3123            $self->{s_kwd} = '';
3124          ## reconsume          ## reconsume
3125    
3126          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3127          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3128    
3129          redo A;          redo A;
3130          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3131            
3132            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3133            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3134            $self->{in_subset} = 1;
3135            
3136        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3137          $self->{line_prev} = $self->{line};
3138          $self->{column_prev} = $self->{column};
3139          $self->{column}++;
3140          $self->{nc}
3141              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3142        } else {
3143          $self->{set_nc}->($self);
3144        }
3145      
3146            return  ($self->{ct}); # DOCTYPE
3147            redo A;
3148        } else {        } else {
3149                    
3150          $self->{ct}->{name}          $self->{ct}->{name}
# Line 2634  sub _get_next_token ($) { Line 3164  sub _get_next_token ($) {
3164          redo A;          redo A;
3165        }        }
3166      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3167          ## XML5: Corresponding to XML5's "DOCTYPE root name after
3168          ## state", but implemented differently.
3169    
3170        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3171                    
3172          ## Stay in the state          ## Stay in the state
# Line 2650  sub _get_next_token ($) { Line 3183  sub _get_next_token ($) {
3183        
3184          redo A;          redo A;
3185        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3186            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3187              
3188              $self->{state} = DATA_STATE;
3189              $self->{s_kwd} = '';
3190            } else {
3191              
3192              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3193              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3194            }
3195                    
         $self->{state} = DATA_STATE;  
3196                    
3197      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3198        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2663  sub _get_next_token ($) { Line 3204  sub _get_next_token ($) {
3204        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3205      }      }
3206        
3207            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3208          redo A;          redo A;
3209        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3210            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3211              
3212              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3213              $self->{state} = DATA_STATE;
3214              $self->{s_kwd} = '';
3215              $self->{ct}->{quirks} = 1;
3216            } else {
3217              
3218              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3219              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3220            }
3221                    
3222          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          ## Reconsume.
3223          $self->{state} = DATA_STATE;          return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         ## reconsume  
   
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3224          redo A;          redo A;
3225        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
3226                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
3227            
3228          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
3229          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3230                    
3231      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3232        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2695  sub _get_next_token ($) { Line 3241  sub _get_next_token ($) {
3241          redo A;          redo A;
3242        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
3243                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
3244            
3245          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
3246          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3247                    
3248      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3249        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2709  sub _get_next_token ($) { Line 3256  sub _get_next_token ($) {
3256      }      }
3257        
3258          redo A;          redo A;
3259        } else {  ## TODO: " and ' for ENTITY
3260          } elsif ($self->{is_xml} and
3261                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3262                   $self->{nc} == 0x005B) { # [
3263                    
3264          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3265          $self->{ct}->{quirks} = 1;          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3266            $self->{in_subset} = 1;
3267            
3268        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3269          $self->{line_prev} = $self->{line};
3270          $self->{column_prev} = $self->{column};
3271          $self->{column}++;
3272          $self->{nc}
3273              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3274        } else {
3275          $self->{set_nc}->($self);
3276        }
3277      
3278            return  ($self->{ct}); # DOCTYPE
3279            redo A;
3280          } else {
3281            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3282    
3283            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3284              
3285              $self->{ct}->{quirks} = 1;
3286              $self->{state} = BOGUS_DOCTYPE_STATE;
3287            } else {
3288              
3289              $self->{state} = BOGUS_MD_STATE;
3290            }
3291    
         $self->{state} = BOGUS_DOCTYPE_STATE;  
3292                    
3293      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3294        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2736  sub _get_next_token ($) { Line 3310  sub _get_next_token ($) {
3310              0x0042, # B              0x0042, # B
3311              0x004C, # L              0x004C, # L
3312              0x0049, # I              0x0049, # I
3313            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3314            $self->{nc} == [            $self->{nc} == [
3315              undef,              undef,
3316              0x0075, # u              0x0075, # u
3317              0x0062, # b              0x0062, # b
3318              0x006C, # l              0x006C, # l
3319              0x0069, # i              0x0069, # i
3320            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3321                    
3322          ## Stay in the state.          ## Stay in the state.
3323          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3324                    
3325      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3326        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2759  sub _get_next_token ($) { Line 3333  sub _get_next_token ($) {
3333      }      }
3334        
3335          redo A;          redo A;
3336        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3337                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
3338                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
3339                    if ($self->{is_xml} and
3340                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3341              
3342              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3343                              text => 'PUBLIC',
3344                              line => $self->{line_prev},
3345                              column => $self->{column_prev} - 4);
3346            } else {
3347              
3348            }
3349          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3350                    
3351      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2777  sub _get_next_token ($) { Line 3360  sub _get_next_token ($) {
3360        
3361          redo A;          redo A;
3362        } else {        } else {
3363                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3364                          line => $self->{line_prev},                          line => $self->{line_prev},
3365                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3366          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3367              
3368          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3369              $self->{state} = BOGUS_DOCTYPE_STATE;
3370            } else {
3371              
3372              $self->{state} = BOGUS_MD_STATE;
3373            }
3374          ## Reconsume.          ## Reconsume.
3375          redo A;          redo A;
3376        }        }
# Line 2795  sub _get_next_token ($) { Line 3382  sub _get_next_token ($) {
3382              0x0053, # S              0x0053, # S
3383              0x0054, # T              0x0054, # T
3384              0x0045, # E              0x0045, # E
3385            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3386            $self->{nc} == [            $self->{nc} == [
3387              undef,              undef,
3388              0x0079, # y              0x0079, # y
3389              0x0073, # s              0x0073, # s
3390              0x0074, # t              0x0074, # t
3391              0x0065, # e              0x0065, # e
3392            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3393                    
3394          ## Stay in the state.          ## Stay in the state.
3395          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3396                    
3397      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3398        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2818  sub _get_next_token ($) { Line 3405  sub _get_next_token ($) {
3405      }      }
3406        
3407          redo A;          redo A;
3408        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3409                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
3410                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
3411                    if ($self->{is_xml} and
3412                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3413              
3414              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3415                              text => 'SYSTEM',
3416                              line => $self->{line_prev},
3417                              column => $self->{column_prev} - 4);
3418            } else {
3419              
3420            }
3421          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3422                    
3423      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2836  sub _get_next_token ($) { Line 3432  sub _get_next_token ($) {
3432        
3433          redo A;          redo A;
3434        } else {        } else {
3435                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3436                          line => $self->{line_prev},                          line => $self->{line_prev},
3437                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3438          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3439              
3440          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3441              $self->{state} = BOGUS_DOCTYPE_STATE;
3442            } else {
3443              
3444              $self->{state} = BOGUS_MD_STATE;
3445            }
3446          ## Reconsume.          ## Reconsume.
3447          redo A;          redo A;
3448        }        }
# Line 2895  sub _get_next_token ($) { Line 3495  sub _get_next_token ($) {
3495        
3496          redo A;          redo A;
3497        } elsif ($self->{nc} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
           
3498          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3499            
3500          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3501              
3502              $self->{state} = DATA_STATE;
3503              $self->{s_kwd} = '';
3504              $self->{ct}->{quirks} = 1;
3505            } else {
3506              
3507              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3508            }
3509            
3510                    
3511      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3512        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2910  sub _get_next_token ($) { Line 3518  sub _get_next_token ($) {
3518        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3519      }      }
3520        
3521            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3522          redo A;          redo A;
3523        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3524            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3525              
3526              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3527              $self->{state} = DATA_STATE;
3528              $self->{s_kwd} = '';
3529              $self->{ct}->{quirks} = 1;
3530            } else {
3531              
3532              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3533              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3534            }
3535                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
3536          ## reconsume          ## reconsume
   
         $self->{ct}->{quirks} = 1;  
3537          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3538          redo A;          redo A;
3539        } else {        } elsif ($self->{is_xml} and
3540                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3541                   $self->{nc} == 0x005B) { # [
3542                    
3543            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3544            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3545            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3546            $self->{in_subset} = 1;
3547            
3548        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3549          $self->{line_prev} = $self->{line};
3550          $self->{column_prev} = $self->{column};
3551          $self->{column}++;
3552          $self->{nc}
3553              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3554        } else {
3555          $self->{set_nc}->($self);
3556        }
3557      
3558            return  ($self->{ct}); # DOCTYPE
3559            redo A;
3560          } else {
3561          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
         $self->{ct}->{quirks} = 1;  
3562    
3563          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3564              
3565              $self->{ct}->{quirks} = 1;
3566              $self->{state} = BOGUS_DOCTYPE_STATE;
3567            } else {
3568              
3569              $self->{state} = BOGUS_MD_STATE;
3570            }
3571    
3572                    
3573      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3574        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2962  sub _get_next_token ($) { Line 3599  sub _get_next_token ($) {
3599        
3600          redo A;          redo A;
3601        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3602          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3603    
3604          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3605              
3606              $self->{state} = DATA_STATE;
3607              $self->{s_kwd} = '';
3608              $self->{ct}->{quirks} = 1;
3609            } else {
3610              
3611              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3612            }
3613    
3614                    
3615      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3616        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2977  sub _get_next_token ($) { Line 3622  sub _get_next_token ($) {
3622        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3623      }      }
3624        
3625            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3626          redo A;          redo A;
3627        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3628          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3629    
3630          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3631          ## reconsume            
3632              $self->{state} = DATA_STATE;
3633          $self->{ct}->{quirks} = 1;            $self->{s_kwd} = '';
3634              $self->{ct}->{quirks} = 1;
3635            } else {
3636              
3637              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3638            }
3639            
3640            ## Reconsume.
3641          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3642          redo A;          redo A;
3643        } else {        } else {
3644                    
3645          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3646          $self->{read_until}->($self->{ct}->{pubid}, q[">],          $self->{read_until}->($self->{ct}->{pubid}, q[">],
3647                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3648    
# Line 3031  sub _get_next_token ($) { Line 3677  sub _get_next_token ($) {
3677        
3678          redo A;          redo A;
3679        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3680          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3681    
3682          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3683              
3684              $self->{state} = DATA_STATE;
3685              $self->{s_kwd} = '';
3686              $self->{ct}->{quirks} = 1;
3687            } else {
3688              
3689              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3690            }
3691    
3692                    
3693      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3694        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3046  sub _get_next_token ($) { Line 3700  sub _get_next_token ($) {
3700        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3701      }      }
3702        
3703            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3704          redo A;          redo A;
3705        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3706          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3707    
3708          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3709              
3710              $self->{state} = DATA_STATE;
3711              $self->{s_kwd} = '';
3712              $self->{ct}->{quirks} = 1;
3713            } else {
3714              
3715              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3716            }
3717          
3718          ## reconsume          ## reconsume
3719            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3720          redo A;          redo A;
3721        } else {        } else {
3722                    
3723          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3724          $self->{read_until}->($self->{ct}->{pubid}, q['>],          $self->{read_until}->($self->{ct}->{pubid}, q['>],
3725                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3726    
# Line 3101  sub _get_next_token ($) { Line 3756  sub _get_next_token ($) {
3756          redo A;          redo A;
3757        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
3758                    
3759          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3760          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3761                    
3762      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3117  sub _get_next_token ($) { Line 3772  sub _get_next_token ($) {
3772          redo A;          redo A;
3773        } elsif ($self->{nc} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
3774                    
3775          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3776          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3777                    
3778      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3132  sub _get_next_token ($) { Line 3787  sub _get_next_token ($) {
3787        
3788          redo A;          redo A;
3789        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3790            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3791              if ($self->{is_xml}) {
3792                
3793                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3794              } else {
3795                
3796              }
3797              $self->{state} = DATA_STATE;
3798              $self->{s_kwd} = '';
3799            } else {
3800              if ($self->{ct}->{type} == NOTATION_TOKEN) {
3801                
3802              } else {
3803                
3804                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');            
3805              }
3806              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3807            }
3808                    
         $self->{state} = DATA_STATE;  
3809                    
3810      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3811        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3145  sub _get_next_token ($) { Line 3817  sub _get_next_token ($) {
3817        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3818      }      }
3819        
3820            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3821          redo A;          redo A;
3822        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3823            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3824              
3825              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3826              
3827              $self->{state} = DATA_STATE;
3828              $self->{s_kwd} = '';
3829              $self->{ct}->{quirks} = 1;
3830            } else {
3831              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3832              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3833            }
3834                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
3835          ## reconsume          ## reconsume
3836            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3837          $self->{ct}->{quirks} = 1;          redo A;
3838          } elsif ($self->{is_xml} and
3839                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3840                   $self->{nc} == 0x005B) { # [
3841            
3842            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3843            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3844            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3845            $self->{in_subset} = 1;
3846            
3847        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3848          $self->{line_prev} = $self->{line};
3849          $self->{column_prev} = $self->{column};
3850          $self->{column}++;
3851          $self->{nc}
3852              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3853        } else {
3854          $self->{set_nc}->($self);
3855        }
3856      
3857          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3858          redo A;          redo A;
3859        } else {        } else {
           
3860          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
         $self->{ct}->{quirks} = 1;  
3861    
3862          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3863              
3864              $self->{ct}->{quirks} = 1;
3865              $self->{state} = BOGUS_DOCTYPE_STATE;
3866            } else {
3867              
3868              $self->{state} = BOGUS_MD_STATE;
3869            }
3870    
3871                    
3872      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3873        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3228  sub _get_next_token ($) { Line 3930  sub _get_next_token ($) {
3930        
3931          redo A;          redo A;
3932        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3933          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
         $self->{state} = DATA_STATE;  
3934                    
3935      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3936        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3243  sub _get_next_token ($) { Line 3943  sub _get_next_token ($) {
3943      }      }
3944        
3945    
3946          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3947          return  ($self->{ct}); # DOCTYPE            
3948              $self->{state} = DATA_STATE;
3949              $self->{s_kwd} = '';
3950              $self->{ct}->{quirks} = 1;
3951            } else {
3952              
3953              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3954            }
3955    
3956            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3957          redo A;          redo A;
3958        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3959            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3960              
3961              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3962              $self->{state} = DATA_STATE;
3963              $self->{s_kwd} = '';
3964              $self->{ct}->{quirks} = 1;
3965            } else {
3966              
3967              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3968              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3969            }
3970                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
3971          ## reconsume          ## reconsume
3972            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3973            redo A;
3974          } elsif ($self->{is_xml} and
3975                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3976                   $self->{nc} == 0x005B) { # [
3977            
3978            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3979    
3980          $self->{ct}->{quirks} = 1;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3981            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3982            $self->{in_subset} = 1;
3983            
3984        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3985          $self->{line_prev} = $self->{line};
3986          $self->{column_prev} = $self->{column};
3987          $self->{column}++;
3988          $self->{nc}
3989              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3990        } else {
3991          $self->{set_nc}->($self);
3992        }
3993      
3994          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3995          redo A;          redo A;
3996        } else {        } else {
           
3997          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
         $self->{ct}->{quirks} = 1;  
3998    
3999          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4000                        
4001              $self->{ct}->{quirks} = 1;
4002              $self->{state} = BOGUS_DOCTYPE_STATE;
4003            } else {
4004              
4005              $self->{state} = BOGUS_MD_STATE;
4006            }
4007    
4008                    
4009      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4010        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3293  sub _get_next_token ($) { Line 4034  sub _get_next_token ($) {
4034      }      }
4035        
4036          redo A;          redo A;
4037        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
           
4038          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4039    
4040          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4041              
4042              $self->{state} = DATA_STATE;
4043              $self->{s_kwd} = '';
4044              $self->{ct}->{quirks} = 1;
4045            } else {
4046              
4047              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4048            }
4049            
4050                    
4051      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4052        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3309  sub _get_next_token ($) { Line 4058  sub _get_next_token ($) {
4058        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4059      }      }
4060        
4061            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4062          redo A;          redo A;
4063        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4064          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4065    
4066          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4067              
4068              $self->{state} = DATA_STATE;
4069              $self->{s_kwd} = '';
4070              $self->{ct}->{quirks} = 1;
4071            } else {
4072              
4073              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4074            }
4075            
4076          ## reconsume          ## reconsume
4077            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4078          redo A;          redo A;
4079        } else {        } else {
4080                    
4081          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4082          $self->{read_until}->($self->{ct}->{sysid}, q[">],          $self->{read_until}->($self->{ct}->{sysid}, q[">],
4083                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4084    
# Line 3362  sub _get_next_token ($) { Line 4112  sub _get_next_token ($) {
4112      }      }
4113        
4114          redo A;          redo A;
4115        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4116                    
4117          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4118    
4119          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4120            $self->{s_kwd} = '';
4121                    
4122      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4123        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3384  sub _get_next_token ($) { Line 4135  sub _get_next_token ($) {
4135    
4136          redo A;          redo A;
4137        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4138          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4139    
4140          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4141          ## reconsume            
4142              $self->{state} = DATA_STATE;
4143          $self->{ct}->{quirks} = 1;            $self->{s_kwd} = '';
4144          return  ($self->{ct}); # DOCTYPE            $self->{ct}->{quirks} = 1;
4145            } else {
4146              
4147              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4148            }
4149    
4150            ## reconsume
4151            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4152          redo A;          redo A;
4153        } else {        } else {
4154                    
4155          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4156          $self->{read_until}->($self->{ct}->{sysid}, q['>],          $self->{read_until}->($self->{ct}->{sysid}, q['>],
4157                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4158    
# Line 3432  sub _get_next_token ($) { Line 4187  sub _get_next_token ($) {
4187        
4188          redo A;          redo A;
4189        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4190                    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4191          $self->{state} = DATA_STATE;            
4192              $self->{state} = DATA_STATE;
4193              $self->{s_kwd} = '';
4194            } else {
4195              
4196              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4197            }
4198    
4199                    
4200      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4201        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3445  sub _get_next_token ($) { Line 4207  sub _get_next_token ($) {
4207        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4208      }      }
4209        
4210            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
4211          redo A;          redo A;
4212    ## TODO: "NDATA"
4213        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4214                    if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4215          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');            
4216          $self->{state} = DATA_STATE;            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4217          ## reconsume            $self->{state} = DATA_STATE;
4218              $self->{s_kwd} = '';
4219              $self->{ct}->{quirks} = 1;
4220            } else {
4221              
4222              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4223              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4224            }
4225    
4226          $self->{ct}->{quirks} = 1;          ## reconsume
4227            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4228            redo A;
4229          } elsif ($self->{is_xml} and
4230                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4231                   $self->{nc} == 0x005B) { # [
4232            
4233            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4234            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4235            $self->{in_subset} = 1;
4236            
4237        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4238          $self->{line_prev} = $self->{line};
4239          $self->{column_prev} = $self->{column};
4240          $self->{column}++;
4241          $self->{nc}
4242              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4243        } else {
4244          $self->{set_nc}->($self);
4245        }
4246      
4247          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
4248          redo A;          redo A;
4249        } else {        } else {
           
4250          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
         #$self->{ct}->{quirks} = 1;  
4251    
4252          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4253              
4254              #$self->{ct}->{quirks} = 1;
4255              $self->{state} = BOGUS_DOCTYPE_STATE;
4256            } else {
4257              
4258              $self->{state} = BOGUS_MD_STATE;
4259            }
4260    
4261                    
4262      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4263        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3482  sub _get_next_token ($) { Line 4275  sub _get_next_token ($) {
4275        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4276                    
4277          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4278            $self->{s_kwd} = '';
4279                    
4280      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4281        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3497  sub _get_next_token ($) { Line 4291  sub _get_next_token ($) {
4291          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4292    
4293          redo A;          redo A;
4294          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4295            
4296            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4297            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4298            $self->{in_subset} = 1;
4299            
4300        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4301          $self->{line_prev} = $self->{line};
4302          $self->{column_prev} = $self->{column};
4303          $self->{column}++;
4304          $self->{nc}
4305              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4306        } else {
4307          $self->{set_nc}->($self);
4308        }
4309      
4310            return  ($self->{ct}); # DOCTYPE
4311            redo A;
4312        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4313                    
4314          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4315            $self->{s_kwd} = '';
4316          ## reconsume          ## reconsume
4317    
4318          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
# Line 3508  sub _get_next_token ($) { Line 4321  sub _get_next_token ($) {
4321        } else {        } else {
4322                    
4323          my $s = '';          my $s = '';
4324          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
4325    
4326          ## Stay in the state          ## Stay in the state
4327                    
# Line 3528  sub _get_next_token ($) { Line 4341  sub _get_next_token ($) {
4341        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
4342        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4343        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
4344    
4345          ## XML5: "CDATA state".
4346                
4347        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4348                    
# Line 3545  sub _get_next_token ($) { Line 4360  sub _get_next_token ($) {
4360        
4361          redo A;          redo A;
4362        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4363            if ($self->{is_xml}) {
4364              
4365              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4366            } else {
4367              
4368            }
4369    
4370          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4371                    $self->{s_kwd} = '';
4372      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {          ## Reconsume.
       $self->{line_prev} = $self->{line};  
       $self->{column_prev} = $self->{column};  
       $self->{column}++;  
       $self->{nc}  
           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);  
     } else {  
       $self->{set_nc}->($self);  
     }  
     
4373          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
4374                        
4375            return  ($self->{ct}); # character            return  ($self->{ct}); # character
# Line 3589  sub _get_next_token ($) { Line 4402  sub _get_next_token ($) {
4402    
4403        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
4404      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4405          ## XML5: "CDATA bracket state".
4406    
4407        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4408                    
4409          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 3606  sub _get_next_token ($) { Line 4421  sub _get_next_token ($) {
4421          redo A;          redo A;
4422        } else {        } else {
4423                    
4424            ## XML5: If EOF, "]" is not appended and changed to the data state.
4425          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
4426          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4427          ## Reconsume.          ## Reconsume.
4428          redo A;          redo A;
4429        }        }
4430      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4431          ## XML5: "CDATA end state".
4432    
4433        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4434          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4435            $self->{s_kwd} = '';
4436                    
4437      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4438        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3653  sub _get_next_token ($) { Line 4472  sub _get_next_token ($) {
4472                    
4473          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
4474          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
4475          ## Reconsume.          ## Reconsume. ## XML5: Emit.
4476          redo A;          redo A;
4477        }        }
4478      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 3670  sub _get_next_token ($) { Line 4489  sub _get_next_token ($) {
4489        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
4490                    
4491          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
4492          $self->{s_kwd} = '#';          $self->{kwd} = '#';
4493                    
4494      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4495        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3690  sub _get_next_token ($) { Line 4509  sub _get_next_token ($) {
4509                    
4510          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
4511          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
4512          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
4513          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
4514          $self->{entity__match} = 0;          $self->{entity__match} = 0;
4515                    
4516      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3721  sub _get_next_token ($) { Line 4540  sub _get_next_token ($) {
4540        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4541                    
4542          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4543            $self->{s_kwd} = '';
4544          ## Reconsume.          ## Reconsume.
4545          return  ({type => CHARACTER_TOKEN, data => '&',          return  ({type => CHARACTER_TOKEN, data => '&',
4546                    line => $self->{line_prev},                    line => $self->{line_prev},
# Line 3731  sub _get_next_token ($) { Line 4551  sub _get_next_token ($) {
4551                    
4552          $self->{ca}->{value} .= '&';          $self->{ca}->{value} .= '&';
4553          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4554            $self->{s_kwd} = '';
4555          ## Reconsume.          ## Reconsume.
4556          redo A;          redo A;
4557        }        }
# Line 3739  sub _get_next_token ($) { Line 4560  sub _get_next_token ($) {
4560            $self->{nc} == 0x0058) { # X            $self->{nc} == 0x0058) { # X
4561                    
4562          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
4563          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4564                    
4565      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4566        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3756  sub _get_next_token ($) { Line 4577  sub _get_next_token ($) {
4577                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
4578                    
4579          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
4580          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
4581                    
4582      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4583        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3781  sub _get_next_token ($) { Line 4602  sub _get_next_token ($) {
4602          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
4603                        
4604            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4605              $self->{s_kwd} = '';
4606            ## Reconsume.            ## Reconsume.
4607            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4608                      data => '&#',                      data => '&#',
# Line 3792  sub _get_next_token ($) { Line 4614  sub _get_next_token ($) {
4614                        
4615            $self->{ca}->{value} .= '&#';            $self->{ca}->{value} .= '&#';
4616            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4617              $self->{s_kwd} = '';
4618            ## Reconsume.            ## Reconsume.
4619            redo A;            redo A;
4620          }          }
# Line 3800  sub _get_next_token ($) { Line 4623  sub _get_next_token ($) {
4623        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
4624            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
4625                    
4626          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
4627          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4628                    
4629          ## Stay in the state.          ## Stay in the state.
4630                    
# Line 3837  sub _get_next_token ($) { Line 4660  sub _get_next_token ($) {
4660          #          #
4661        }        }
4662    
4663        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4664        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4665        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4666        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 3857  sub _get_next_token ($) { Line 4680  sub _get_next_token ($) {
4680        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4681                    
4682          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4683            $self->{s_kwd} = '';
4684          ## Reconsume.          ## Reconsume.
4685          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4686                      has_reference => 1,
4687                    line => $l, column => $c,                    line => $l, column => $c,
4688                   });                   });
4689          redo A;          redo A;
# Line 3867  sub _get_next_token ($) { Line 4692  sub _get_next_token ($) {
4692          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
4693          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
4694          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4695            $self->{s_kwd} = '';
4696          ## Reconsume.          ## Reconsume.
4697          redo A;          redo A;
4698        }        }
# Line 3877  sub _get_next_token ($) { Line 4703  sub _get_next_token ($) {
4703          # 0..9, A..F, a..f          # 0..9, A..F, a..f
4704                    
4705          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
4706          $self->{s_kwd} = 0;          $self->{kwd} = 0;
4707          ## Reconsume.          ## Reconsume.
4708          redo A;          redo A;
4709        } else {        } else {
# Line 3892  sub _get_next_token ($) { Line 4718  sub _get_next_token ($) {
4718          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
4719                        
4720            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4721              $self->{s_kwd} = '';
4722            ## Reconsume.            ## Reconsume.
4723            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4724                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
4725                      line => $self->{line_prev},                      line => $self->{line_prev},
4726                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
4727                     });                     });
4728            redo A;            redo A;
4729          } else {          } else {
4730                        
4731            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
4732            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4733              $self->{s_kwd} = '';
4734            ## Reconsume.            ## Reconsume.
4735            redo A;            redo A;
4736          }          }
# Line 3911  sub _get_next_token ($) { Line 4739  sub _get_next_token ($) {
4739        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4740          # 0..9          # 0..9
4741                    
4742          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4743          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4744          ## Stay in the state.          ## Stay in the state.
4745                    
4746      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3929  sub _get_next_token ($) { Line 4757  sub _get_next_token ($) {
4757        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
4758                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
4759                    
4760          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4761          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
4762          ## Stay in the state.          ## Stay in the state.
4763                    
4764      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3947  sub _get_next_token ($) { Line 4775  sub _get_next_token ($) {
4775        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
4776                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
4777                    
4778          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4779          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
4780          ## Stay in the state.          ## Stay in the state.
4781                    
4782      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3985  sub _get_next_token ($) { Line 4813  sub _get_next_token ($) {
4813          #          #
4814        }        }
4815    
4816        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4817        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4818        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4819        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 4005  sub _get_next_token ($) { Line 4833  sub _get_next_token ($) {
4833        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4834                    
4835          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4836            $self->{s_kwd} = '';
4837          ## Reconsume.          ## Reconsume.
4838          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4839                      has_reference => 1,
4840                    line => $l, column => $c,                    line => $l, column => $c,
4841                   });                   });
4842          redo A;          redo A;
# Line 4015  sub _get_next_token ($) { Line 4845  sub _get_next_token ($) {
4845          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
4846          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
4847          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4848            $self->{s_kwd} = '';
4849          ## Reconsume.          ## Reconsume.
4850          redo A;          redo A;
4851        }        }
4852      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
4853        if (length $self->{s_kwd} < 30 and        if (length $self->{kwd} < 30 and
4854            ## NOTE: Some number greater than the maximum length of entity name            ## NOTE: Some number greater than the maximum length of entity name
4855            ((0x0041 <= $self->{nc} and # a            ((0x0041 <= $self->{nc} and # a
4856              $self->{nc} <= 0x005A) or # x              $self->{nc} <= 0x005A) or # x
# Line 4029  sub _get_next_token ($) { Line 4860  sub _get_next_token ($) {
4860              $self->{nc} <= 0x0039) or # 9              $self->{nc} <= 0x0039) or # 9
4861             $self->{nc} == 0x003B)) { # ;             $self->{nc} == 0x003B)) { # ;
4862          our $EntityChar;          our $EntityChar;
4863          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4864          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}}) {
4865            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
4866                            
4867              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
4868              $self->{entity__match} = 1;              $self->{entity__match} = 1;
4869                            
4870      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4049  sub _get_next_token ($) { Line 4880  sub _get_next_token ($) {
4880              #              #
4881            } else {            } else {
4882                            
4883              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
4884              $self->{entity__match} = -1;              $self->{entity__match} = -1;
4885              ## Stay in the state.              ## Stay in the state.
4886                            
# Line 4097  sub _get_next_token ($) { Line 4928  sub _get_next_token ($) {
4928          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
4929              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
4930                        
4931            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
4932            #            #
4933          } else {          } else {
4934                        
# Line 4109  sub _get_next_token ($) { Line 4940  sub _get_next_token ($) {
4940                    
4941          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4942                          line => $self->{line_prev},                          line => $self->{line_prev},
4943                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
4944          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
4945          #          #
4946        }        }
4947        
# Line 4127  sub _get_next_token ($) { Line 4958  sub _get_next_token ($) {
4958        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4959                    
4960          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4961            $self->{s_kwd} = '';
4962          ## Reconsume.          ## Reconsume.
4963          return  ({type => CHARACTER_TOKEN,          return  ({type => CHARACTER_TOKEN,
4964                    data => $data,                    data => $data,
4965                      has_reference => $has_ref,
4966                    line => $self->{line_prev},                    line => $self->{line_prev},
4967                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
4968                   });                   });
4969          redo A;          redo A;
4970        } else {        } else {
# Line 4139  sub _get_next_token ($) { Line 4972  sub _get_next_token ($) {
4972          $self->{ca}->{value} .= $data;          $self->{ca}->{value} .= $data;
4973          $self->{ca}->{has_reference} = 1 if $has_ref;          $self->{ca}->{has_reference} = 1 if $has_ref;
4974          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4975            $self->{s_kwd} = '';
4976            ## Reconsume.
4977            redo A;
4978          }
4979    
4980        ## XML-only states
4981    
4982        } elsif ($self->{state} == PI_STATE) {
4983          ## XML5: "Pi state" and "DOCTYPE pi state".
4984    
4985          if ($is_space->{$self->{nc}} or
4986              $self->{nc} == 0x003F or # ?
4987              $self->{nc} == -1) {
4988            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
4989            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
4990            ## "DOCTYPE pi state": Parse error, switch to the "data
4991            ## state".
4992            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4993                            line => $self->{line_prev},
4994                            column => $self->{column_prev}
4995                                - 1 * ($self->{nc} != -1));
4996            $self->{state} = BOGUS_COMMENT_STATE;
4997            ## Reconsume.
4998            $self->{ct} = {type => COMMENT_TOKEN,
4999                           data => '?',
5000                           line => $self->{line_prev},
5001                           column => $self->{column_prev}
5002                               - 1 * ($self->{nc} != -1),
5003                          };
5004            redo A;
5005          } else {
5006            ## XML5: "DOCTYPE pi state": Stay in the state.
5007            $self->{ct} = {type => PI_TOKEN,
5008                           target => chr $self->{nc},
5009                           data => '',
5010                           line => $self->{line_prev},
5011                           column => $self->{column_prev} - 1,
5012                          };
5013            $self->{state} = PI_TARGET_STATE;
5014            
5015        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5016          $self->{line_prev} = $self->{line};
5017          $self->{column_prev} = $self->{column};
5018          $self->{column}++;
5019          $self->{nc}
5020              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5021        } else {
5022          $self->{set_nc}->($self);
5023        }
5024      
5025            redo A;
5026          }
5027        } elsif ($self->{state} == PI_TARGET_STATE) {
5028          if ($is_space->{$self->{nc}}) {
5029            $self->{state} = PI_TARGET_AFTER_STATE;
5030            
5031        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5032          $self->{line_prev} = $self->{line};
5033          $self->{column_prev} = $self->{column};
5034          $self->{column}++;
5035          $self->{nc}
5036              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5037        } else {
5038          $self->{set_nc}->($self);
5039        }
5040      
5041            redo A;
5042          } elsif ($self->{nc} == -1) {
5043            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5044            if ($self->{in_subset}) {
5045              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5046            } else {
5047              $self->{state} = DATA_STATE;
5048              $self->{s_kwd} = '';
5049            }
5050            ## Reconsume.
5051            return  ($self->{ct}); # pi
5052            redo A;
5053          } elsif ($self->{nc} == 0x003F) { # ?
5054            $self->{state} = PI_AFTER_STATE;
5055            
5056        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5057          $self->{line_prev} = $self->{line};
5058          $self->{column_prev} = $self->{column};
5059          $self->{column}++;
5060          $self->{nc}
5061              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5062        } else {
5063          $self->{set_nc}->($self);
5064        }
5065      
5066            redo A;
5067          } else {
5068            ## XML5: typo ("tag name" -> "target")
5069            $self->{ct}->{target} .= chr $self->{nc}; # pi
5070            
5071        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5072          $self->{line_prev} = $self->{line};
5073          $self->{column_prev} = $self->{column};
5074          $self->{column}++;
5075          $self->{nc}
5076              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5077        } else {
5078          $self->{set_nc}->($self);
5079        }
5080      
5081            redo A;
5082          }
5083        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5084          if ($is_space->{$self->{nc}}) {
5085            ## Stay in the state.
5086            
5087        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5088          $self->{line_prev} = $self->{line};
5089          $self->{column_prev} = $self->{column};
5090          $self->{column}++;
5091          $self->{nc}
5092              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5093        } else {
5094          $self->{set_nc}->($self);
5095        }
5096      
5097            redo A;
5098          } else {
5099            $self->{state} = PI_DATA_STATE;
5100            ## Reprocess.
5101            redo A;
5102          }
5103        } elsif ($self->{state} == PI_DATA_STATE) {
5104          if ($self->{nc} == 0x003F) { # ?
5105            $self->{state} = PI_DATA_AFTER_STATE;
5106            
5107        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5108          $self->{line_prev} = $self->{line};
5109          $self->{column_prev} = $self->{column};
5110          $self->{column}++;
5111          $self->{nc}
5112              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5113        } else {
5114          $self->{set_nc}->($self);
5115        }
5116      
5117            redo A;
5118          } elsif ($self->{nc} == -1) {
5119            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5120            if ($self->{in_subset}) {
5121              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5122            } else {
5123              $self->{state} = DATA_STATE;
5124              $self->{s_kwd} = '';
5125            }
5126            ## Reprocess.
5127            return  ($self->{ct}); # pi
5128            redo A;
5129          } else {
5130            $self->{ct}->{data} .= chr $self->{nc}; # pi
5131            $self->{read_until}->($self->{ct}->{data}, q[?],
5132                                  length $self->{ct}->{data});
5133            ## Stay in the state.
5134            
5135        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5136          $self->{line_prev} = $self->{line};
5137          $self->{column_prev} = $self->{column};
5138          $self->{column}++;
5139          $self->{nc}
5140              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5141        } else {
5142          $self->{set_nc}->($self);
5143        }
5144      
5145            ## Reprocess.
5146            redo A;
5147          }
5148        } elsif ($self->{state} == PI_AFTER_STATE) {
5149          ## XML5: Part of "Pi after state".
5150    
5151          if ($self->{nc} == 0x003E) { # >
5152            if ($self->{in_subset}) {
5153              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5154            } else {
5155              $self->{state} = DATA_STATE;
5156              $self->{s_kwd} = '';
5157            }
5158            
5159        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5160          $self->{line_prev} = $self->{line};
5161          $self->{column_prev} = $self->{column};
5162          $self->{column}++;
5163          $self->{nc}
5164              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5165        } else {
5166          $self->{set_nc}->($self);
5167        }
5168      
5169            return  ($self->{ct}); # pi
5170            redo A;
5171          } elsif ($self->{nc} == 0x003F) { # ?
5172            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5173                            line => $self->{line_prev},
5174                            column => $self->{column_prev}); ## XML5: no error
5175            $self->{ct}->{data} .= '?';
5176            $self->{state} = PI_DATA_AFTER_STATE;
5177            
5178        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5179          $self->{line_prev} = $self->{line};
5180          $self->{column_prev} = $self->{column};
5181          $self->{column}++;
5182          $self->{nc}
5183              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5184        } else {
5185          $self->{set_nc}->($self);
5186        }
5187      
5188            redo A;
5189          } else {
5190            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5191                            line => $self->{line_prev},
5192                            column => $self->{column_prev}
5193                                + 1 * ($self->{nc} == -1)); ## XML5: no error
5194            $self->{ct}->{data} .= '?'; ## XML5: not appended
5195            $self->{state} = PI_DATA_STATE;
5196            ## Reprocess.
5197            redo A;
5198          }
5199        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5200          ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5201    
5202          if ($self->{nc} == 0x003E) { # >
5203            if ($self->{in_subset}) {
5204              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5205            } else {
5206              $self->{state} = DATA_STATE;
5207              $self->{s_kwd} = '';
5208            }
5209            
5210        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5211          $self->{line_prev} = $self->{line};
5212          $self->{column_prev} = $self->{column};
5213          $self->{column}++;
5214          $self->{nc}
5215              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5216        } else {
5217          $self->{set_nc}->($self);
5218        }
5219      
5220            return  ($self->{ct}); # pi
5221            redo A;
5222          } elsif ($self->{nc} == 0x003F) { # ?
5223            $self->{ct}->{data} .= '?';
5224            ## Stay in the state.
5225            
5226        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5227          $self->{line_prev} = $self->{line};
5228          $self->{column_prev} = $self->{column};
5229          $self->{column}++;
5230          $self->{nc}
5231              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5232        } else {
5233          $self->{set_nc}->($self);
5234        }
5235      
5236            redo A;
5237          } else {
5238            $self->{ct}->{data} .= '?'; ## XML5: not appended
5239            $self->{state} = PI_DATA_STATE;
5240            ## Reprocess.
5241            redo A;
5242          }
5243    
5244        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5245          if ($self->{nc} == 0x003C) { # <
5246            $self->{state} = DOCTYPE_TAG_STATE;
5247            
5248        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5249          $self->{line_prev} = $self->{line};
5250          $self->{column_prev} = $self->{column};
5251          $self->{column}++;
5252          $self->{nc}
5253              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5254        } else {
5255          $self->{set_nc}->($self);
5256        }
5257      
5258            redo A;
5259          } elsif ($self->{nc} == 0x0025) { # %
5260            ## XML5: Not defined yet.
5261    
5262            ## TODO:
5263            
5264        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5265          $self->{line_prev} = $self->{line};
5266          $self->{column_prev} = $self->{column};
5267          $self->{column}++;
5268          $self->{nc}
5269              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5270        } else {
5271          $self->{set_nc}->($self);
5272        }
5273      
5274            redo A;
5275          } elsif ($self->{nc} == 0x005D) { # ]
5276            delete $self->{in_subset};
5277            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5278            
5279        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5280          $self->{line_prev} = $self->{line};
5281          $self->{column_prev} = $self->{column};
5282          $self->{column}++;
5283          $self->{nc}
5284              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5285        } else {
5286          $self->{set_nc}->($self);
5287        }
5288      
5289            redo A;
5290          } elsif ($is_space->{$self->{nc}}) {
5291            ## Stay in the state.
5292            
5293        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5294          $self->{line_prev} = $self->{line};
5295          $self->{column_prev} = $self->{column};
5296          $self->{column}++;
5297          $self->{nc}
5298              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5299        } else {
5300          $self->{set_nc}->($self);
5301        }
5302      
5303            redo A;
5304          } elsif ($self->{nc} == -1) {
5305            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5306            delete $self->{in_subset};
5307            $self->{state} = DATA_STATE;
5308            $self->{s_kwd} = '';
5309            ## Reconsume.
5310            return  ({type => END_OF_DOCTYPE_TOKEN});
5311            redo A;
5312          } else {
5313            unless ($self->{internal_subset_tainted}) {
5314              ## XML5: No parse error.
5315              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5316              $self->{internal_subset_tainted} = 1;
5317            }
5318            ## Stay in the state.
5319            
5320        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5321          $self->{line_prev} = $self->{line};
5322          $self->{column_prev} = $self->{column};
5323          $self->{column}++;
5324          $self->{nc}
5325              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5326        } else {
5327          $self->{set_nc}->($self);
5328        }
5329      
5330            redo A;
5331          }
5332        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5333          if ($self->{nc} == 0x003E) { # >
5334            $self->{state} = DATA_STATE;
5335            $self->{s_kwd} = '';
5336            
5337        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5338          $self->{line_prev} = $self->{line};
5339          $self->{column_prev} = $self->{column};
5340          $self->{column}++;
5341          $self->{nc}
5342              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5343        } else {
5344          $self->{set_nc}->($self);
5345        }
5346      
5347            return  ({type => END_OF_DOCTYPE_TOKEN});
5348            redo A;
5349          } elsif ($self->{nc} == -1) {
5350            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5351            $self->{state} = DATA_STATE;
5352            $self->{s_kwd} = '';
5353            ## Reconsume.
5354            return  ({type => END_OF_DOCTYPE_TOKEN});
5355            redo A;
5356          } else {
5357            ## XML5: No parse error and stay in the state.
5358            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5359    
5360            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5361            
5362        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5363          $self->{line_prev} = $self->{line};
5364          $self->{column_prev} = $self->{column};
5365          $self->{column}++;
5366          $self->{nc}
5367              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5368        } else {
5369          $self->{set_nc}->($self);
5370        }
5371      
5372            redo A;
5373          }
5374        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5375          if ($self->{nc} == 0x003E) { # >
5376            $self->{state} = DATA_STATE;
5377            $self->{s_kwd} = '';
5378            
5379        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5380          $self->{line_prev} = $self->{line};
5381          $self->{column_prev} = $self->{column};
5382          $self->{column}++;
5383          $self->{nc}
5384              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5385        } else {
5386          $self->{set_nc}->($self);
5387        }
5388      
5389            return  ({type => END_OF_DOCTYPE_TOKEN});
5390            redo A;
5391          } elsif ($self->{nc} == -1) {
5392            $self->{state} = DATA_STATE;
5393            $self->{s_kwd} = '';
5394            ## Reconsume.
5395            return  ({type => END_OF_DOCTYPE_TOKEN});
5396            redo A;
5397          } else {
5398            ## Stay in the state.
5399            
5400        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5401          $self->{line_prev} = $self->{line};
5402          $self->{column_prev} = $self->{column};
5403          $self->{column}++;
5404          $self->{nc}
5405              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5406        } else {
5407          $self->{set_nc}->($self);
5408        }
5409      
5410            redo A;
5411          }
5412        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5413          if ($self->{nc} == 0x0021) { # !
5414            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5415            
5416        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5417          $self->{line_prev} = $self->{line};
5418          $self->{column_prev} = $self->{column};
5419          $self->{column}++;
5420          $self->{nc}
5421              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5422        } else {
5423          $self->{set_nc}->($self);
5424        }
5425      
5426            redo A;
5427          } elsif ($self->{nc} == 0x003F) { # ?
5428            $self->{state} = PI_STATE;
5429            
5430        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5431          $self->{line_prev} = $self->{line};
5432          $self->{column_prev} = $self->{column};
5433          $self->{column}++;
5434          $self->{nc}
5435              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5436        } else {
5437          $self->{set_nc}->($self);
5438        }
5439      
5440            redo A;
5441          } elsif ($self->{nc} == -1) {
5442            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5443            $self->{state} = DATA_STATE;
5444            $self->{s_kwd} = '';
5445            ## Reconsume.
5446            redo A;
5447          } else {
5448            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5449                            line => $self->{line_prev},
5450                            column => $self->{column_prev});
5451            $self->{state} = BOGUS_COMMENT_STATE;
5452            $self->{ct} = {type => COMMENT_TOKEN,
5453                           data => '',
5454                          }; ## NOTE: Will be discarded.
5455            
5456        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5457          $self->{line_prev} = $self->{line};
5458          $self->{column_prev} = $self->{column};
5459          $self->{column}++;
5460          $self->{nc}
5461              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5462        } else {
5463          $self->{set_nc}->($self);
5464        }
5465      
5466            redo A;
5467          }
5468        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5469          ## XML5: "DOCTYPE markup declaration state".
5470          
5471          if ($self->{nc} == 0x002D) { # -
5472            $self->{state} = MD_HYPHEN_STATE;
5473            
5474        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5475          $self->{line_prev} = $self->{line};
5476          $self->{column_prev} = $self->{column};
5477          $self->{column}++;
5478          $self->{nc}
5479              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5480        } else {
5481          $self->{set_nc}->($self);
5482        }
5483      
5484            redo A;
5485          } elsif ($self->{nc} == 0x0045 or # E
5486                   $self->{nc} == 0x0065) { # e
5487            $self->{state} = MD_E_STATE;
5488            $self->{kwd} = chr $self->{nc};
5489            
5490        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5491          $self->{line_prev} = $self->{line};
5492          $self->{column_prev} = $self->{column};
5493          $self->{column}++;
5494          $self->{nc}
5495              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5496        } else {
5497          $self->{set_nc}->($self);
5498        }
5499      
5500            redo A;
5501          } elsif ($self->{nc} == 0x0041 or # A
5502                   $self->{nc} == 0x0061) { # a
5503            $self->{state} = MD_ATTLIST_STATE;
5504            $self->{kwd} = chr $self->{nc};
5505            
5506        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5507          $self->{line_prev} = $self->{line};
5508          $self->{column_prev} = $self->{column};
5509          $self->{column}++;
5510          $self->{nc}
5511              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5512        } else {
5513          $self->{set_nc}->($self);
5514        }
5515      
5516            redo A;
5517          } elsif ($self->{nc} == 0x004E or # N
5518                   $self->{nc} == 0x006E) { # n
5519            $self->{state} = MD_NOTATION_STATE;
5520            $self->{kwd} = chr $self->{nc};
5521            
5522        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5523          $self->{line_prev} = $self->{line};
5524          $self->{column_prev} = $self->{column};
5525          $self->{column}++;
5526          $self->{nc}
5527              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5528        } else {
5529          $self->{set_nc}->($self);
5530        }
5531      
5532            redo A;
5533          } else {
5534            #
5535          }
5536          
5537          ## XML5: No parse error.
5538          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5539                          line => $self->{line_prev},
5540                          column => $self->{column_prev} - 1);
5541          ## Reconsume.
5542          $self->{state} = BOGUS_COMMENT_STATE;
5543          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5544          redo A;
5545        } elsif ($self->{state} == MD_E_STATE) {
5546          if ($self->{nc} == 0x004E or # N
5547              $self->{nc} == 0x006E) { # n
5548            $self->{state} = MD_ENTITY_STATE;
5549            $self->{kwd} .= chr $self->{nc};
5550            
5551        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5552          $self->{line_prev} = $self->{line};
5553          $self->{column_prev} = $self->{column};
5554          $self->{column}++;
5555          $self->{nc}
5556              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5557        } else {
5558          $self->{set_nc}->($self);
5559        }
5560      
5561            redo A;
5562          } elsif ($self->{nc} == 0x004C or # L
5563                   $self->{nc} == 0x006C) { # l
5564            ## XML5: <!ELEMENT> not supported.
5565            $self->{state} = MD_ELEMENT_STATE;
5566            $self->{kwd} .= chr $self->{nc};
5567            
5568        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5569          $self->{line_prev} = $self->{line};
5570          $self->{column_prev} = $self->{column};
5571          $self->{column}++;
5572          $self->{nc}
5573              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5574        } else {
5575          $self->{set_nc}->($self);
5576        }
5577      
5578            redo A;
5579          } else {
5580            ## XML5: No parse error.
5581            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5582                            line => $self->{line_prev},
5583                            column => $self->{column_prev} - 2
5584                                + 1 * ($self->{nc} == -1));
5585            ## Reconsume.
5586            $self->{state} = BOGUS_COMMENT_STATE;
5587            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5588            redo A;
5589          }
5590        } elsif ($self->{state} == MD_ENTITY_STATE) {
5591          if ($self->{nc} == [
5592                undef,
5593                undef,
5594                0x0054, # T
5595                0x0049, # I
5596                0x0054, # T
5597              ]->[length $self->{kwd}] or
5598              $self->{nc} == [
5599                undef,
5600                undef,
5601                0x0074, # t
5602                0x0069, # i
5603                0x0074, # t
5604              ]->[length $self->{kwd}]) {
5605            ## Stay in the state.
5606            $self->{kwd} .= chr $self->{nc};
5607            
5608        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5609          $self->{line_prev} = $self->{line};
5610          $self->{column_prev} = $self->{column};
5611          $self->{column}++;
5612          $self->{nc}
5613              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5614        } else {
5615          $self->{set_nc}->($self);
5616        }
5617      
5618            redo A;
5619          } elsif ((length $self->{kwd}) == 5 and
5620                   ($self->{nc} == 0x0059 or # Y
5621                    $self->{nc} == 0x0079)) { # y
5622            if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5623              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5624                              text => 'ENTITY',
5625                              line => $self->{line_prev},
5626                              column => $self->{column_prev} - 4);
5627            }
5628            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5629                           line => $self->{line_prev},
5630                           column => $self->{column_prev} - 6};
5631            $self->{state} = DOCTYPE_MD_STATE;
5632            
5633        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5634          $self->{line_prev} = $self->{line};
5635          $self->{column_prev} = $self->{column};
5636          $self->{column}++;
5637          $self->{nc}
5638              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5639        } else {
5640          $self->{set_nc}->($self);
5641        }
5642      
5643            redo A;
5644          } else {
5645            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5646                            line => $self->{line_prev},
5647                            column => $self->{column_prev} - 1
5648                                - (length $self->{kwd})
5649                                + 1 * ($self->{nc} == -1));
5650            $self->{state} = BOGUS_COMMENT_STATE;
5651            ## Reconsume.
5652            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5653            redo A;
5654          }
5655        } elsif ($self->{state} == MD_ELEMENT_STATE) {
5656          if ($self->{nc} == [
5657               undef,
5658               undef,
5659               0x0045, # E
5660               0x004D, # M
5661               0x0045, # E
5662               0x004E, # N
5663              ]->[length $self->{kwd}] or
5664              $self->{nc} == [
5665               undef,
5666               undef,
5667               0x0065, # e
5668               0x006D, # m
5669               0x0065, # e
5670               0x006E, # n
5671              ]->[length $self->{kwd}]) {
5672            ## Stay in the state.
5673            $self->{kwd} .= chr $self->{nc};
5674            
5675        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5676          $self->{line_prev} = $self->{line};
5677          $self->{column_prev} = $self->{column};
5678          $self->{column}++;
5679          $self->{nc}
5680              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5681        } else {
5682          $self->{set_nc}->($self);
5683        }
5684      
5685            redo A;
5686          } elsif ((length $self->{kwd}) == 6 and
5687                   ($self->{nc} == 0x0054 or # T
5688                    $self->{nc} == 0x0074)) { # t
5689            if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5690              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5691                              text => 'ELEMENT',
5692                              line => $self->{line_prev},
5693                              column => $self->{column_prev} - 5);
5694            }
5695            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5696                           line => $self->{line_prev},
5697                           column => $self->{column_prev} - 6};
5698            $self->{state} = DOCTYPE_MD_STATE;
5699            
5700        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5701          $self->{line_prev} = $self->{line};
5702          $self->{column_prev} = $self->{column};
5703          $self->{column}++;
5704          $self->{nc}
5705              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5706        } else {
5707          $self->{set_nc}->($self);
5708        }
5709      
5710            redo A;
5711          } else {
5712            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5713                            line => $self->{line_prev},
5714                            column => $self->{column_prev} - 1
5715                                - (length $self->{kwd})
5716                                + 1 * ($self->{nc} == -1));
5717            $self->{state} = BOGUS_COMMENT_STATE;
5718            ## Reconsume.
5719            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5720            redo A;
5721          }
5722        } elsif ($self->{state} == MD_ATTLIST_STATE) {
5723          if ($self->{nc} == [
5724               undef,
5725               0x0054, # T
5726               0x0054, # T
5727               0x004C, # L
5728               0x0049, # I
5729               0x0053, # S
5730              ]->[length $self->{kwd}] or
5731              $self->{nc} == [
5732               undef,
5733               0x0074, # t
5734               0x0074, # t
5735               0x006C, # l
5736               0x0069, # i
5737               0x0073, # s
5738              ]->[length $self->{kwd}]) {
5739            ## Stay in the state.
5740            $self->{kwd} .= chr $self->{nc};
5741            
5742        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5743          $self->{line_prev} = $self->{line};
5744          $self->{column_prev} = $self->{column};
5745          $self->{column}++;
5746          $self->{nc}
5747              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5748        } else {
5749          $self->{set_nc}->($self);
5750        }
5751      
5752            redo A;
5753          } elsif ((length $self->{kwd}) == 6 and
5754                   ($self->{nc} == 0x0054 or # T
5755                    $self->{nc} == 0x0074)) { # t
5756            if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
5757              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5758                              text => 'ATTLIST',
5759                              line => $self->{line_prev},
5760                              column => $self->{column_prev} - 5);
5761            }
5762            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
5763                           attrdefs => [],
5764                           line => $self->{line_prev},
5765                           column => $self->{column_prev} - 6};
5766            $self->{state} = DOCTYPE_MD_STATE;
5767            
5768        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5769          $self->{line_prev} = $self->{line};
5770          $self->{column_prev} = $self->{column};
5771          $self->{column}++;
5772          $self->{nc}
5773              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5774        } else {
5775          $self->{set_nc}->($self);
5776        }
5777      
5778            redo A;
5779          } else {
5780            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5781                            line => $self->{line_prev},
5782                            column => $self->{column_prev} - 1
5783                                 - (length $self->{kwd})
5784                                 + 1 * ($self->{nc} == -1));
5785            $self->{state} = BOGUS_COMMENT_STATE;
5786            ## Reconsume.
5787            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5788            redo A;
5789          }
5790        } elsif ($self->{state} == MD_NOTATION_STATE) {
5791          if ($self->{nc} == [
5792               undef,
5793               0x004F, # O
5794               0x0054, # T
5795               0x0041, # A
5796               0x0054, # T
5797               0x0049, # I
5798               0x004F, # O
5799              ]->[length $self->{kwd}] or
5800              $self->{nc} == [
5801               undef,
5802               0x006F, # o
5803               0x0074, # t
5804               0x0061, # a
5805               0x0074, # t
5806               0x0069, # i
5807               0x006F, # o
5808              ]->[length $self->{kwd}]) {
5809            ## Stay in the state.
5810            $self->{kwd} .= chr $self->{nc};
5811            
5812        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5813          $self->{line_prev} = $self->{line};
5814          $self->{column_prev} = $self->{column};
5815          $self->{column}++;
5816          $self->{nc}
5817              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5818        } else {
5819          $self->{set_nc}->($self);
5820        }
5821      
5822            redo A;
5823          } elsif ((length $self->{kwd}) == 7 and
5824                   ($self->{nc} == 0x004E or # N
5825                    $self->{nc} == 0x006E)) { # n
5826            if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
5827              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5828                              text => 'NOTATION',
5829                              line => $self->{line_prev},
5830                              column => $self->{column_prev} - 6);
5831            }
5832            $self->{ct} = {type => NOTATION_TOKEN, name => '',
5833                           line => $self->{line_prev},
5834                           column => $self->{column_prev} - 6};
5835            $self->{state} = DOCTYPE_MD_STATE;
5836            
5837        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5838          $self->{line_prev} = $self->{line};
5839          $self->{column_prev} = $self->{column};
5840          $self->{column}++;
5841          $self->{nc}
5842              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5843        } else {
5844          $self->{set_nc}->($self);
5845        }
5846      
5847            redo A;
5848          } else {
5849            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5850                            line => $self->{line_prev},
5851                            column => $self->{column_prev} - 1
5852                                - (length $self->{kwd})
5853                                + 1 * ($self->{nc} == -1));
5854            $self->{state} = BOGUS_COMMENT_STATE;
5855            ## Reconsume.
5856            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5857            redo A;
5858          }
5859        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
5860          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
5861          ## "DOCTYPE NOTATION state".
5862    
5863          if ($is_space->{$self->{nc}}) {
5864            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
5865            $self->{state} = BEFORE_MD_NAME_STATE;
5866            
5867        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5868          $self->{line_prev} = $self->{line};
5869          $self->{column_prev} = $self->{column};
5870          $self->{column}++;
5871          $self->{nc}
5872              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5873        } else {
5874          $self->{set_nc}->($self);
5875        }
5876      
5877            redo A;
5878          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
5879                   $self->{nc} == 0x0025) { # %
5880            ## XML5: Switch to the "DOCTYPE bogus comment state".
5881            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
5882            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
5883            
5884        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5885          $self->{line_prev} = $self->{line};
5886          $self->{column_prev} = $self->{column};
5887          $self->{column}++;
5888          $self->{nc}
5889              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5890        } else {
5891          $self->{set_nc}->($self);
5892        }
5893      
5894            redo A;
5895          } elsif ($self->{nc} == -1) {
5896            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
5897            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
5898            ## Reconsume.
5899            redo A;
5900          } elsif ($self->{nc} == 0x003E) { # >
5901            ## XML5: Switch to the "DOCTYPE bogus comment state".
5902            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
5903            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5904            
5905        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5906          $self->{line_prev} = $self->{line};
5907          $self->{column_prev} = $self->{column};
5908          $self->{column}++;
5909          $self->{nc}
5910              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5911        } else {
5912          $self->{set_nc}->($self);
5913        }
5914      
5915            redo A;
5916          } else {
5917            ## XML5: Switch to the "DOCTYPE bogus comment state".
5918            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
5919            $self->{state} = BEFORE_MD_NAME_STATE;
5920            redo A;
5921          }
5922        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
5923          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
5924          ## before state", "DOCTYPE ATTLIST name before state".
5925    
5926          if ($is_space->{$self->{nc}}) {
5927            ## Stay in the state.
5928            
5929        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5930          $self->{line_prev} = $self->{line};
5931          $self->{column_prev} = $self->{column};
5932          $self->{column}++;
5933          $self->{nc}
5934              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5935        } else {
5936          $self->{set_nc}->($self);
5937        }
5938      
5939            redo A;
5940          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
5941                   $self->{nc} == 0x0025) { # %
5942            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
5943            
5944        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5945          $self->{line_prev} = $self->{line};
5946          $self->{column_prev} = $self->{column};
5947          $self->{column}++;
5948          $self->{nc}
5949              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5950        } else {
5951          $self->{set_nc}->($self);
5952        }
5953      
5954            redo A;
5955          } elsif ($self->{nc} == 0x003E) { # >
5956            ## XML5: Same as "Anything else".
5957            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
5958            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5959            
5960        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5961          $self->{line_prev} = $self->{line};
5962          $self->{column_prev} = $self->{column};
5963          $self->{column}++;
5964          $self->{nc}
5965              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5966        } else {
5967          $self->{set_nc}->($self);
5968        }
5969      
5970            redo A;
5971          } elsif ($self->{nc} == -1) {
5972            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
5973            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
5974            ## Reconsume.
5975            redo A;
5976          } else {
5977            ## XML5: [ATTLIST] Not defined yet.
5978            $self->{ct}->{name} .= chr $self->{nc};
5979            $self->{state} = MD_NAME_STATE;
5980            
5981        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5982          $self->{line_prev} = $self->{line};
5983          $self->{column_prev} = $self->{column};
5984          $self->{column}++;
5985          $self->{nc}
5986              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5987        } else {
5988          $self->{set_nc}->($self);
5989        }
5990      
5991            redo A;
5992          }
5993        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
5994          if ($is_space->{$self->{nc}}) {
5995            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
5996            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
5997            $self->{state} = BEFORE_MD_NAME_STATE;
5998            
5999        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6000          $self->{line_prev} = $self->{line};
6001          $self->{column_prev} = $self->{column};
6002          $self->{column}++;
6003          $self->{nc}
6004              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6005        } else {
6006          $self->{set_nc}->($self);
6007        }
6008      
6009            redo A;
6010          } elsif ($self->{nc} == 0x003E) { # >
6011            ## XML5: Same as "Anything else".
6012            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6013            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6014            
6015        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6016          $self->{line_prev} = $self->{line};
6017          $self->{column_prev} = $self->{column};
6018          $self->{column}++;
6019          $self->{nc}
6020              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6021        } else {
6022          $self->{set_nc}->($self);
6023        }
6024      
6025            redo A;
6026          } elsif ($self->{nc} == -1) {
6027            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6028            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6029            ## Reconsume.
6030            redo A;
6031          } else {
6032            ## XML5: No parse error.
6033            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6034            $self->{state} = BOGUS_COMMENT_STATE;
6035            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6036            ## Reconsume.
6037            redo A;
6038          }
6039        } elsif ($self->{state} == MD_NAME_STATE) {
6040          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6041          
6042          if ($is_space->{$self->{nc}}) {
6043            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6044              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6045            } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6046              ## TODO: ...
6047              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6048            } else { # ENTITY/NOTATION
6049              $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6050            }
6051            
6052        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6053          $self->{line_prev} = $self->{line};
6054          $self->{column_prev} = $self->{column};
6055          $self->{column}++;
6056          $self->{nc}
6057              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6058        } else {
6059          $self->{set_nc}->($self);
6060        }
6061      
6062            redo A;
6063          } elsif ($self->{nc} == 0x003E) { # >
6064            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6065              #
6066            } else {
6067              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6068            }
6069            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6070            
6071        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6072          $self->{line_prev} = $self->{line};
6073          $self->{column_prev} = $self->{column};
6074          $self->{column}++;
6075          $self->{nc}
6076              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6077        } else {
6078          $self->{set_nc}->($self);
6079        }
6080      
6081            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6082            redo A;
6083          } elsif ($self->{nc} == -1) {
6084            ## XML5: [ATTLIST] No parse error.
6085            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6086            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6087            ## Reconsume.
6088            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6089            redo A;
6090          } else {
6091            ## XML5: [ATTLIST] Not defined yet.
6092            $self->{ct}->{name} .= chr $self->{nc};
6093            ## Stay in the state.
6094            
6095        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6096          $self->{line_prev} = $self->{line};
6097          $self->{column_prev} = $self->{column};
6098          $self->{column}++;
6099          $self->{nc}
6100              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6101        } else {
6102          $self->{set_nc}->($self);
6103        }
6104      
6105            redo A;
6106          }
6107        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6108          if ($is_space->{$self->{nc}}) {
6109            ## Stay in the state.
6110            
6111        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6112          $self->{line_prev} = $self->{line};
6113          $self->{column_prev} = $self->{column};
6114          $self->{column}++;
6115          $self->{nc}
6116              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6117        } else {
6118          $self->{set_nc}->($self);
6119        }
6120      
6121            redo A;
6122          } elsif ($self->{nc} == 0x003E) { # >
6123            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6124            
6125        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6126          $self->{line_prev} = $self->{line};
6127          $self->{column_prev} = $self->{column};
6128          $self->{column}++;
6129          $self->{nc}
6130              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6131        } else {
6132          $self->{set_nc}->($self);
6133        }
6134      
6135            return  ($self->{ct}); # ATTLIST
6136            redo A;
6137          } elsif ($self->{nc} == -1) {
6138            ## XML5: No parse error.
6139            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6140            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6141            return  ($self->{ct});
6142            redo A;
6143          } else {
6144            ## XML5: Not defined yet.
6145            $self->{ca} = {name => chr ($self->{nc}), # attrdef
6146                           tokens => [],
6147                           line => $self->{line}, column => $self->{column}};
6148            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6149            
6150        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6151          $self->{line_prev} = $self->{line};
6152          $self->{column_prev} = $self->{column};
6153          $self->{column}++;
6154          $self->{nc}
6155              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6156        } else {
6157          $self->{set_nc}->($self);
6158        }
6159      
6160            redo A;
6161          }
6162        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6163          if ($is_space->{$self->{nc}}) {
6164            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6165            
6166        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6167          $self->{line_prev} = $self->{line};
6168          $self->{column_prev} = $self->{column};
6169          $self->{column}++;
6170          $self->{nc}
6171              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6172        } else {
6173          $self->{set_nc}->($self);
6174        }
6175      
6176            redo A;
6177          } elsif ($self->{nc} == 0x003E) { # >
6178            ## XML5: Same as "anything else".
6179            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6180            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6181            
6182        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6183          $self->{line_prev} = $self->{line};
6184          $self->{column_prev} = $self->{column};
6185          $self->{column}++;
6186          $self->{nc}
6187              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6188        } else {
6189          $self->{set_nc}->($self);
6190        }
6191      
6192            return  ($self->{ct}); # ATTLIST
6193            redo A;
6194          } elsif ($self->{nc} == 0x0028) { # (
6195            ## XML5: Same as "anything else".
6196            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6197            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6198            
6199        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6200          $self->{line_prev} = $self->{line};
6201          $self->{column_prev} = $self->{column};
6202          $self->{column}++;
6203          $self->{nc}
6204              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6205        } else {
6206          $self->{set_nc}->($self);
6207        }
6208      
6209            redo A;
6210          } elsif ($self->{nc} == -1) {
6211            ## XML5: No parse error.
6212            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6213            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6214            
6215        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6216          $self->{line_prev} = $self->{line};
6217          $self->{column_prev} = $self->{column};
6218          $self->{column}++;
6219          $self->{nc}
6220              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6221        } else {
6222          $self->{set_nc}->($self);
6223        }
6224      
6225            return  ($self->{ct}); # ATTLIST
6226            redo A;
6227          } else {
6228            ## XML5: Not defined yet.
6229            $self->{ca}->{name} .= chr $self->{nc};
6230            ## Stay in the state.
6231            
6232        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6233          $self->{line_prev} = $self->{line};
6234          $self->{column_prev} = $self->{column};
6235          $self->{column}++;
6236          $self->{nc}
6237              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6238        } else {
6239          $self->{set_nc}->($self);
6240        }
6241      
6242            redo A;
6243          }
6244        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6245          if ($is_space->{$self->{nc}}) {
6246            ## Stay in the state.
6247            
6248        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6249          $self->{line_prev} = $self->{line};
6250          $self->{column_prev} = $self->{column};
6251          $self->{column}++;
6252          $self->{nc}
6253              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6254        } else {
6255          $self->{set_nc}->($self);
6256        }
6257      
6258            redo A;
6259          } elsif ($self->{nc} == 0x003E) { # >
6260            ## XML5: Same as "anything else".
6261            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6262            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6263            
6264        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6265          $self->{line_prev} = $self->{line};
6266          $self->{column_prev} = $self->{column};
6267          $self->{column}++;
6268          $self->{nc}
6269              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6270        } else {
6271          $self->{set_nc}->($self);
6272        }
6273      
6274            return  ($self->{ct}); # ATTLIST
6275            redo A;
6276          } elsif ($self->{nc} == 0x0028) { # (
6277            ## XML5: Same as "anything else".
6278            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6279            
6280        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6281          $self->{line_prev} = $self->{line};
6282          $self->{column_prev} = $self->{column};
6283          $self->{column}++;
6284          $self->{nc}
6285              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6286        } else {
6287          $self->{set_nc}->($self);
6288        }
6289      
6290            redo A;
6291          } elsif ($self->{nc} == -1) {
6292            ## XML5: No parse error.
6293            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6294            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6295            
6296        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6297          $self->{line_prev} = $self->{line};
6298          $self->{column_prev} = $self->{column};
6299          $self->{column}++;
6300          $self->{nc}
6301              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6302        } else {
6303          $self->{set_nc}->($self);
6304        }
6305      
6306            return  ($self->{ct});
6307            redo A;
6308          } else {
6309            ## XML5: Not defined yet.
6310            $self->{ca}->{type} = chr $self->{nc};
6311            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6312            
6313        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6314          $self->{line_prev} = $self->{line};
6315          $self->{column_prev} = $self->{column};
6316          $self->{column}++;
6317          $self->{nc}
6318              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6319        } else {
6320          $self->{set_nc}->($self);
6321        }
6322      
6323            redo A;
6324          }
6325        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6326          if ($is_space->{$self->{nc}}) {
6327            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6328            
6329        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6330          $self->{line_prev} = $self->{line};
6331          $self->{column_prev} = $self->{column};
6332          $self->{column}++;
6333          $self->{nc}
6334              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6335        } else {
6336          $self->{set_nc}->($self);
6337        }
6338      
6339            redo A;
6340          } elsif ($self->{nc} == 0x0023) { # #
6341            ## XML5: Same as "anything else".
6342            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6343            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6344            
6345        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6346          $self->{line_prev} = $self->{line};
6347          $self->{column_prev} = $self->{column};
6348          $self->{column}++;
6349          $self->{nc}
6350              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6351        } else {
6352          $self->{set_nc}->($self);
6353        }
6354      
6355            redo A;
6356          } elsif ($self->{nc} == 0x0022) { # "
6357            ## XML5: Same as "anything else".
6358            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6359            $self->{ca}->{value} = '';
6360            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6361            
6362        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6363          $self->{line_prev} = $self->{line};
6364          $self->{column_prev} = $self->{column};
6365          $self->{column}++;
6366          $self->{nc}
6367              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6368        } else {
6369          $self->{set_nc}->($self);
6370        }
6371      
6372            redo A;
6373          } elsif ($self->{nc} == 0x0027) { # '
6374            ## XML5: Same as "anything else".
6375            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6376            $self->{ca}->{value} = '';
6377            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6378            
6379        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6380          $self->{line_prev} = $self->{line};
6381          $self->{column_prev} = $self->{column};
6382          $self->{column}++;
6383          $self->{nc}
6384              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6385        } else {
6386          $self->{set_nc}->($self);
6387        }
6388      
6389            redo A;
6390          } elsif ($self->{nc} == 0x003E) { # >
6391            ## XML5: Same as "anything else".
6392            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6393            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6394            
6395        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6396          $self->{line_prev} = $self->{line};
6397          $self->{column_prev} = $self->{column};
6398          $self->{column}++;
6399          $self->{nc}
6400              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6401        } else {
6402          $self->{set_nc}->($self);
6403        }
6404      
6405            return  ($self->{ct}); # ATTLIST
6406            redo A;
6407          } elsif ($self->{nc} == 0x0028) { # (
6408            ## XML5: Same as "anything else".
6409            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6410            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6411            
6412        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6413          $self->{line_prev} = $self->{line};
6414          $self->{column_prev} = $self->{column};
6415          $self->{column}++;
6416          $self->{nc}
6417              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6418        } else {
6419          $self->{set_nc}->($self);
6420        }
6421      
6422            redo A;
6423          } elsif ($self->{nc} == -1) {
6424            ## XML5: No parse error.
6425            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6426            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6427            
6428        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6429          $self->{line_prev} = $self->{line};
6430          $self->{column_prev} = $self->{column};
6431          $self->{column}++;
6432          $self->{nc}
6433              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6434        } else {
6435          $self->{set_nc}->($self);
6436        }
6437      
6438            return  ($self->{ct});
6439            redo A;
6440          } else {
6441            ## XML5: Not defined yet.
6442            $self->{ca}->{type} .= chr $self->{nc};
6443            ## Stay in the state.
6444            
6445        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6446          $self->{line_prev} = $self->{line};
6447          $self->{column_prev} = $self->{column};
6448          $self->{column}++;
6449          $self->{nc}
6450              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6451        } else {
6452          $self->{set_nc}->($self);
6453        }
6454      
6455            redo A;
6456          }
6457        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6458          if ($is_space->{$self->{nc}}) {
6459            ## Stay in the state.
6460            
6461        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6462          $self->{line_prev} = $self->{line};
6463          $self->{column_prev} = $self->{column};
6464          $self->{column}++;
6465          $self->{nc}
6466              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6467        } else {
6468          $self->{set_nc}->($self);
6469        }
6470      
6471            redo A;
6472          } elsif ($self->{nc} == 0x0028) { # (
6473            ## XML5: Same as "anything else".
6474            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6475            
6476        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6477          $self->{line_prev} = $self->{line};
6478          $self->{column_prev} = $self->{column};
6479          $self->{column}++;
6480          $self->{nc}
6481              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6482        } else {
6483          $self->{set_nc}->($self);
6484        }
6485      
6486            redo A;
6487          } elsif ($self->{nc} == 0x0023) { # #
6488            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6489            
6490        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6491          $self->{line_prev} = $self->{line};
6492          $self->{column_prev} = $self->{column};
6493          $self->{column}++;
6494          $self->{nc}
6495              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6496        } else {
6497          $self->{set_nc}->($self);
6498        }
6499      
6500            redo A;
6501          } elsif ($self->{nc} == 0x0022) { # "
6502            ## XML5: Same as "anything else".
6503            $self->{ca}->{value} = '';
6504            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6505            
6506        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6507          $self->{line_prev} = $self->{line};
6508          $self->{column_prev} = $self->{column};
6509          $self->{column}++;
6510          $self->{nc}
6511              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6512        } else {
6513          $self->{set_nc}->($self);
6514        }
6515      
6516            redo A;
6517          } elsif ($self->{nc} == 0x0027) { # '
6518            ## XML5: Same as "anything else".
6519            $self->{ca}->{value} = '';
6520            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6521            
6522        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6523          $self->{line_prev} = $self->{line};
6524          $self->{column_prev} = $self->{column};
6525          $self->{column}++;
6526          $self->{nc}
6527              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6528        } else {
6529          $self->{set_nc}->($self);
6530        }
6531      
6532            redo A;
6533          } elsif ($self->{nc} == 0x003E) { # >
6534            ## XML5: Same as "anything else".
6535            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6536            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6537            
6538        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6539          $self->{line_prev} = $self->{line};
6540          $self->{column_prev} = $self->{column};
6541          $self->{column}++;
6542          $self->{nc}
6543              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6544        } else {
6545          $self->{set_nc}->($self);
6546        }
6547      
6548            return  ($self->{ct}); # ATTLIST
6549            redo A;
6550          } elsif ($self->{nc} == -1) {
6551            ## XML5: No parse error.
6552            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6553            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6554            
6555        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6556          $self->{line_prev} = $self->{line};
6557          $self->{column_prev} = $self->{column};
6558          $self->{column}++;
6559          $self->{nc}
6560              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6561        } else {
6562          $self->{set_nc}->($self);
6563        }
6564      
6565            return  ($self->{ct});
6566            redo A;
6567          } else {
6568            ## XML5: Switch to the "DOCTYPE bogus comment state".
6569            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6570            $self->{ca}->{value} = '';
6571            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6572          ## Reconsume.
6573          redo A;          redo A;
6574        }        }
6575        } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6576          if ($is_space->{$self->{nc}}) {
6577            ## Stay in the state.
6578            
6579        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6580          $self->{line_prev} = $self->{line};
6581          $self->{column_prev} = $self->{column};
6582          $self->{column}++;
6583          $self->{nc}
6584              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6585        } else {
6586          $self->{set_nc}->($self);
6587        }
6588      
6589            redo A;
6590          } elsif ($self->{nc} == 0x007C) { # |
6591            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6592            ## Stay in the state.
6593            
6594        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6595          $self->{line_prev} = $self->{line};
6596          $self->{column_prev} = $self->{column};
6597          $self->{column}++;
6598          $self->{nc}
6599              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6600        } else {
6601          $self->{set_nc}->($self);
6602        }
6603      
6604            redo A;
6605          } elsif ($self->{nc} == 0x0029) { # )
6606            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6607            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6608            
6609        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6610          $self->{line_prev} = $self->{line};
6611          $self->{column_prev} = $self->{column};
6612          $self->{column}++;
6613          $self->{nc}
6614              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6615        } else {
6616          $self->{set_nc}->($self);
6617        }
6618      
6619            redo A;
6620          } elsif ($self->{nc} == 0x003E) { # >
6621            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6622            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6623            
6624        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6625          $self->{line_prev} = $self->{line};
6626          $self->{column_prev} = $self->{column};
6627          $self->{column}++;
6628          $self->{nc}
6629              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6630        } else {
6631          $self->{set_nc}->($self);
6632        }
6633      
6634            return  ($self->{ct}); # ATTLIST
6635            redo A;
6636          } elsif ($self->{nc} == -1) {
6637            ## XML5: No parse error.
6638            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6639            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6640            
6641        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6642          $self->{line_prev} = $self->{line};
6643          $self->{column_prev} = $self->{column};
6644          $self->{column}++;
6645          $self->{nc}
6646              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6647        } else {
6648          $self->{set_nc}->($self);
6649        }
6650      
6651            return  ($self->{ct});
6652            redo A;
6653          } else {
6654            push @{$self->{ca}->{tokens}}, chr $self->{nc};
6655            $self->{state} = ALLOWED_TOKEN_STATE;
6656            
6657        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6658          $self->{line_prev} = $self->{line};
6659          $self->{column_prev} = $self->{column};
6660          $self->{column}++;
6661          $self->{nc}
6662              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6663        } else {
6664          $self->{set_nc}->($self);
6665        }
6666      
6667            redo A;
6668          }
6669        } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6670          if ($is_space->{$self->{nc}}) {
6671            $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6672            
6673        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6674          $self->{line_prev} = $self->{line};
6675          $self->{column_prev} = $self->{column};
6676          $self->{column}++;
6677          $self->{nc}
6678              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6679        } else {
6680          $self->{set_nc}->($self);
6681        }
6682      
6683            redo A;
6684          } elsif ($self->{nc} == 0x007C) { # |
6685            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6686            
6687        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6688          $self->{line_prev} = $self->{line};
6689          $self->{column_prev} = $self->{column};
6690          $self->{column}++;
6691          $self->{nc}
6692              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6693        } else {
6694          $self->{set_nc}->($self);
6695        }
6696      
6697            redo A;
6698          } elsif ($self->{nc} == 0x0029) { # )
6699            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6700            
6701        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6702          $self->{line_prev} = $self->{line};
6703          $self->{column_prev} = $self->{column};
6704          $self->{column}++;
6705          $self->{nc}
6706              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6707        } else {
6708          $self->{set_nc}->($self);
6709        }
6710      
6711            redo A;
6712          } elsif ($self->{nc} == 0x003E) { # >
6713            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6714            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6715            
6716        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6717          $self->{line_prev} = $self->{line};
6718          $self->{column_prev} = $self->{column};
6719          $self->{column}++;
6720          $self->{nc}
6721              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6722        } else {
6723          $self->{set_nc}->($self);
6724        }
6725      
6726            return  ($self->{ct}); # ATTLIST
6727            redo A;
6728          } elsif ($self->{nc} == -1) {
6729            ## XML5: No parse error.
6730            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6731            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6732            
6733        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6734          $self->{line_prev} = $self->{line};
6735          $self->{column_prev} = $self->{column};
6736          $self->{column}++;
6737          $self->{nc}
6738              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6739        } else {
6740          $self->{set_nc}->($self);
6741        }
6742      
6743            return  ($self->{ct});
6744            redo A;
6745          } else {
6746            $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6747            ## Stay in the state.
6748            
6749        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6750          $self->{line_prev} = $self->{line};
6751          $self->{column_prev} = $self->{column};
6752          $self->{column}++;
6753          $self->{nc}
6754              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6755        } else {
6756          $self->{set_nc}->($self);
6757        }
6758      
6759            redo A;
6760          }
6761        } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
6762          if ($is_space->{$self->{nc}}) {
6763            ## Stay in the state.
6764            
6765        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6766          $self->{line_prev} = $self->{line};
6767          $self->{column_prev} = $self->{column};
6768          $self->{column}++;
6769          $self->{nc}
6770              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6771        } else {
6772          $self->{set_nc}->($self);
6773        }
6774      
6775            redo A;
6776          } elsif ($self->{nc} == 0x007C) { # |
6777            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6778            
6779        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6780          $self->{line_prev} = $self->{line};
6781          $self->{column_prev} = $self->{column};
6782          $self->{column}++;
6783          $self->{nc}
6784              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6785        } else {
6786          $self->{set_nc}->($self);
6787        }
6788      
6789            redo A;
6790          } elsif ($self->{nc} == 0x0029) { # )
6791            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6792            
6793        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6794          $self->{line_prev} = $self->{line};
6795          $self->{column_prev} = $self->{column};
6796          $self->{column}++;
6797          $self->{nc}
6798              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6799        } else {
6800          $self->{set_nc}->($self);
6801        }
6802      
6803            redo A;
6804          } elsif ($self->{nc} == 0x003E) { # >
6805            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6806            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6807            
6808        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6809          $self->{line_prev} = $self->{line};
6810          $self->{column_prev} = $self->{column};
6811          $self->{column}++;
6812          $self->{nc}
6813              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6814        } else {
6815          $self->{set_nc}->($self);
6816        }
6817      
6818            return  ($self->{ct}); # ATTLIST
6819            redo A;
6820          } elsif ($self->{nc} == -1) {
6821            ## XML5: No parse error.
6822            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6823            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6824            
6825        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6826          $self->{line_prev} = $self->{line};
6827          $self->{column_prev} = $self->{column};
6828          $self->{column}++;
6829          $self->{nc}
6830              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6831        } else {
6832          $self->{set_nc}->($self);
6833        }
6834      
6835            return  ($self->{ct});
6836            redo A;
6837          } else {
6838            $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
6839                            line => $self->{line_prev},
6840                            column => $self->{column_prev});
6841            $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
6842            $self->{state} = ALLOWED_TOKEN_STATE;
6843            
6844        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6845          $self->{line_prev} = $self->{line};
6846          $self->{column_prev} = $self->{column};
6847          $self->{column}++;
6848          $self->{nc}
6849              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6850        } else {
6851          $self->{set_nc}->($self);
6852        }
6853      
6854            redo A;
6855          }
6856        } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
6857          if ($is_space->{$self->{nc}}) {
6858            $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
6859            
6860        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6861          $self->{line_prev} = $self->{line};
6862          $self->{column_prev} = $self->{column};
6863          $self->{column}++;
6864          $self->{nc}
6865              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6866        } else {
6867          $self->{set_nc}->($self);
6868        }
6869      
6870            redo A;
6871          } elsif ($self->{nc} == 0x0023) { # #
6872            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6873            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6874            
6875        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6876          $self->{line_prev} = $self->{line};
6877          $self->{column_prev} = $self->{column};
6878          $self->{column}++;
6879          $self->{nc}
6880              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6881        } else {
6882          $self->{set_nc}->($self);
6883        }
6884      
6885            redo A;
6886          } elsif ($self->{nc} == 0x0022) { # "
6887            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6888            $self->{ca}->{value} = '';
6889            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6890            
6891        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6892          $self->{line_prev} = $self->{line};
6893          $self->{column_prev} = $self->{column};
6894          $self->{column}++;
6895          $self->{nc}
6896              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6897        } else {
6898          $self->{set_nc}->($self);
6899        }
6900      
6901            redo A;
6902          } elsif ($self->{nc} == 0x0027) { # '
6903            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6904            $self->{ca}->{value} = '';
6905            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6906            
6907        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6908          $self->{line_prev} = $self->{line};
6909          $self->{column_prev} = $self->{column};
6910          $self->{column}++;
6911          $self->{nc}
6912              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6913        } else {
6914          $self->{set_nc}->($self);
6915        }
6916      
6917            redo A;
6918          } elsif ($self->{nc} == 0x003E) { # >
6919            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6920            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6921            
6922        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6923          $self->{line_prev} = $self->{line};
6924          $self->{column_prev} = $self->{column};
6925          $self->{column}++;
6926          $self->{nc}
6927              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6928        } else {
6929          $self->{set_nc}->($self);
6930        }
6931      
6932            return  ($self->{ct}); # ATTLIST
6933            redo A;
6934          } elsif ($self->{nc} == -1) {
6935            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6936            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6937            
6938        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6939          $self->{line_prev} = $self->{line};
6940          $self->{column_prev} = $self->{column};
6941          $self->{column}++;
6942          $self->{nc}
6943              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6944        } else {
6945          $self->{set_nc}->($self);
6946        }
6947      
6948            return  ($self->{ct});
6949            redo A;
6950          } else {
6951            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6952            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6953            ## Reconsume.
6954            redo A;
6955          }
6956        } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
6957          if ($is_space->{$self->{nc}}) {
6958            ## Stay in the state.
6959            
6960        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6961          $self->{line_prev} = $self->{line};
6962          $self->{column_prev} = $self->{column};
6963          $self->{column}++;
6964          $self->{nc}
6965              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6966        } else {
6967          $self->{set_nc}->($self);
6968        }
6969      
6970            redo A;
6971          } elsif ($self->{nc} == 0x0023) { # #
6972            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6973            
6974        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6975          $self->{line_prev} = $self->{line};
6976          $self->{column_prev} = $self->{column};
6977          $self->{column}++;
6978          $self->{nc}
6979              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6980        } else {
6981          $self->{set_nc}->($self);
6982        }
6983      
6984            redo A;
6985          } elsif ($self->{nc} == 0x0022) { # "
6986            $self->{ca}->{value} = '';
6987            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6988            
6989        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6990          $self->{line_prev} = $self->{line};
6991          $self->{column_prev} = $self->{column};
6992          $self->{column}++;
6993          $self->{nc}
6994              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6995        } else {
6996          $self->{set_nc}->($self);
6997        }
6998      
6999            redo A;
7000          } elsif ($self->{nc} == 0x0027) { # '
7001            $self->{ca}->{value} = '';
7002            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7003            
7004        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7005          $self->{line_prev} = $self->{line};
7006          $self->{column_prev} = $self->{column};
7007          $self->{column}++;
7008          $self->{nc}
7009              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7010        } else {
7011          $self->{set_nc}->($self);
7012        }
7013      
7014            redo A;
7015          } elsif ($self->{nc} == 0x003E) { # >
7016            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7017            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7018            
7019        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7020          $self->{line_prev} = $self->{line};
7021          $self->{column_prev} = $self->{column};
7022          $self->{column}++;
7023          $self->{nc}
7024              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7025        } else {
7026          $self->{set_nc}->($self);
7027        }
7028      
7029            return  ($self->{ct}); # ATTLIST
7030            redo A;
7031          } elsif ($self->{nc} == -1) {
7032            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7033            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7034            
7035        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7036          $self->{line_prev} = $self->{line};
7037          $self->{column_prev} = $self->{column};
7038          $self->{column}++;
7039          $self->{nc}
7040              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7041        } else {
7042          $self->{set_nc}->($self);
7043        }
7044      
7045            return  ($self->{ct});
7046            redo A;
7047          } else {
7048            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7049            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7050            ## Reconsume.
7051            redo A;
7052          }
7053        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7054          if ($is_space->{$self->{nc}}) {
7055            ## XML5: No parse error.
7056            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7057            $self->{state} = BOGUS_MD_STATE;
7058            ## Reconsume.
7059            redo A;
7060          } elsif ($self->{nc} == 0x0022) { # "
7061            ## XML5: Same as "anything else".
7062            $self->{ca}->{value} = '';
7063            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7064            
7065        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7066          $self->{line_prev} = $self->{line};
7067          $self->{column_prev} = $self->{column};
7068          $self->{column}++;
7069          $self->{nc}
7070              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7071        } else {
7072          $self->{set_nc}->($self);
7073        }
7074      
7075            redo A;
7076          } elsif ($self->{nc} == 0x0027) { # '
7077            ## XML5: Same as "anything else".
7078            $self->{ca}->{value} = '';
7079            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7080            
7081        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7082          $self->{line_prev} = $self->{line};
7083          $self->{column_prev} = $self->{column};
7084          $self->{column}++;
7085          $self->{nc}
7086              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7087        } else {
7088          $self->{set_nc}->($self);
7089        }
7090      
7091            redo A;
7092          } elsif ($self->{nc} == 0x003E) { # >
7093            ## XML5: Same as "anything else".
7094            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7095            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7096            
7097        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7098          $self->{line_prev} = $self->{line};
7099          $self->{column_prev} = $self->{column};
7100          $self->{column}++;
7101          $self->{nc}
7102              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7103        } else {
7104          $self->{set_nc}->($self);
7105        }
7106      
7107            return  ($self->{ct}); # ATTLIST
7108            redo A;
7109          } elsif ($self->{nc} == -1) {
7110            ## XML5: No parse error.
7111            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7112            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7113            
7114        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7115          $self->{line_prev} = $self->{line};
7116          $self->{column_prev} = $self->{column};
7117          $self->{column}++;
7118          $self->{nc}
7119              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7120        } else {
7121          $self->{set_nc}->($self);
7122        }
7123      
7124            return  ($self->{ct});
7125            redo A;
7126          } else {
7127            $self->{ca}->{default} = chr $self->{nc};
7128            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7129            
7130        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7131          $self->{line_prev} = $self->{line};
7132          $self->{column_prev} = $self->{column};
7133          $self->{column}++;
7134          $self->{nc}
7135              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7136        } else {
7137          $self->{set_nc}->($self);
7138        }
7139      
7140            redo A;
7141          }
7142        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7143          if ($is_space->{$self->{nc}}) {
7144            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7145            
7146        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7147          $self->{line_prev} = $self->{line};
7148          $self->{column_prev} = $self->{column};
7149          $self->{column}++;
7150          $self->{nc}
7151              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7152        } else {
7153          $self->{set_nc}->($self);
7154        }
7155      
7156            redo A;
7157          } elsif ($self->{nc} == 0x0022) { # "
7158            ## XML5: Same as "anything else".
7159            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7160            $self->{ca}->{value} = '';
7161            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7162            
7163        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7164          $self->{line_prev} = $self->{line};
7165          $self->{column_prev} = $self->{column};
7166          $self->{column}++;
7167          $self->{nc}
7168              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7169        } else {
7170          $self->{set_nc}->($self);
7171        }
7172      
7173            redo A;
7174          } elsif ($self->{nc} == 0x0027) { # '
7175            ## XML5: Same as "anything else".
7176            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7177            $self->{ca}->{value} = '';
7178            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7179            
7180        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7181          $self->{line_prev} = $self->{line};
7182          $self->{column_prev} = $self->{column};
7183          $self->{column}++;
7184          $self->{nc}
7185              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7186        } else {
7187          $self->{set_nc}->($self);
7188        }
7189      
7190            redo A;
7191          } elsif ($self->{nc} == 0x003E) { # >
7192            ## XML5: Same as "anything else".
7193            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7194            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7195            
7196        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7197          $self->{line_prev} = $self->{line};
7198          $self->{column_prev} = $self->{column};
7199          $self->{column}++;
7200          $self->{nc}
7201              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7202        } else {
7203          $self->{set_nc}->($self);
7204        }
7205      
7206            return  ($self->{ct}); # ATTLIST
7207            redo A;
7208          } elsif ($self->{nc} == -1) {
7209            ## XML5: No parse error.
7210            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7211            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7212            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7213            
7214        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7215          $self->{line_prev} = $self->{line};
7216          $self->{column_prev} = $self->{column};
7217          $self->{column}++;
7218          $self->{nc}
7219              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7220        } else {
7221          $self->{set_nc}->($self);
7222        }
7223      
7224            return  ($self->{ct});
7225            redo A;
7226          } else {
7227            $self->{ca}->{default} .= chr $self->{nc};
7228            ## Stay in the state.
7229            
7230        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7231          $self->{line_prev} = $self->{line};
7232          $self->{column_prev} = $self->{column};
7233          $self->{column}++;
7234          $self->{nc}
7235              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7236        } else {
7237          $self->{set_nc}->($self);
7238        }
7239      
7240            redo A;
7241          }
7242        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7243          if ($is_space->{$self->{nc}}) {
7244            ## Stay in the state.
7245            
7246        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7247          $self->{line_prev} = $self->{line};
7248          $self->{column_prev} = $self->{column};
7249          $self->{column}++;
7250          $self->{nc}
7251              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7252        } else {
7253          $self->{set_nc}->($self);
7254        }
7255      
7256            redo A;
7257          } elsif ($self->{nc} == 0x0022) { # "
7258            $self->{ca}->{value} = '';
7259            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7260            
7261        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7262          $self->{line_prev} = $self->{line};
7263          $self->{column_prev} = $self->{column};
7264          $self->{column}++;
7265          $self->{nc}
7266              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7267        } else {
7268          $self->{set_nc}->($self);
7269        }
7270      
7271            redo A;
7272          } elsif ($self->{nc} == 0x0027) { # '
7273            $self->{ca}->{value} = '';
7274            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7275            
7276        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7277          $self->{line_prev} = $self->{line};
7278          $self->{column_prev} = $self->{column};
7279          $self->{column}++;
7280          $self->{nc}
7281              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7282        } else {
7283          $self->{set_nc}->($self);
7284        }
7285      
7286            redo A;
7287          } elsif ($self->{nc} == 0x003E) { # >
7288            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7289            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7290            
7291        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7292          $self->{line_prev} = $self->{line};
7293          $self->{column_prev} = $self->{column};
7294          $self->{column}++;
7295          $self->{nc}
7296              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7297        } else {
7298          $self->{set_nc}->($self);
7299        }
7300      
7301            return  ($self->{ct}); # ATTLIST
7302            redo A;
7303          } elsif ($self->{nc} == -1) {
7304            ## XML5: No parse error.
7305            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7306            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7307            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7308            
7309        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7310          $self->{line_prev} = $self->{line};
7311          $self->{column_prev} = $self->{column};
7312          $self->{column}++;
7313          $self->{nc}
7314              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7315        } else {
7316          $self->{set_nc}->($self);
7317        }
7318      
7319            return  ($self->{ct});
7320            redo A;
7321          } else {
7322            ## XML5: Not defined yet.
7323            if ($self->{ca}->{default} eq 'FIXED') {
7324              $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7325            } else {
7326              push @{$self->{ct}->{attrdefs}}, $self->{ca};
7327              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7328            }
7329            ## Reconsume.
7330            redo A;
7331          }
7332        } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7333          if ($is_space->{$self->{nc}} or
7334              $self->{nc} == -1 or
7335              $self->{nc} == 0x003E) { # >
7336            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7337            ## Reconsume.
7338            redo A;
7339          } else {
7340            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7341            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7342            ## Reconsume.
7343            redo A;
7344          }
7345    
7346        } elsif ($self->{state} == BOGUS_MD_STATE) {
7347          if ($self->{nc} == 0x003E) { # >
7348            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7349            
7350        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7351          $self->{line_prev} = $self->{line};
7352          $self->{column_prev} = $self->{column};
7353          $self->{column}++;
7354          $self->{nc}
7355              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7356        } else {
7357          $self->{set_nc}->($self);
7358        }
7359      
7360            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7361            redo A;
7362          } elsif ($self->{nc} == -1) {
7363            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7364            ## Reconsume.
7365            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7366            redo A;
7367          } else {
7368            ## Stay in the state.
7369            
7370        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7371          $self->{line_prev} = $self->{line};
7372          $self->{column_prev} = $self->{column};
7373          $self->{column}++;
7374          $self->{nc}
7375              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7376        } else {
7377          $self->{set_nc}->($self);
7378        }
7379      
7380            redo A;
7381          }
7382      } else {      } else {
7383        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
7384      }      }
# Line 4152  sub _get_next_token ($) { Line 7389  sub _get_next_token ($) {
7389    
7390  1;  1;
7391  ## $Date$  ## $Date$
7392                                    

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.17

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24