/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.4 by wakaba, Tue Oct 14 11:46:57 2008 UTC revision 1.19 by wakaba, Sun Oct 19 07:19:00 2008 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24    );    );
25        
26    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 33  BEGIN {
33        CHARACTER_TOKEN        CHARACTER_TOKEN
34        PI_TOKEN        PI_TOKEN
35        ABORT_TOKEN        ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42      )],      )],
43    );    );
44  }  }
45    
46    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48  ## Token types  ## Token types
49    
50  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
52  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
53  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
54  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
55  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
56  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
57  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65    ## XML5: XML5 has "empty tag token".  In this implementation, it is
66    ## represented as a start tag token with $self->{self_closing} flag
67    ## set to true.
68    
69    ## XML5: XML5 has "short end tag token".  In this implementation, it
70    ## is represented as an end tag token with $token->{tag_name} set
71    ## to an empty string.
72    
73  package Whatpm::HTML;  package Whatpm::HTML;
74    
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 142  sub HEXREF_HEX_STATE () { 48 }
142  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
143  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145    ## XML-only states
146    sub PI_STATE () { 51 }
147    sub PI_TARGET_STATE () { 52 }
148    sub PI_TARGET_AFTER_STATE () { 53 }
149    sub PI_DATA_STATE () { 54 }
150    sub PI_AFTER_STATE () { 55 }
151    sub PI_DATA_AFTER_STATE () { 56 }
152    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155    sub DOCTYPE_TAG_STATE () { 60 }
156    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157    sub MD_ATTLIST_STATE () { 62 }
158    sub MD_E_STATE () { 63 }
159    sub MD_ELEMENT_STATE () { 64 }
160    sub MD_ENTITY_STATE () { 65 }
161    sub MD_NOTATION_STATE () { 66 }
162    sub DOCTYPE_MD_STATE () { 67 }
163    sub BEFORE_MD_NAME_STATE () { 68 }
164    sub MD_NAME_STATE () { 69 }
165    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171    sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172    sub ALLOWED_TOKEN_STATE () { 77 }
173    sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174    sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175    sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179    sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180    sub BEFORE_NDATA_STATE () { 85 }
181    sub NDATA_STATE () { 86 }
182    sub AFTER_NDATA_STATE () { 87 }
183    sub BEFORE_NOTATION_NAME_STATE () { 88 }
184    sub NOTATION_NAME_STATE () { 89 }
185    sub AFTER_NOTATION_NAME_STATE () { 90 }
186    sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 91 }
187    sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 92 }
188    sub ENTITY_VALUE_ENTITY_STATE () { 93 }
189    sub BOGUS_MD_STATE () { 94 }
190    
191  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
192  ## list and descriptions)  ## list and descriptions)
193    
# Line 178  sub _initialize_tokenizer ($) { Line 252  sub _initialize_tokenizer ($) {
252    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
253    
254    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
255    #$self->{s_kwd}; # state keyword - initialized when used    $self->{s_kwd} = ''; # Data state keyword
256      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
257    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
258    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
259    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 208  sub _initialize_tokenizer ($) { Line 283  sub _initialize_tokenizer ($) {
283    
284  ## A token has:  ## A token has:
285  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
286  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
287  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
288  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
289    ##   ->{target} (PI_TOKEN)
290  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
291  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
292  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 218  sub _initialize_tokenizer ($) { Line 294  sub _initialize_tokenizer ($) {
294  ##        ->{name}  ##        ->{name}
295  ##        ->{value}  ##        ->{value}
296  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
297  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
298    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
299    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
300    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
301    ##   ->{has_internal_subset} == 1 or 0 (DOCTYPE_TOKEN)
302    
303  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
304  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
305  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 238  my $is_space = { Line 319  my $is_space = {
319    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
320    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
321    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
322    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
323    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
324    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
325  };  };
# Line 362  sub _get_next_token ($) { Line 443  sub _get_next_token ($) {
443          }          }
444        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
445          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
446            $self->{s_kwd} .= '-';            if ($self->{s_kwd} eq '<!-') {
             
           if ($self->{s_kwd} eq '<!--') {  
447                            
448              $self->{escape} = 1; # unless $self->{escape};              $self->{escape} = 1; # unless $self->{escape};
449              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
450              #              #
451            } elsif ($self->{s_kwd} eq '---') {            } elsif ($self->{s_kwd} eq '-') {
452                            
453              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
454              #              #
455              } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
456                
457                $self->{s_kwd} .= '-';
458                #
459            } else {            } else {
460                            
461                $self->{s_kwd} = '-';
462              #              #
463            }            }
464          }          }
# Line 420  sub _get_next_token ($) { Line 504  sub _get_next_token ($) {
504            if ($self->{s_kwd} eq '--') {            if ($self->{s_kwd} eq '--') {
505                            
506              delete $self->{escape};              delete $self->{escape};
507                #
508            } else {            } else {
509                            
510                #
511            }            }
512            } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
513              
514              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
515                              line => $self->{line_prev},
516                              column => $self->{column_prev} - 1);
517              #
518          } else {          } else {
519                        
520              #
521          }          }
522                    
523          $self->{s_kwd} = '';          $self->{s_kwd} = '';
524          #          #
525          } elsif ($self->{nc} == 0x005D) { # ]
526            if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
527              
528              $self->{s_kwd} .= ']';
529            } elsif ($self->{s_kwd} eq ']]') {
530              
531              #
532            } else {
533              
534              $self->{s_kwd} = '';
535            }
536            #
537        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
538                    
539          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 446  sub _get_next_token ($) { Line 551  sub _get_next_token ($) {
551                     data => chr $self->{nc},                     data => chr $self->{nc},
552                     line => $self->{line}, column => $self->{column},                     line => $self->{line}, column => $self->{column},
553                    };                    };
554        if ($self->{read_until}->($token->{data}, q[-!<>&],        if ($self->{read_until}->($token->{data}, q{-!<>&\]},
555                                  length $token->{data})) {                                  length $token->{data})) {
556          $self->{s_kwd} = '';          $self->{s_kwd} = '';
557        }        }
558    
559        ## Stay in the data state.        ## Stay in the data state.
560        if ($self->{content_model} == PCDATA_CONTENT_MODEL) {        if (not $self->{is_xml} and
561              $self->{content_model} == PCDATA_CONTENT_MODEL) {
562                    
563          $self->{state} = PCDATA_STATE;          $self->{state} = PCDATA_STATE;
564        } else {        } else {
# Line 473  sub _get_next_token ($) { Line 579  sub _get_next_token ($) {
579        return  ($token);        return  ($token);
580        redo A;        redo A;
581      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
582          ## XML5: "tag state".
583    
584        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
585          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
586                        
# Line 491  sub _get_next_token ($) { Line 599  sub _get_next_token ($) {
599            redo A;            redo A;
600          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
601                        
602            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
603            #            #
604          } else {          } else {
605                        
606              $self->{s_kwd} = '';
607            #            #
608          }          }
609    
# Line 583  sub _get_next_token ($) { Line 692  sub _get_next_token ($) {
692                            line => $self->{line_prev},                            line => $self->{line_prev},
693                            column => $self->{column_prev});                            column => $self->{column_prev});
694            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
695              $self->{s_kwd} = '';
696                        
697      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
698        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 602  sub _get_next_token ($) { Line 712  sub _get_next_token ($) {
712    
713            redo A;            redo A;
714          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
715                        if ($self->{is_xml}) {
716            $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',              
717                            line => $self->{line_prev},              $self->{state} = PI_STATE;
718                            column => $self->{column_prev});              
719            $self->{state} = BOGUS_COMMENT_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
720            $self->{ct} = {type => COMMENT_TOKEN, data => '',        $self->{line_prev} = $self->{line};
721                                      line => $self->{line_prev},        $self->{column_prev} = $self->{column};
722                                      column => $self->{column_prev},        $self->{column}++;
723                                     };        $self->{nc}
724            ## $self->{nc} is intentionally left as is            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
725            redo A;      } else {
726          } else {        $self->{set_nc}->($self);
727        }
728      
729                redo A;
730              } else {
731                
732                $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
733                                line => $self->{line_prev},
734                                column => $self->{column_prev});
735                $self->{state} = BOGUS_COMMENT_STATE;
736                $self->{ct} = {type => COMMENT_TOKEN, data => '',
737                               line => $self->{line_prev},
738                               column => $self->{column_prev},
739                              };
740                ## $self->{nc} is intentionally left as is
741                redo A;
742              }
743            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
744                        
745            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
746                            line => $self->{line_prev},                            line => $self->{line_prev},
747                            column => $self->{column_prev});                            column => $self->{column_prev});
748            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
749              $self->{s_kwd} = '';
750            ## reconsume            ## reconsume
751    
752            return  ({type => CHARACTER_TOKEN, data => '<',            return  ({type => CHARACTER_TOKEN, data => '<',
# Line 627  sub _get_next_token ($) { Line 755  sub _get_next_token ($) {
755                     });                     });
756    
757            redo A;            redo A;
758            } else {
759              ## XML5: "<:" is a parse error.
760              
761              $self->{ct} = {type => START_TAG_TOKEN,
762                                        tag_name => chr ($self->{nc}),
763                                        line => $self->{line_prev},
764                                        column => $self->{column_prev}};
765              $self->{state} = TAG_NAME_STATE;
766              
767        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
768          $self->{line_prev} = $self->{line};
769          $self->{column_prev} = $self->{column};
770          $self->{column}++;
771          $self->{nc}
772              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
773        } else {
774          $self->{set_nc}->($self);
775        }
776      
777              redo A;
778          }          }
779        } else {        } else {
780          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 635  sub _get_next_token ($) { Line 783  sub _get_next_token ($) {
783        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
784        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
785    
786          ## XML5: "end tag state".
787    
788        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
789        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
790          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
791            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
792            $self->{s_kwd} = '';            $self->{kwd} = '';
793            ## Reconsume.            ## Reconsume.
794            redo A;            redo A;
795          } else {          } else {
# Line 647  sub _get_next_token ($) { Line 797  sub _get_next_token ($) {
797            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
798                        
799            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
800              $self->{s_kwd} = '';
801            ## Reconsume.            ## Reconsume.
802            return  ({type => CHARACTER_TOKEN, data => '</',            return  ({type => CHARACTER_TOKEN, data => '</',
803                      line => $l, column => $c,                      line => $l, column => $c,
# Line 695  sub _get_next_token ($) { Line 846  sub _get_next_token ($) {
846        
847          redo A;          redo A;
848        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
849          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
850                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
851                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
852          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
853                    $self->{s_kwd} = '';
854            if ($self->{is_xml}) {
855              
856              ## XML5: No parse error.
857              
858              ## NOTE: This parser raises a parse error, since it supports
859              ## XML1, not XML5.
860    
861              ## NOTE: A short end tag token.
862              my $ct = {type => END_TAG_TOKEN,
863                        tag_name => '',
864                        line => $self->{line_prev},
865                        column => $self->{column_prev} - 1,
866                       };
867              
868        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
869          $self->{line_prev} = $self->{line};
870          $self->{column_prev} = $self->{column};
871          $self->{column}++;
872          $self->{nc}
873              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
874        } else {
875          $self->{set_nc}->($self);
876        }
877      
878              return  ($ct);
879            } else {
880              
881              
882      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
883        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
884        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 711  sub _get_next_token ($) { Line 889  sub _get_next_token ($) {
889        $self->{set_nc}->($self);        $self->{set_nc}->($self);
890      }      }
891        
892            }
893          redo A;          redo A;
894        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
895                    
896          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
897            $self->{s_kwd} = '';
898          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
899          # reconsume          # reconsume
900    
# Line 723  sub _get_next_token ($) { Line 903  sub _get_next_token ($) {
903                   });                   });
904    
905          redo A;          redo A;
906        } else {        } elsif (not $self->{is_xml} or
907                   $is_space->{$self->{nc}}) {
908                    
909          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
910                            line => $self->{line_prev}, # "<" of "</"
911                            column => $self->{column_prev} - 1);
912          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
913          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
914                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 738  sub _get_next_token ($) { Line 921  sub _get_next_token ($) {
921          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
922          ## "bogus comment state" entry.          ## "bogus comment state" entry.
923          redo A;          redo A;
924          } else {
925            ## XML5: "</:" is a parse error.
926            
927            $self->{ct} = {type => END_TAG_TOKEN,
928                           tag_name => chr ($self->{nc}),
929                           line => $l, column => $c};
930            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
931            
932        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
933          $self->{line_prev} = $self->{line};
934          $self->{column_prev} = $self->{column};
935          $self->{column}++;
936          $self->{nc}
937              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
938        } else {
939          $self->{set_nc}->($self);
940        }
941      
942            redo A;
943        }        }
944      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
945        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
946        if (length $ch) {        if (length $ch) {
947          my $CH = $ch;          my $CH = $ch;
948          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 748  sub _get_next_token ($) { Line 950  sub _get_next_token ($) {
950          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
951                        
952            ## Stay in the state.            ## Stay in the state.
953            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
954                        
955      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
956        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 764  sub _get_next_token ($) { Line 966  sub _get_next_token ($) {
966          } else {          } else {
967                        
968            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
969              $self->{s_kwd} = '';
970            ## Reconsume.            ## Reconsume.
971            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
972                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
973                      line => $self->{line_prev},                      line => $self->{line_prev},
974                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
975                     });                     });
976            redo A;            redo A;
977          }          }
# Line 782  sub _get_next_token ($) { Line 985  sub _get_next_token ($) {
985                        
986            ## Reconsume.            ## Reconsume.
987            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
988              $self->{s_kwd} = '';
989            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
990                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
991                      line => $self->{line_prev},                      line => $self->{line_prev},
992                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
993                     });                     });
994            redo A;            redo A;
995          } else {          } else {
# Line 794  sub _get_next_token ($) { Line 998  sub _get_next_token ($) {
998                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
999                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
1000                   line => $self->{line_prev},                   line => $self->{line_prev},
1001                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
1002            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
1003            ## Reconsume.            ## Reconsume.
1004            redo A;            redo A;
# Line 833  sub _get_next_token ($) { Line 1037  sub _get_next_token ($) {
1037            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1038          }          }
1039          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1040            $self->{s_kwd} = '';
1041                    
1042      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1043        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 885  sub _get_next_token ($) { Line 1090  sub _get_next_token ($) {
1090            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1091          }          }
1092          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1093            $self->{s_kwd} = '';
1094          # reconsume          # reconsume
1095    
1096          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 924  sub _get_next_token ($) { Line 1130  sub _get_next_token ($) {
1130          redo A;          redo A;
1131        }        }
1132      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1133          ## XML5: "Tag attribute name before state".
1134    
1135        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1136                    
1137          ## Stay in the state          ## Stay in the state
# Line 955  sub _get_next_token ($) { Line 1163  sub _get_next_token ($) {
1163            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1164          }          }
1165          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1166            $self->{s_kwd} = '';
1167                    
1168      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1169        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1022  sub _get_next_token ($) { Line 1231  sub _get_next_token ($) {
1231            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1232          }          }
1233          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1234            $self->{s_kwd} = '';
1235          # reconsume          # reconsume
1236    
1237          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1034  sub _get_next_token ($) { Line 1244  sub _get_next_token ($) {
1244               0x003D => 1, # =               0x003D => 1, # =
1245              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1246                        
1247              ## XML5: Not a parse error.
1248            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1249          } else {          } else {
1250                        
1251              ## XML5: ":" raises a parse error and is ignored.
1252          }          }
1253          $self->{ca}          $self->{ca}
1254              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 1057  sub _get_next_token ($) { Line 1269  sub _get_next_token ($) {
1269          redo A;          redo A;
1270        }        }
1271      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1272          ## XML5: "Tag attribute name state".
1273    
1274        my $before_leave = sub {        my $before_leave = sub {
1275          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
1276              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 1067  sub _get_next_token ($) { Line 1281  sub _get_next_token ($) {
1281                        
1282            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
1283              = $self->{ca};              = $self->{ca};
1284              $self->{ca}->{index} = ++$self->{ct}->{last_index};
1285          }          }
1286        }; # $before_leave        }; # $before_leave
1287    
# Line 1103  sub _get_next_token ($) { Line 1318  sub _get_next_token ($) {
1318        
1319          redo A;          redo A;
1320        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1321            if ($self->{is_xml}) {
1322              
1323              ## XML5: Not a parse error.
1324              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1325            } else {
1326              
1327            }
1328    
1329          $before_leave->();          $before_leave->();
1330          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1331                        
# Line 1117  sub _get_next_token ($) { Line 1340  sub _get_next_token ($) {
1340            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1341          }          }
1342          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1343            $self->{s_kwd} = '';
1344                    
1345      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1346        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1151  sub _get_next_token ($) { Line 1375  sub _get_next_token ($) {
1375        
1376          redo A;          redo A;
1377        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1378            if ($self->{is_xml}) {
1379              
1380              ## XML5: Not a parse error.
1381              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1382            } else {
1383              
1384            }
1385                    
1386          $before_leave->();          $before_leave->();
1387          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
# Line 1185  sub _get_next_token ($) { Line 1416  sub _get_next_token ($) {
1416            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1417          }          }
1418          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1419            $self->{s_kwd} = '';
1420          # reconsume          # reconsume
1421    
1422          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1194  sub _get_next_token ($) { Line 1426  sub _get_next_token ($) {
1426          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1427              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1428                        
1429              ## XML5: Not a parse error.
1430            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1431          } else {          } else {
1432                        
# Line 1214  sub _get_next_token ($) { Line 1447  sub _get_next_token ($) {
1447          redo A;          redo A;
1448        }        }
1449      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1450          ## XML5: "Tag attribute name after state".
1451          
1452        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1453                    
1454          ## Stay in the state          ## Stay in the state
# Line 1245  sub _get_next_token ($) { Line 1480  sub _get_next_token ($) {
1480        
1481          redo A;          redo A;
1482        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1483            if ($self->{is_xml}) {
1484              
1485              ## XML5: Not a parse error.
1486              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1487            } else {
1488              
1489            }
1490    
1491          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1492                        
1493            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1261  sub _get_next_token ($) { Line 1504  sub _get_next_token ($) {
1504            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1505          }          }
1506          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1507            $self->{s_kwd} = '';
1508                    
1509      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1510        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1297  sub _get_next_token ($) { Line 1541  sub _get_next_token ($) {
1541        
1542          redo A;          redo A;
1543        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1544            if ($self->{is_xml}) {
1545              
1546              ## XML5: Not a parse error.
1547              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1548            } else {
1549              
1550            }
1551                    
1552          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1553                    
# Line 1328  sub _get_next_token ($) { Line 1579  sub _get_next_token ($) {
1579          } else {          } else {
1580            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1581          }          }
1582            $self->{s_kwd} = '';
1583          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1584          # reconsume          # reconsume
1585    
# Line 1335  sub _get_next_token ($) { Line 1587  sub _get_next_token ($) {
1587    
1588          redo A;          redo A;
1589        } else {        } else {
1590            if ($self->{is_xml}) {
1591              
1592              ## XML5: Not a parse error.
1593              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1594            } else {
1595              
1596            }
1597    
1598          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1599              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1600                        
1601              ## XML5: Not a parse error.
1602            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1603          } else {          } else {
1604                        
# Line 1361  sub _get_next_token ($) { Line 1622  sub _get_next_token ($) {
1622          redo A;                  redo A;        
1623        }        }
1624      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1625          ## XML5: "Tag attribute value before state".
1626    
1627        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1628                    
1629          ## Stay in the state          ## Stay in the state
# Line 1429  sub _get_next_token ($) { Line 1692  sub _get_next_token ($) {
1692            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1693          }          }
1694          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1695            $self->{s_kwd} = '';
1696                    
1697      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1698        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1462  sub _get_next_token ($) { Line 1726  sub _get_next_token ($) {
1726            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1727          }          }
1728          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1729            $self->{s_kwd} = '';
1730          ## reconsume          ## reconsume
1731    
1732          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1470  sub _get_next_token ($) { Line 1735  sub _get_next_token ($) {
1735        } else {        } else {
1736          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1737                        
1738              ## XML5: Not a parse error.
1739            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1740            } elsif ($self->{is_xml}) {
1741              
1742              ## XML5: No parse error.
1743              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1744          } else {          } else {
1745                        
1746          }          }
# Line 1490  sub _get_next_token ($) { Line 1760  sub _get_next_token ($) {
1760          redo A;          redo A;
1761        }        }
1762      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1763          ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1764          ## ATTLIST attribute value double quoted state".
1765          
1766        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1767                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1768          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1769              ## XML5: "DOCTYPE ATTLIST name after state".
1770              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1771              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1772            } else {
1773              
1774              ## XML5: "Tag attribute name before state".
1775              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1776            }
1777                    
1778      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1779        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1507  sub _get_next_token ($) { Line 1788  sub _get_next_token ($) {
1788          redo A;          redo A;
1789        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1790                    
1791            ## XML5: Not defined yet.
1792    
1793          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1794          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1795          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1531  sub _get_next_token ($) { Line 1814  sub _get_next_token ($) {
1814          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1815                        
1816            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1817    
1818              $self->{state} = DATA_STATE;
1819              $self->{s_kwd} = '';
1820              ## reconsume
1821              return  ($self->{ct}); # start tag
1822              redo A;
1823          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1824            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1825            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1540  sub _get_next_token ($) { Line 1829  sub _get_next_token ($) {
1829              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1830                            
1831            }            }
1832    
1833              $self->{state} = DATA_STATE;
1834              $self->{s_kwd} = '';
1835              ## reconsume
1836              return  ($self->{ct}); # end tag
1837              redo A;
1838            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1839              ## XML5: No parse error above; not defined yet.
1840              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1841              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1842              ## Reconsume.
1843              return  ($self->{ct}); # ATTLIST
1844              redo A;
1845          } else {          } else {
1846            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1847          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
1848        } else {        } else {
1849                    ## XML5 [ATTLIST]: Not defined yet.
1850            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1851              
1852              ## XML5: Not a parse error.
1853              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1854            } else {
1855              
1856            }
1857          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1858          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1859                                q["&],                                q["&<],
1860                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1861    
1862          ## Stay in the state          ## Stay in the state
# Line 1571  sub _get_next_token ($) { Line 1874  sub _get_next_token ($) {
1874          redo A;          redo A;
1875        }        }
1876      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1877          ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1878          ## ATTLIST attribute value single quoted state".
1879    
1880        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1881                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1882          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1883              ## XML5: "DOCTYPE ATTLIST name after state".
1884              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1885              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1886            } else {
1887              
1888              ## XML5: "Before attribute name state" (sic).
1889              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1890            }
1891                    
1892      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1893        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1588  sub _get_next_token ($) { Line 1902  sub _get_next_token ($) {
1902          redo A;          redo A;
1903        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1904                    
1905            ## XML5: Not defined yet.
1906    
1907          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1908          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1909          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1612  sub _get_next_token ($) { Line 1928  sub _get_next_token ($) {
1928          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1929                        
1930            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1931    
1932              $self->{state} = DATA_STATE;
1933              $self->{s_kwd} = '';
1934              ## reconsume
1935              return  ($self->{ct}); # start tag
1936              redo A;
1937          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1938            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1939            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1621  sub _get_next_token ($) { Line 1943  sub _get_next_token ($) {
1943              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1944                            
1945            }            }
1946    
1947              $self->{state} = DATA_STATE;
1948              $self->{s_kwd} = '';
1949              ## reconsume
1950              return  ($self->{ct}); # end tag
1951              redo A;
1952            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1953              ## XML5: No parse error above; not defined yet.
1954              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1955              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1956              ## Reconsume.
1957              return  ($self->{ct}); # ATTLIST
1958              redo A;
1959          } else {          } else {
1960            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1961          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
1962        } else {        } else {
1963                    ## XML5 [ATTLIST]: Not defined yet.
1964            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1965              
1966              ## XML5: Not a parse error.
1967              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1968            } else {
1969              
1970            }
1971          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1972          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1973                                q['&],                                q['&<],
1974                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1975    
1976          ## Stay in the state          ## Stay in the state
# Line 1652  sub _get_next_token ($) { Line 1988  sub _get_next_token ($) {
1988          redo A;          redo A;
1989        }        }
1990      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1991          ## XML5: "Tag attribute value unquoted state".
1992    
1993        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1994                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1995          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;            
1996              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1997              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1998            } else {
1999              
2000              ## XML5: "Tag attribute name before state".
2001              $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2002            }
2003                    
2004      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2005        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1669  sub _get_next_token ($) { Line 2014  sub _get_next_token ($) {
2014          redo A;          redo A;
2015        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
2016                    
2017    
2018            ## XML5: Not defined yet.
2019    
2020          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
2021          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
2022          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1692  sub _get_next_token ($) { Line 2040  sub _get_next_token ($) {
2040          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2041                        
2042            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2043    
2044              $self->{state} = DATA_STATE;
2045              $self->{s_kwd} = '';
2046              
2047        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2048          $self->{line_prev} = $self->{line};
2049          $self->{column_prev} = $self->{column};
2050          $self->{column}++;
2051          $self->{nc}
2052              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2053        } else {
2054          $self->{set_nc}->($self);
2055        }
2056      
2057              return  ($self->{ct}); # start tag
2058              redo A;
2059          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2060            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2061            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1701  sub _get_next_token ($) { Line 2065  sub _get_next_token ($) {
2065              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2066                            
2067            }            }
2068          } else {  
2069            die "$0: $self->{ct}->{type}: Unknown token type";            $self->{state} = DATA_STATE;
2070          }            $self->{s_kwd} = '';
2071          $self->{state} = DATA_STATE;            
           
2072      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2073        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
2074        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 1716  sub _get_next_token ($) { Line 2079  sub _get_next_token ($) {
2079        $self->{set_nc}->($self);        $self->{set_nc}->($self);
2080      }      }
2081        
2082              return  ($self->{ct}); # end tag
2083          return  ($self->{ct}); # start tag or end tag            redo A;
2084            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2085          redo A;            push @{$self->{ct}->{attrdefs}}, $self->{ca};
2086              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2087              
2088        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2089          $self->{line_prev} = $self->{line};
2090          $self->{column_prev} = $self->{column};
2091          $self->{column}++;
2092          $self->{nc}
2093              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2094        } else {
2095          $self->{set_nc}->($self);
2096        }
2097      
2098              return  ($self->{ct}); # ATTLIST
2099              redo A;
2100            } else {
2101              die "$0: $self->{ct}->{type}: Unknown token type";
2102            }
2103        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');  
2104          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2105                        
2106              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2107            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2108    
2109              $self->{state} = DATA_STATE;
2110              $self->{s_kwd} = '';
2111              ## reconsume
2112              return  ($self->{ct}); # start tag
2113              redo A;
2114          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2115              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2116            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2117            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
2118                            
# Line 1734  sub _get_next_token ($) { Line 2121  sub _get_next_token ($) {
2121              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2122                            
2123            }            }
2124    
2125              $self->{state} = DATA_STATE;
2126              $self->{s_kwd} = '';
2127              ## reconsume
2128              return  ($self->{ct}); # end tag
2129              redo A;
2130            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2131              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2132              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2133              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2134              ## Reconsume.
2135              return  ($self->{ct}); # ATTLIST
2136              redo A;
2137          } else {          } else {
2138            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2139          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
2140        } else {        } else {
2141          if ({          if ({
2142               0x0022 => 1, # "               0x0022 => 1, # "
# Line 1750  sub _get_next_token ($) { Line 2144  sub _get_next_token ($) {
2144               0x003D => 1, # =               0x003D => 1, # =
2145              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2146                        
2147              ## XML5: Not a parse error.
2148            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2149          } else {          } else {
2150                        
# Line 1806  sub _get_next_token ($) { Line 2201  sub _get_next_token ($) {
2201            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2202          }          }
2203          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2204            $self->{s_kwd} = '';
2205                    
2206      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2207        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1853  sub _get_next_token ($) { Line 2249  sub _get_next_token ($) {
2249            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2250          }          }
2251          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2252            $self->{s_kwd} = '';
2253          ## Reconsume.          ## Reconsume.
2254          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
2255          redo A;          redo A;
# Line 1864  sub _get_next_token ($) { Line 2261  sub _get_next_token ($) {
2261          redo A;          redo A;
2262        }        }
2263      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2264          ## XML5: "Empty tag state".
2265    
2266        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2267          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
2268                        
# Line 1883  sub _get_next_token ($) { Line 2282  sub _get_next_token ($) {
2282          }          }
2283    
2284          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2285            $self->{s_kwd} = '';
2286                    
2287      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2288        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1914  sub _get_next_token ($) { Line 2314  sub _get_next_token ($) {
2314          } else {          } else {
2315            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2316          }          }
2317            ## XML5: "Tag attribute name before state".
2318          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2319            $self->{s_kwd} = '';
2320          ## Reconsume.          ## Reconsume.
2321          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
2322          redo A;          redo A;
# Line 1927  sub _get_next_token ($) { Line 2329  sub _get_next_token ($) {
2329          redo A;          redo A;
2330        }        }
2331      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2332        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2333    
2334        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
2335        ## consumes characters on a one-by-one basis.        ## consumes characters on a one-by-one basis.
2336                
2337        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2338                    if ($self->{in_subset}) {
2339          $self->{state} = DATA_STATE;            
2340              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2341            } else {
2342              
2343              $self->{state} = DATA_STATE;
2344              $self->{s_kwd} = '';
2345            }
2346                    
2347      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2348        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1950  sub _get_next_token ($) { Line 2358  sub _get_next_token ($) {
2358          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
2359          redo A;          redo A;
2360        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2361                    if ($self->{in_subset}) {
2362          $self->{state} = DATA_STATE;            
2363              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2364            } else {
2365              
2366              $self->{state} = DATA_STATE;
2367              $self->{s_kwd} = '';
2368            }
2369          ## reconsume          ## reconsume
2370    
2371          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 1978  sub _get_next_token ($) { Line 2392  sub _get_next_token ($) {
2392          redo A;          redo A;
2393        }        }
2394      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2395        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
2396                
2397        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2398                    
# Line 2000  sub _get_next_token ($) { Line 2414  sub _get_next_token ($) {
2414          ## ASCII case-insensitive.          ## ASCII case-insensitive.
2415                    
2416          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
2417          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2418                    
2419      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2420        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2019  sub _get_next_token ($) { Line 2433  sub _get_next_token ($) {
2433                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2434                                                    
2435          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
2436          $self->{s_kwd} = '[';          $self->{kwd} = '[';
2437                    
2438      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2439        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2053  sub _get_next_token ($) { Line 2467  sub _get_next_token ($) {
2467                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2468                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
2469                                   };                                   };
2470          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2471                    
2472      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2473        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2089  sub _get_next_token ($) { Line 2503  sub _get_next_token ($) {
2503              0x0054, # T              0x0054, # T
2504              0x0059, # Y              0x0059, # Y
2505              0x0050, # P              0x0050, # P
2506            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2507            $self->{nc} == [            $self->{nc} == [
2508              undef,              undef,
2509              0x006F, # o              0x006F, # o
# Line 2097  sub _get_next_token ($) { Line 2511  sub _get_next_token ($) {
2511              0x0074, # t              0x0074, # t
2512              0x0079, # y              0x0079, # y
2513              0x0070, # p              0x0070, # p
2514            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2515                    
2516          ## Stay in the state.          ## Stay in the state.
2517          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2518                    
2519      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2520        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2113  sub _get_next_token ($) { Line 2527  sub _get_next_token ($) {
2527      }      }
2528        
2529          redo A;          redo A;
2530        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
2531                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2532                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2533                    if ($self->{is_xml} and
2534                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2535              
2536              ## XML5: case-sensitive.
2537              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2538                              text => 'DOCTYPE',
2539                              line => $self->{line_prev},
2540                              column => $self->{column_prev} - 5);
2541            } else {
2542              
2543            }
2544          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
2545          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
2546                                    quirks => 1,                                    quirks => 1,
# Line 2139  sub _get_next_token ($) { Line 2563  sub _get_next_token ($) {
2563                                    
2564          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2565                          line => $self->{line_prev},                          line => $self->{line_prev},
2566                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2567          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2568          ## Reconsume.          ## Reconsume.
2569          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2570                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2571                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2572                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2573                                   };                                   };
2574          redo A;          redo A;
2575        }        }
# Line 2156  sub _get_next_token ($) { Line 2580  sub _get_next_token ($) {
2580              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
2581              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
2582              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
2583            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
2584                    
2585          ## Stay in the state.          ## Stay in the state.
2586          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2587                    
2588      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2589        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2172  sub _get_next_token ($) { Line 2596  sub _get_next_token ($) {
2596      }      }
2597        
2598          redo A;          redo A;
2599        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
2600                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2601                    if ($self->{is_xml} and
2602                not $self->{tainted} and
2603                @{$self->{open_elements} or []} == 0) {
2604              
2605              $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2606                              line => $self->{line_prev},
2607                              column => $self->{column_prev} - 7);
2608              $self->{tainted} = 1;
2609            } else {
2610              
2611            }
2612    
2613          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
2614                                    data => '',                                    data => '',
2615                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 2196  sub _get_next_token ($) { Line 2631  sub _get_next_token ($) {
2631                    
2632          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2633                          line => $self->{line_prev},                          line => $self->{line_prev},
2634                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2635          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2636          ## Reconsume.          ## Reconsume.
2637          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2638                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2639                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2640                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2641                                   };                                   };
2642          redo A;          redo A;
2643        }        }
# Line 2223  sub _get_next_token ($) { Line 2658  sub _get_next_token ($) {
2658        
2659          redo A;          redo A;
2660        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2661          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2662          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2663              
2664              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2665            } else {
2666              
2667              $self->{state} = DATA_STATE;
2668              $self->{s_kwd} = '';
2669            }
2670                    
2671      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2672        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2242  sub _get_next_token ($) { Line 2683  sub _get_next_token ($) {
2683    
2684          redo A;          redo A;
2685        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2686          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2687          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2688              
2689              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2690            } else {
2691              
2692              $self->{state} = DATA_STATE;
2693              $self->{s_kwd} = '';
2694            }
2695          ## reconsume          ## reconsume
2696    
2697          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2285  sub _get_next_token ($) { Line 2732  sub _get_next_token ($) {
2732        
2733          redo A;          redo A;
2734        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2735          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2736          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2737              
2738              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2739            } else {
2740              
2741              $self->{state} = DATA_STATE;
2742              $self->{s_kwd} = '';
2743            }
2744                    
2745      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2746        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2304  sub _get_next_token ($) { Line 2757  sub _get_next_token ($) {
2757    
2758          redo A;          redo A;
2759        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2760          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2761          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2762              
2763              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2764            } else {
2765              
2766              $self->{state} = DATA_STATE;
2767              $self->{s_kwd} = '';
2768            }
2769          ## reconsume          ## reconsume
2770    
2771          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2331  sub _get_next_token ($) { Line 2790  sub _get_next_token ($) {
2790          redo A;          redo A;
2791        }        }
2792      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
2793          ## XML5: "Comment state" and "DOCTYPE comment state".
2794    
2795        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2796                    
2797          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
# Line 2347  sub _get_next_token ($) { Line 2808  sub _get_next_token ($) {
2808        
2809          redo A;          redo A;
2810        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2811          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2812          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2813              
2814              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2815            } else {
2816              
2817              $self->{state} = DATA_STATE;
2818              $self->{s_kwd} = '';
2819            }
2820          ## reconsume          ## reconsume
2821    
2822          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2377  sub _get_next_token ($) { Line 2844  sub _get_next_token ($) {
2844          redo A;          redo A;
2845        }        }
2846      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2847          ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2848    
2849        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2850                    
2851          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 2393  sub _get_next_token ($) { Line 2862  sub _get_next_token ($) {
2862        
2863          redo A;          redo A;
2864        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2865          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2866          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2867              
2868              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2869            } else {
2870              
2871              $self->{state} = DATA_STATE;
2872              $self->{s_kwd} = '';
2873            }
2874          ## reconsume          ## reconsume
2875    
2876          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2419  sub _get_next_token ($) { Line 2894  sub _get_next_token ($) {
2894          redo A;          redo A;
2895        }        }
2896      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
2897          ## XML5: "Comment end state" and "DOCTYPE comment end state".
2898    
2899        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2900                    if ($self->{in_subset}) {
2901          $self->{state} = DATA_STATE;            
2902              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2903            } else {
2904              
2905              $self->{state} = DATA_STATE;
2906              $self->{s_kwd} = '';
2907            }
2908                    
2909      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2910        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2439  sub _get_next_token ($) { Line 2922  sub _get_next_token ($) {
2922          redo A;          redo A;
2923        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2924                    
2925            ## XML5: Not a parse error.
2926          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2927                          line => $self->{line_prev},                          line => $self->{line_prev},
2928                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2457  sub _get_next_token ($) { Line 2941  sub _get_next_token ($) {
2941        
2942          redo A;          redo A;
2943        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2944          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2945          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2946              
2947              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2948            } else {
2949              
2950              $self->{state} = DATA_STATE;
2951              $self->{s_kwd} = '';
2952            }
2953          ## reconsume          ## reconsume
2954    
2955          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2467  sub _get_next_token ($) { Line 2957  sub _get_next_token ($) {
2957          redo A;          redo A;
2958        } else {        } else {
2959                    
2960            ## XML5: Not a parse error.
2961          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2962                          line => $self->{line_prev},                          line => $self->{line_prev},
2963                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2503  sub _get_next_token ($) { Line 2994  sub _get_next_token ($) {
2994          redo A;          redo A;
2995        } else {        } else {
2996                    
2997            ## XML5: Unless EOF, switch to the bogus comment state.
2998          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2999          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3000          ## reconsume          ## reconsume
3001          redo A;          redo A;
3002        }        }
3003      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3004          ## XML5: "DOCTYPE root name before state".
3005    
3006        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3007                    
3008          ## Stay in the state          ## Stay in the state
# Line 2526  sub _get_next_token ($) { Line 3020  sub _get_next_token ($) {
3020          redo A;          redo A;
3021        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3022                    
3023            ## XML5: No parse error.
3024          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3025          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3026            $self->{s_kwd} = '';
3027                    
3028      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3029        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2547  sub _get_next_token ($) { Line 3043  sub _get_next_token ($) {
3043                    
3044          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3045          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3046            $self->{s_kwd} = '';
3047          ## reconsume          ## reconsume
3048    
3049          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3050    
3051          redo A;          redo A;
3052          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3053            
3054            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3055            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3056            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3057            $self->{in_subset} = 1;
3058            
3059        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3060          $self->{line_prev} = $self->{line};
3061          $self->{column_prev} = $self->{column};
3062          $self->{column}++;
3063          $self->{nc}
3064              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3065        } else {
3066          $self->{set_nc}->($self);
3067        }
3068      
3069            return  ($self->{ct}); # DOCTYPE
3070            redo A;
3071        } else {        } else {
3072                    
3073          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 2571  sub _get_next_token ($) { Line 3087  sub _get_next_token ($) {
3087          redo A;          redo A;
3088        }        }
3089      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3090  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
3091    
3092          ## ISSUE: Redundant "First," in the spec.
3093    
3094        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3095                    
3096          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 2590  sub _get_next_token ($) { Line 3109  sub _get_next_token ($) {
3109        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3110                    
3111          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3112            $self->{s_kwd} = '';
3113                    
3114      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3115        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2609  sub _get_next_token ($) { Line 3129  sub _get_next_token ($) {
3129                    
3130          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3131          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3132            $self->{s_kwd} = '';
3133          ## reconsume          ## reconsume
3134    
3135          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3136          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3137    
3138          redo A;          redo A;
3139          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3140            
3141            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3142            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3143            $self->{in_subset} = 1;
3144            
3145        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3146          $self->{line_prev} = $self->{line};
3147          $self->{column_prev} = $self->{column};
3148          $self->{column}++;
3149          $self->{nc}
3150              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3151        } else {
3152          $self->{set_nc}->($self);
3153        }
3154      
3155            return  ($self->{ct}); # DOCTYPE
3156            redo A;
3157        } else {        } else {
3158                    
3159          $self->{ct}->{name}          $self->{ct}->{name}
# Line 2634  sub _get_next_token ($) { Line 3173  sub _get_next_token ($) {
3173          redo A;          redo A;
3174        }        }
3175      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3176          ## XML5: Corresponding to XML5's "DOCTYPE root name after
3177          ## state", but implemented differently.
3178    
3179        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3180                    
3181          ## Stay in the state          ## Stay in the state
# Line 2650  sub _get_next_token ($) { Line 3192  sub _get_next_token ($) {
3192        
3193          redo A;          redo A;
3194        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3195            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3196              
3197              $self->{state} = DATA_STATE;
3198              $self->{s_kwd} = '';
3199            } else {
3200              
3201              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3202              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3203            }
3204                    
         $self->{state} = DATA_STATE;  
3205                    
3206      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3207        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2663  sub _get_next_token ($) { Line 3213  sub _get_next_token ($) {
3213        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3214      }      }
3215        
3216            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3217          redo A;          redo A;
3218        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3219            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3220              
3221              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3222              $self->{state} = DATA_STATE;
3223              $self->{s_kwd} = '';
3224              $self->{ct}->{quirks} = 1;
3225            } else {
3226              
3227              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3228              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3229            }
3230                    
3231          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          ## Reconsume.
3232          $self->{state} = DATA_STATE;          return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         ## reconsume  
   
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3233          redo A;          redo A;
3234        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
3235                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
3236            
3237          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
3238          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3239                    
3240      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3241        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2695  sub _get_next_token ($) { Line 3250  sub _get_next_token ($) {
3250          redo A;          redo A;
3251        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
3252                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
3253            
3254          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
3255          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3256                    
3257      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3258        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2709  sub _get_next_token ($) { Line 3265  sub _get_next_token ($) {
3265      }      }
3266        
3267          redo A;          redo A;
3268        } else {        } elsif ($self->{nc} == 0x0022 and # "
3269                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3270                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3271                    
3272          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');          $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3273          $self->{ct}->{quirks} = 1;          $self->{ct}->{value} = ''; # ENTITY
3274            
3275        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3276          $self->{line_prev} = $self->{line};
3277          $self->{column_prev} = $self->{column};
3278          $self->{column}++;
3279          $self->{nc}
3280              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3281        } else {
3282          $self->{set_nc}->($self);
3283        }
3284      
3285            redo A;
3286          } elsif ($self->{nc} == 0x0027 and # '
3287                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3288                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3289            
3290            $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3291            $self->{ct}->{value} = ''; # ENTITY
3292            
3293        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3294          $self->{line_prev} = $self->{line};
3295          $self->{column_prev} = $self->{column};
3296          $self->{column}++;
3297          $self->{nc}
3298              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3299        } else {
3300          $self->{set_nc}->($self);
3301        }
3302      
3303            redo A;
3304          } elsif ($self->{is_xml} and
3305                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3306                   $self->{nc} == 0x005B) { # [
3307            
3308            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3309            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3310            $self->{in_subset} = 1;
3311            
3312        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3313          $self->{line_prev} = $self->{line};
3314          $self->{column_prev} = $self->{column};
3315          $self->{column}++;
3316          $self->{nc}
3317              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3318        } else {
3319          $self->{set_nc}->($self);
3320        }
3321      
3322            return  ($self->{ct}); # DOCTYPE
3323            redo A;
3324          } else {
3325            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3326    
3327            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3328              
3329              $self->{ct}->{quirks} = 1;
3330              $self->{state} = BOGUS_DOCTYPE_STATE;
3331            } else {
3332              
3333              $self->{state} = BOGUS_MD_STATE;
3334            }
3335    
         $self->{state} = BOGUS_DOCTYPE_STATE;  
3336                    
3337      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3338        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2736  sub _get_next_token ($) { Line 3354  sub _get_next_token ($) {
3354              0x0042, # B              0x0042, # B
3355              0x004C, # L              0x004C, # L
3356              0x0049, # I              0x0049, # I
3357            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3358            $self->{nc} == [            $self->{nc} == [
3359              undef,              undef,
3360              0x0075, # u              0x0075, # u
3361              0x0062, # b              0x0062, # b
3362              0x006C, # l              0x006C, # l
3363              0x0069, # i              0x0069, # i
3364            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3365                    
3366          ## Stay in the state.          ## Stay in the state.
3367          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3368                    
3369      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3370        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2759  sub _get_next_token ($) { Line 3377  sub _get_next_token ($) {
3377      }      }
3378        
3379          redo A;          redo A;
3380        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3381                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
3382                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
3383                    if ($self->{is_xml} and
3384                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3385              
3386              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3387                              text => 'PUBLIC',
3388                              line => $self->{line_prev},
3389                              column => $self->{column_prev} - 4);
3390            } else {
3391              
3392            }
3393          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3394                    
3395      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2777  sub _get_next_token ($) { Line 3404  sub _get_next_token ($) {
3404        
3405          redo A;          redo A;
3406        } else {        } else {
3407                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3408                          line => $self->{line_prev},                          line => $self->{line_prev},
3409                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3410          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3411              
3412          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3413              $self->{state} = BOGUS_DOCTYPE_STATE;
3414            } else {
3415              
3416              $self->{state} = BOGUS_MD_STATE;
3417            }
3418          ## Reconsume.          ## Reconsume.
3419          redo A;          redo A;
3420        }        }
# Line 2795  sub _get_next_token ($) { Line 3426  sub _get_next_token ($) {
3426              0x0053, # S              0x0053, # S
3427              0x0054, # T              0x0054, # T
3428              0x0045, # E              0x0045, # E
3429            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3430            $self->{nc} == [            $self->{nc} == [
3431              undef,              undef,
3432              0x0079, # y              0x0079, # y
3433              0x0073, # s              0x0073, # s
3434              0x0074, # t              0x0074, # t
3435              0x0065, # e              0x0065, # e
3436            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3437                    
3438          ## Stay in the state.          ## Stay in the state.
3439          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3440                    
3441      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3442        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2818  sub _get_next_token ($) { Line 3449  sub _get_next_token ($) {
3449      }      }
3450        
3451          redo A;          redo A;
3452        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3453                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
3454                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
3455                    if ($self->{is_xml} and
3456                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3457              
3458              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3459                              text => 'SYSTEM',
3460                              line => $self->{line_prev},
3461                              column => $self->{column_prev} - 4);
3462            } else {
3463              
3464            }
3465          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3466                    
3467      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2836  sub _get_next_token ($) { Line 3476  sub _get_next_token ($) {
3476        
3477          redo A;          redo A;
3478        } else {        } else {
3479                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3480                          line => $self->{line_prev},                          line => $self->{line_prev},
3481                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3482          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3483              
3484          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3485              $self->{state} = BOGUS_DOCTYPE_STATE;
3486            } else {
3487              
3488              $self->{state} = BOGUS_MD_STATE;
3489            }
3490          ## Reconsume.          ## Reconsume.
3491          redo A;          redo A;
3492        }        }
# Line 2895  sub _get_next_token ($) { Line 3539  sub _get_next_token ($) {
3539        
3540          redo A;          redo A;
3541        } elsif ($self->{nc} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
           
3542          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3543            
3544          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3545              
3546              $self->{state} = DATA_STATE;
3547              $self->{s_kwd} = '';
3548              $self->{ct}->{quirks} = 1;
3549            } else {
3550              
3551              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3552            }
3553            
3554                    
3555      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3556        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2910  sub _get_next_token ($) { Line 3562  sub _get_next_token ($) {
3562        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3563      }      }
3564        
3565            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3566          redo A;          redo A;
3567        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3568            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3569              
3570              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3571              $self->{state} = DATA_STATE;
3572              $self->{s_kwd} = '';
3573              $self->{ct}->{quirks} = 1;
3574            } else {
3575              
3576              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3577              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3578            }
3579                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
3580          ## reconsume          ## reconsume
   
         $self->{ct}->{quirks} = 1;  
3581          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3582          redo A;          redo A;
3583        } else {        } elsif ($self->{is_xml} and
3584                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3585                   $self->{nc} == 0x005B) { # [
3586                    
3587            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3588            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3589            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3590            $self->{in_subset} = 1;
3591            
3592        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3593          $self->{line_prev} = $self->{line};
3594          $self->{column_prev} = $self->{column};
3595          $self->{column}++;
3596          $self->{nc}
3597              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3598        } else {
3599          $self->{set_nc}->($self);
3600        }
3601      
3602            return  ($self->{ct}); # DOCTYPE
3603            redo A;
3604          } else {
3605          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
         $self->{ct}->{quirks} = 1;  
3606    
3607          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3608              
3609              $self->{ct}->{quirks} = 1;
3610              $self->{state} = BOGUS_DOCTYPE_STATE;
3611            } else {
3612              
3613              $self->{state} = BOGUS_MD_STATE;
3614            }
3615    
3616                    
3617      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3618        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2962  sub _get_next_token ($) { Line 3643  sub _get_next_token ($) {
3643        
3644          redo A;          redo A;
3645        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3646          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3647    
3648          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3649              
3650              $self->{state} = DATA_STATE;
3651              $self->{s_kwd} = '';
3652              $self->{ct}->{quirks} = 1;
3653            } else {
3654              
3655              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3656            }
3657    
3658                    
3659      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3660        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2977  sub _get_next_token ($) { Line 3666  sub _get_next_token ($) {
3666        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3667      }      }
3668        
3669            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3670          redo A;          redo A;
3671        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3672          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3673    
3674          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3675          ## reconsume            
3676              $self->{state} = DATA_STATE;
3677          $self->{ct}->{quirks} = 1;            $self->{s_kwd} = '';
3678              $self->{ct}->{quirks} = 1;
3679            } else {
3680              
3681              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3682            }
3683            
3684            ## Reconsume.
3685          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3686          redo A;          redo A;
3687        } else {        } else {
3688                    
3689          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3690          $self->{read_until}->($self->{ct}->{pubid}, q[">],          $self->{read_until}->($self->{ct}->{pubid}, q[">],
3691                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3692    
# Line 3031  sub _get_next_token ($) { Line 3721  sub _get_next_token ($) {
3721        
3722          redo A;          redo A;
3723        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3724          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3725    
3726          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3727              
3728              $self->{state} = DATA_STATE;
3729              $self->{s_kwd} = '';
3730              $self->{ct}->{quirks} = 1;
3731            } else {
3732              
3733              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3734            }
3735    
3736                    
3737      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3738        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3046  sub _get_next_token ($) { Line 3744  sub _get_next_token ($) {
3744        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3745      }      }
3746        
3747            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3748          redo A;          redo A;
3749        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3750          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3751    
3752          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3753              
3754              $self->{state} = DATA_STATE;
3755              $self->{s_kwd} = '';
3756              $self->{ct}->{quirks} = 1;
3757            } else {
3758              
3759              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3760            }
3761          
3762          ## reconsume          ## reconsume
3763            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3764          redo A;          redo A;
3765        } else {        } else {
3766                    
3767          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3768          $self->{read_until}->($self->{ct}->{pubid}, q['>],          $self->{read_until}->($self->{ct}->{pubid}, q['>],
3769                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3770    
# Line 3101  sub _get_next_token ($) { Line 3800  sub _get_next_token ($) {
3800          redo A;          redo A;
3801        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
3802                    
3803          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3804          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3805                    
3806      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3117  sub _get_next_token ($) { Line 3816  sub _get_next_token ($) {
3816          redo A;          redo A;
3817        } elsif ($self->{nc} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
3818                    
3819          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3820          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3821                    
3822      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3132  sub _get_next_token ($) { Line 3831  sub _get_next_token ($) {
3831        
3832          redo A;          redo A;
3833        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3834            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3835              if ($self->{is_xml}) {
3836                
3837                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3838              } else {
3839                
3840              }
3841              $self->{state} = DATA_STATE;
3842              $self->{s_kwd} = '';
3843            } else {
3844              if ($self->{ct}->{type} == NOTATION_TOKEN) {
3845                
3846              } else {
3847                
3848                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');            
3849              }
3850              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3851            }
3852                    
         $self->{state} = DATA_STATE;  
3853                    
3854      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3855        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3145  sub _get_next_token ($) { Line 3861  sub _get_next_token ($) {
3861        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3862      }      }
3863        
3864            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3865          redo A;          redo A;
3866        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3867            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3868              
3869              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3870              
3871              $self->{state} = DATA_STATE;
3872              $self->{s_kwd} = '';
3873              $self->{ct}->{quirks} = 1;
3874            } else {
3875              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3876              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3877            }
3878                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
3879          ## reconsume          ## reconsume
3880            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3881          $self->{ct}->{quirks} = 1;          redo A;
3882          } elsif ($self->{is_xml} and
3883                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3884                   $self->{nc} == 0x005B) { # [
3885            
3886            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3887            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3888            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3889            $self->{in_subset} = 1;
3890            
3891        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3892          $self->{line_prev} = $self->{line};
3893          $self->{column_prev} = $self->{column};
3894          $self->{column}++;
3895          $self->{nc}
3896              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3897        } else {
3898          $self->{set_nc}->($self);
3899        }
3900      
3901          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3902          redo A;          redo A;
3903        } else {        } else {
           
3904          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
         $self->{ct}->{quirks} = 1;  
3905    
3906          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3907              
3908              $self->{ct}->{quirks} = 1;
3909              $self->{state} = BOGUS_DOCTYPE_STATE;
3910            } else {
3911              
3912              $self->{state} = BOGUS_MD_STATE;
3913            }
3914    
3915                    
3916      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3917        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3228  sub _get_next_token ($) { Line 3974  sub _get_next_token ($) {
3974        
3975          redo A;          redo A;
3976        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3977          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
         $self->{state} = DATA_STATE;  
3978                    
3979      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3980        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3243  sub _get_next_token ($) { Line 3987  sub _get_next_token ($) {
3987      }      }
3988        
3989    
3990          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3991          return  ($self->{ct}); # DOCTYPE            
3992              $self->{state} = DATA_STATE;
3993              $self->{s_kwd} = '';
3994              $self->{ct}->{quirks} = 1;
3995            } else {
3996              
3997              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3998            }
3999    
4000            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4001          redo A;          redo A;
4002        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4003            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4004              
4005              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4006              $self->{state} = DATA_STATE;
4007              $self->{s_kwd} = '';
4008              $self->{ct}->{quirks} = 1;
4009            } else {
4010              
4011              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4012              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4013            }
4014                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
4015          ## reconsume          ## reconsume
4016            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4017            redo A;
4018          } elsif ($self->{is_xml} and
4019                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4020                   $self->{nc} == 0x005B) { # [
4021            
4022            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4023    
4024          $self->{ct}->{quirks} = 1;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4025            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4026            $self->{in_subset} = 1;
4027            
4028        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4029          $self->{line_prev} = $self->{line};
4030          $self->{column_prev} = $self->{column};
4031          $self->{column}++;
4032          $self->{nc}
4033              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4034        } else {
4035          $self->{set_nc}->($self);
4036        }
4037      
4038          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
4039          redo A;          redo A;
4040        } else {        } else {
           
4041          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
         $self->{ct}->{quirks} = 1;  
4042    
4043          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4044                        
4045              $self->{ct}->{quirks} = 1;
4046              $self->{state} = BOGUS_DOCTYPE_STATE;
4047            } else {
4048              
4049              $self->{state} = BOGUS_MD_STATE;
4050            }
4051    
4052                    
4053      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4054        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3293  sub _get_next_token ($) { Line 4078  sub _get_next_token ($) {
4078      }      }
4079        
4080          redo A;          redo A;
4081        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
           
4082          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4083    
4084          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4085              
4086              $self->{state} = DATA_STATE;
4087              $self->{s_kwd} = '';
4088              $self->{ct}->{quirks} = 1;
4089            } else {
4090              
4091              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4092            }
4093            
4094                    
4095      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4096        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3309  sub _get_next_token ($) { Line 4102  sub _get_next_token ($) {
4102        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4103      }      }
4104        
4105            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4106          redo A;          redo A;
4107        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4108          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4109    
4110          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4111              
4112              $self->{state} = DATA_STATE;
4113              $self->{s_kwd} = '';
4114              $self->{ct}->{quirks} = 1;
4115            } else {
4116              
4117              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4118            }
4119            
4120          ## reconsume          ## reconsume
4121            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4122          redo A;          redo A;
4123        } else {        } else {
4124                    
4125          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4126          $self->{read_until}->($self->{ct}->{sysid}, q[">],          $self->{read_until}->($self->{ct}->{sysid}, q[">],
4127                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4128    
# Line 3362  sub _get_next_token ($) { Line 4156  sub _get_next_token ($) {
4156      }      }
4157        
4158          redo A;          redo A;
4159        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4160                    
4161          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4162    
4163          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4164            $self->{s_kwd} = '';
4165                    
4166      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4167        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3384  sub _get_next_token ($) { Line 4179  sub _get_next_token ($) {
4179    
4180          redo A;          redo A;
4181        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4182          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4183    
4184          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4185          ## reconsume            
4186              $self->{state} = DATA_STATE;
4187          $self->{ct}->{quirks} = 1;            $self->{s_kwd} = '';
4188          return  ($self->{ct}); # DOCTYPE            $self->{ct}->{quirks} = 1;
4189            } else {
4190              
4191              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4192            }
4193    
4194            ## reconsume
4195            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4196          redo A;          redo A;
4197        } else {        } else {
4198                    
4199          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4200          $self->{read_until}->($self->{ct}->{sysid}, q['>],          $self->{read_until}->($self->{ct}->{sysid}, q['>],
4201                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4202    
# Line 3417  sub _get_next_token ($) { Line 4216  sub _get_next_token ($) {
4216        }        }
4217      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4218        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4219                    if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4220          ## Stay in the state            
4221              $self->{state} = BEFORE_NDATA_STATE;
4222            } else {
4223              
4224              ## Stay in the state
4225            }
4226                    
4227      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4228        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3432  sub _get_next_token ($) { Line 4236  sub _get_next_token ($) {
4236        
4237          redo A;          redo A;
4238        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4239            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4240              
4241              $self->{state} = DATA_STATE;
4242              $self->{s_kwd} = '';
4243            } else {
4244              
4245              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4246            }
4247    
4248                    
4249          $self->{state} = DATA_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4250          $self->{line_prev} = $self->{line};
4251          $self->{column_prev} = $self->{column};
4252          $self->{column}++;
4253          $self->{nc}
4254              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4255        } else {
4256          $self->{set_nc}->($self);
4257        }
4258      
4259            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4260            redo A;
4261          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4262                   ($self->{nc} == 0x004E or # N
4263                    $self->{nc} == 0x006E)) { # n
4264            
4265            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4266            $self->{state} = NDATA_STATE;
4267            $self->{kwd} = chr $self->{nc};
4268                    
4269      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4270        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3445  sub _get_next_token ($) { Line 4276  sub _get_next_token ($) {
4276        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4277      }      }
4278        
4279            redo A;
4280          } elsif ($self->{nc} == -1) {
4281            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4282              
4283              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4284              $self->{state} = DATA_STATE;
4285              $self->{s_kwd} = '';
4286              $self->{ct}->{quirks} = 1;
4287            } else {
4288              
4289              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4290              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4291            }
4292    
4293            ## reconsume
4294            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4295            redo A;
4296          } elsif ($self->{is_xml} and
4297                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4298                   $self->{nc} == 0x005B) { # [
4299            
4300            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4301            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4302            $self->{in_subset} = 1;
4303            
4304        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4305          $self->{line_prev} = $self->{line};
4306          $self->{column_prev} = $self->{column};
4307          $self->{column}++;
4308          $self->{nc}
4309              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4310        } else {
4311          $self->{set_nc}->($self);
4312        }
4313      
4314          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4315            redo A;
4316          } else {
4317            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4318    
4319            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4320              
4321              #$self->{ct}->{quirks} = 1;
4322              $self->{state} = BOGUS_DOCTYPE_STATE;
4323            } else {
4324              
4325              $self->{state} = BOGUS_MD_STATE;
4326            }
4327    
4328            
4329        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4330          $self->{line_prev} = $self->{line};
4331          $self->{column_prev} = $self->{column};
4332          $self->{column}++;
4333          $self->{nc}
4334              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4335        } else {
4336          $self->{set_nc}->($self);
4337        }
4338      
4339            redo A;
4340          }
4341        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4342          if ($is_space->{$self->{nc}}) {
4343            
4344            ## Stay in the state.
4345            
4346        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4347          $self->{line_prev} = $self->{line};
4348          $self->{column_prev} = $self->{column};
4349          $self->{column}++;
4350          $self->{nc}
4351              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4352        } else {
4353          $self->{set_nc}->($self);
4354        }
4355      
4356            redo A;
4357          } elsif ($self->{nc} == 0x003E) { # >
4358            
4359            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4360            
4361        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4362          $self->{line_prev} = $self->{line};
4363          $self->{column_prev} = $self->{column};
4364          $self->{column}++;
4365          $self->{nc}
4366              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4367        } else {
4368          $self->{set_nc}->($self);
4369        }
4370      
4371            return  ($self->{ct}); # ENTITY
4372            redo A;
4373          } elsif ($self->{nc} == 0x004E or # N
4374                   $self->{nc} == 0x006E) { # n
4375            
4376            $self->{state} = NDATA_STATE;
4377            $self->{kwd} = chr $self->{nc};
4378            
4379        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4380          $self->{line_prev} = $self->{line};
4381          $self->{column_prev} = $self->{column};
4382          $self->{column}++;
4383          $self->{nc}
4384              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4385        } else {
4386          $self->{set_nc}->($self);
4387        }
4388      
4389          redo A;          redo A;
4390        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4391                    
4392          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4393          $self->{state} = DATA_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4394          ## reconsume          ## reconsume
4395            return  ($self->{ct}); # ENTITY
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4396          redo A;          redo A;
4397        } else {        } else {
4398                    
4399          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4400          #$self->{ct}->{quirks} = 1;          $self->{state} = BOGUS_MD_STATE;
   
         $self->{state} = BOGUS_DOCTYPE_STATE;  
4401                    
4402      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4403        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3482  sub _get_next_token ($) { Line 4415  sub _get_next_token ($) {
4415        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4416                    
4417          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4418            $self->{s_kwd} = '';
4419                    
4420      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4421        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3497  sub _get_next_token ($) { Line 4431  sub _get_next_token ($) {
4431          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4432    
4433          redo A;          redo A;
4434          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4435            
4436            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4437            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4438            $self->{in_subset} = 1;
4439            
4440        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4441          $self->{line_prev} = $self->{line};
4442          $self->{column_prev} = $self->{column};
4443          $self->{column}++;
4444          $self->{nc}
4445              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4446        } else {
4447          $self->{set_nc}->($self);
4448        }
4449      
4450            return  ($self->{ct}); # DOCTYPE
4451            redo A;
4452        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4453                    
4454          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4455            $self->{s_kwd} = '';
4456          ## reconsume          ## reconsume
4457    
4458          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
# Line 3508  sub _get_next_token ($) { Line 4461  sub _get_next_token ($) {
4461        } else {        } else {
4462                    
4463          my $s = '';          my $s = '';
4464          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
4465    
4466          ## Stay in the state          ## Stay in the state
4467                    
# Line 3528  sub _get_next_token ($) { Line 4481  sub _get_next_token ($) {
4481        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
4482        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4483        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
4484    
4485          ## XML5: "CDATA state".
4486                
4487        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4488                    
# Line 3545  sub _get_next_token ($) { Line 4500  sub _get_next_token ($) {
4500        
4501          redo A;          redo A;
4502        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4503            if ($self->{is_xml}) {
4504              
4505              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4506            } else {
4507              
4508            }
4509    
4510          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4511                    $self->{s_kwd} = '';
4512      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {          ## Reconsume.
       $self->{line_prev} = $self->{line};  
       $self->{column_prev} = $self->{column};  
       $self->{column}++;  
       $self->{nc}  
           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);  
     } else {  
       $self->{set_nc}->($self);  
     }  
     
4513          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
4514                        
4515            return  ($self->{ct}); # character            return  ($self->{ct}); # character
# Line 3589  sub _get_next_token ($) { Line 4542  sub _get_next_token ($) {
4542    
4543        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
4544      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4545          ## XML5: "CDATA bracket state".
4546    
4547        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4548                    
4549          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 3606  sub _get_next_token ($) { Line 4561  sub _get_next_token ($) {
4561          redo A;          redo A;
4562        } else {        } else {
4563                    
4564            ## XML5: If EOF, "]" is not appended and changed to the data state.
4565          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
4566          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4567          ## Reconsume.          ## Reconsume.
4568          redo A;          redo A;
4569        }        }
4570      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4571          ## XML5: "CDATA end state".
4572    
4573        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4574          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4575            $self->{s_kwd} = '';
4576                    
4577      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4578        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3653  sub _get_next_token ($) { Line 4612  sub _get_next_token ($) {
4612                    
4613          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
4614          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
4615          ## Reconsume.          ## Reconsume. ## XML5: Emit.
4616          redo A;          redo A;
4617        }        }
4618      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 3670  sub _get_next_token ($) { Line 4629  sub _get_next_token ($) {
4629        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
4630                    
4631          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
4632          $self->{s_kwd} = '#';          $self->{kwd} = '#';
4633                    
4634      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4635        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3690  sub _get_next_token ($) { Line 4649  sub _get_next_token ($) {
4649                    
4650          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
4651          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
4652          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
4653          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
4654          $self->{entity__match} = 0;          $self->{entity__match} = 0;
4655                    
4656      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3721  sub _get_next_token ($) { Line 4680  sub _get_next_token ($) {
4680        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4681                    
4682          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4683            $self->{s_kwd} = '';
4684          ## Reconsume.          ## Reconsume.
4685          return  ({type => CHARACTER_TOKEN, data => '&',          return  ({type => CHARACTER_TOKEN, data => '&',
4686                    line => $self->{line_prev},                    line => $self->{line_prev},
# Line 3731  sub _get_next_token ($) { Line 4691  sub _get_next_token ($) {
4691                    
4692          $self->{ca}->{value} .= '&';          $self->{ca}->{value} .= '&';
4693          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4694            $self->{s_kwd} = '';
4695          ## Reconsume.          ## Reconsume.
4696          redo A;          redo A;
4697        }        }
# Line 3739  sub _get_next_token ($) { Line 4700  sub _get_next_token ($) {
4700            $self->{nc} == 0x0058) { # X            $self->{nc} == 0x0058) { # X
4701                    
4702          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
4703          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4704                    
4705      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4706        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3756  sub _get_next_token ($) { Line 4717  sub _get_next_token ($) {
4717                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
4718                    
4719          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
4720          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
4721                    
4722      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4723        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3781  sub _get_next_token ($) { Line 4742  sub _get_next_token ($) {
4742          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
4743                        
4744            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4745              $self->{s_kwd} = '';
4746            ## Reconsume.            ## Reconsume.
4747            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4748                      data => '&#',                      data => '&#',
# Line 3792  sub _get_next_token ($) { Line 4754  sub _get_next_token ($) {
4754                        
4755            $self->{ca}->{value} .= '&#';            $self->{ca}->{value} .= '&#';
4756            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4757              $self->{s_kwd} = '';
4758            ## Reconsume.            ## Reconsume.
4759            redo A;            redo A;
4760          }          }
# Line 3800  sub _get_next_token ($) { Line 4763  sub _get_next_token ($) {
4763        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
4764            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
4765                    
4766          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
4767          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4768                    
4769          ## Stay in the state.          ## Stay in the state.
4770                    
# Line 3837  sub _get_next_token ($) { Line 4800  sub _get_next_token ($) {
4800          #          #
4801        }        }
4802    
4803        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4804        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4805        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4806        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 3857  sub _get_next_token ($) { Line 4820  sub _get_next_token ($) {
4820        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4821                    
4822          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4823            $self->{s_kwd} = '';
4824          ## Reconsume.          ## Reconsume.
4825          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4826                      has_reference => 1,
4827                    line => $l, column => $c,                    line => $l, column => $c,
4828                   });                   });
4829          redo A;          redo A;
# Line 3867  sub _get_next_token ($) { Line 4832  sub _get_next_token ($) {
4832          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
4833          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
4834          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4835            $self->{s_kwd} = '';
4836          ## Reconsume.          ## Reconsume.
4837          redo A;          redo A;
4838        }        }
# Line 3877  sub _get_next_token ($) { Line 4843  sub _get_next_token ($) {
4843          # 0..9, A..F, a..f          # 0..9, A..F, a..f
4844                    
4845          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
4846          $self->{s_kwd} = 0;          $self->{kwd} = 0;
4847          ## Reconsume.          ## Reconsume.
4848          redo A;          redo A;
4849        } else {        } else {
# Line 3892  sub _get_next_token ($) { Line 4858  sub _get_next_token ($) {
4858          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
4859                        
4860            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4861              $self->{s_kwd} = '';
4862            ## Reconsume.            ## Reconsume.
4863            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4864                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
4865                      line => $self->{line_prev},                      line => $self->{line_prev},
4866                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
4867                     });                     });
4868            redo A;            redo A;
4869          } else {          } else {
4870                        
4871            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
4872            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4873              $self->{s_kwd} = '';
4874            ## Reconsume.            ## Reconsume.
4875            redo A;            redo A;
4876          }          }
# Line 3911  sub _get_next_token ($) { Line 4879  sub _get_next_token ($) {
4879        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4880          # 0..9          # 0..9
4881                    
4882          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4883          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4884          ## Stay in the state.          ## Stay in the state.
4885                    
4886      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3929  sub _get_next_token ($) { Line 4897  sub _get_next_token ($) {
4897        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
4898                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
4899                    
4900          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4901          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
4902          ## Stay in the state.          ## Stay in the state.
4903                    
4904      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3947  sub _get_next_token ($) { Line 4915  sub _get_next_token ($) {
4915        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
4916                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
4917                    
4918          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4919          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
4920          ## Stay in the state.          ## Stay in the state.
4921                    
4922      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3985  sub _get_next_token ($) { Line 4953  sub _get_next_token ($) {
4953          #          #
4954        }        }
4955    
4956        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4957        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4958        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4959        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 4005  sub _get_next_token ($) { Line 4973  sub _get_next_token ($) {
4973        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4974                    
4975          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4976            $self->{s_kwd} = '';
4977          ## Reconsume.          ## Reconsume.
4978          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4979                      has_reference => 1,
4980                    line => $l, column => $c,                    line => $l, column => $c,
4981                   });                   });
4982          redo A;          redo A;
# Line 4015  sub _get_next_token ($) { Line 4985  sub _get_next_token ($) {
4985          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
4986          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
4987          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4988            $self->{s_kwd} = '';
4989          ## Reconsume.          ## Reconsume.
4990          redo A;          redo A;
4991        }        }
4992      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
4993        if (length $self->{s_kwd} < 30 and        if (length $self->{kwd} < 30 and
4994            ## NOTE: Some number greater than the maximum length of entity name            ## NOTE: Some number greater than the maximum length of entity name
4995            ((0x0041 <= $self->{nc} and # A            ((0x0041 <= $self->{nc} and # A
4996              $self->{nc} <= 0x005A) or # Z              $self->{nc} <= 0x005A) or # Z
# Line 4029  sub _get_next_token ($) { Line 5000  sub _get_next_token ($) {
5000              $self->{nc} <= 0x0039) or # 9              $self->{nc} <= 0x0039) or # 9
5001             $self->{nc} == 0x003B)) { # ;             $self->{nc} == 0x003B)) { # ;
5002          our $EntityChar;          our $EntityChar;
5003          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
5004          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}}) {
5005            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
5006                            
5007              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
5008              $self->{entity__match} = 1;              $self->{entity__match} = 1;
5009                            
5010      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4049  sub _get_next_token ($) { Line 5020  sub _get_next_token ($) {
5020              #              #
5021            } else {            } else {
5022                            
5023              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
5024              $self->{entity__match} = -1;              $self->{entity__match} = -1;
5025              ## Stay in the state.              ## Stay in the state.
5026                            
# Line 4097  sub _get_next_token ($) { Line 5068  sub _get_next_token ($) {
5068          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
5069              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
5070                        
5071            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
5072            #            #
5073          } else {          } else {
5074                        
# Line 4109  sub _get_next_token ($) { Line 5080  sub _get_next_token ($) {
5080                    
5081          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5082                          line => $self->{line_prev},                          line => $self->{line_prev},
5083                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
5084          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
5085          #          #
5086        }        }
5087        
# Line 4127  sub _get_next_token ($) { Line 5098  sub _get_next_token ($) {
5098        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
5099                    
5100          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
5101            $self->{s_kwd} = '';
5102          ## Reconsume.          ## Reconsume.
5103          return  ({type => CHARACTER_TOKEN,          return  ({type => CHARACTER_TOKEN,
5104                    data => $data,                    data => $data,
5105                      has_reference => $has_ref,
5106                    line => $self->{line_prev},                    line => $self->{line_prev},
5107                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
5108                   });                   });
5109          redo A;          redo A;
5110        } else {        } else {
# Line 4139  sub _get_next_token ($) { Line 5112  sub _get_next_token ($) {
5112          $self->{ca}->{value} .= $data;          $self->{ca}->{value} .= $data;
5113          $self->{ca}->{has_reference} = 1 if $has_ref;          $self->{ca}->{has_reference} = 1 if $has_ref;
5114          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
5115            $self->{s_kwd} = '';
5116            ## Reconsume.
5117            redo A;
5118          }
5119    
5120        ## XML-only states
5121    
5122        } elsif ($self->{state} == PI_STATE) {
5123          ## XML5: "Pi state" and "DOCTYPE pi state".
5124    
5125          if ($is_space->{$self->{nc}} or
5126              $self->{nc} == 0x003F or # ?
5127              $self->{nc} == -1) {
5128            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5129            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
5130            ## "DOCTYPE pi state": Parse error, switch to the "data
5131            ## state".
5132            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5133                            line => $self->{line_prev},
5134                            column => $self->{column_prev}
5135                                - 1 * ($self->{nc} != -1));
5136            $self->{state} = BOGUS_COMMENT_STATE;
5137            ## Reconsume.
5138            $self->{ct} = {type => COMMENT_TOKEN,
5139                           data => '?',
5140                           line => $self->{line_prev},
5141                           column => $self->{column_prev}
5142                               - 1 * ($self->{nc} != -1),
5143                          };
5144            redo A;
5145          } else {
5146            ## XML5: "DOCTYPE pi state": Stay in the state.
5147            $self->{ct} = {type => PI_TOKEN,
5148                           target => chr $self->{nc},
5149                           data => '',
5150                           line => $self->{line_prev},
5151                           column => $self->{column_prev} - 1,
5152                          };
5153            $self->{state} = PI_TARGET_STATE;
5154            
5155        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5156          $self->{line_prev} = $self->{line};
5157          $self->{column_prev} = $self->{column};
5158          $self->{column}++;
5159          $self->{nc}
5160              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5161        } else {
5162          $self->{set_nc}->($self);
5163        }
5164      
5165            redo A;
5166          }
5167        } elsif ($self->{state} == PI_TARGET_STATE) {
5168          if ($is_space->{$self->{nc}}) {
5169            $self->{state} = PI_TARGET_AFTER_STATE;
5170            
5171        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5172          $self->{line_prev} = $self->{line};
5173          $self->{column_prev} = $self->{column};
5174          $self->{column}++;
5175          $self->{nc}
5176              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5177        } else {
5178          $self->{set_nc}->($self);
5179        }
5180      
5181            redo A;
5182          } elsif ($self->{nc} == -1) {
5183            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5184            if ($self->{in_subset}) {
5185              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5186            } else {
5187              $self->{state} = DATA_STATE;
5188              $self->{s_kwd} = '';
5189            }
5190            ## Reconsume.
5191            return  ($self->{ct}); # pi
5192            redo A;
5193          } elsif ($self->{nc} == 0x003F) { # ?
5194            $self->{state} = PI_AFTER_STATE;
5195            
5196        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5197          $self->{line_prev} = $self->{line};
5198          $self->{column_prev} = $self->{column};
5199          $self->{column}++;
5200          $self->{nc}
5201              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5202        } else {
5203          $self->{set_nc}->($self);
5204        }
5205      
5206            redo A;
5207          } else {
5208            ## XML5: typo ("tag name" -> "target")
5209            $self->{ct}->{target} .= chr $self->{nc}; # pi
5210            
5211        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5212          $self->{line_prev} = $self->{line};
5213          $self->{column_prev} = $self->{column};
5214          $self->{column}++;
5215          $self->{nc}
5216              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5217        } else {
5218          $self->{set_nc}->($self);
5219        }
5220      
5221            redo A;
5222          }
5223        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5224          if ($is_space->{$self->{nc}}) {
5225            ## Stay in the state.
5226            
5227        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5228          $self->{line_prev} = $self->{line};
5229          $self->{column_prev} = $self->{column};
5230          $self->{column}++;
5231          $self->{nc}
5232              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5233        } else {
5234          $self->{set_nc}->($self);
5235        }
5236      
5237            redo A;
5238          } else {
5239            $self->{state} = PI_DATA_STATE;
5240            ## Reprocess.
5241            redo A;
5242          }
5243        } elsif ($self->{state} == PI_DATA_STATE) {
5244          if ($self->{nc} == 0x003F) { # ?
5245            $self->{state} = PI_DATA_AFTER_STATE;
5246            
5247        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5248          $self->{line_prev} = $self->{line};
5249          $self->{column_prev} = $self->{column};
5250          $self->{column}++;
5251          $self->{nc}
5252              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5253        } else {
5254          $self->{set_nc}->($self);
5255        }
5256      
5257            redo A;
5258          } elsif ($self->{nc} == -1) {
5259            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5260            if ($self->{in_subset}) {
5261              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5262            } else {
5263              $self->{state} = DATA_STATE;
5264              $self->{s_kwd} = '';
5265            }
5266            ## Reprocess.
5267            return  ($self->{ct}); # pi
5268            redo A;
5269          } else {
5270            $self->{ct}->{data} .= chr $self->{nc}; # pi
5271            $self->{read_until}->($self->{ct}->{data}, q[?],
5272                                  length $self->{ct}->{data});
5273            ## Stay in the state.
5274            
5275        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5276          $self->{line_prev} = $self->{line};
5277          $self->{column_prev} = $self->{column};
5278          $self->{column}++;
5279          $self->{nc}
5280              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5281        } else {
5282          $self->{set_nc}->($self);
5283        }
5284      
5285            ## The next input character has already been consumed; it is handled in this same state.
5286            redo A;
5287          }
5288        } elsif ($self->{state} == PI_AFTER_STATE) {
5289          ## XML5: Part of "Pi after state".
5290    
5291          if ($self->{nc} == 0x003E) { # >
5292            if ($self->{in_subset}) {
5293              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5294            } else {
5295              $self->{state} = DATA_STATE;
5296              $self->{s_kwd} = '';
5297            }
5298            
5299        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5300          $self->{line_prev} = $self->{line};
5301          $self->{column_prev} = $self->{column};
5302          $self->{column}++;
5303          $self->{nc}
5304              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5305        } else {
5306          $self->{set_nc}->($self);
5307        }
5308      
5309            return  ($self->{ct}); # pi
5310            redo A;
5311          } elsif ($self->{nc} == 0x003F) { # ?
5312            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5313                            line => $self->{line_prev},
5314                            column => $self->{column_prev}); ## XML5: no error
5315            $self->{ct}->{data} .= '?';
5316            $self->{state} = PI_DATA_AFTER_STATE;
5317            
5318        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5319          $self->{line_prev} = $self->{line};
5320          $self->{column_prev} = $self->{column};
5321          $self->{column}++;
5322          $self->{nc}
5323              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5324        } else {
5325          $self->{set_nc}->($self);
5326        }
5327      
5328            redo A;
5329          } else {
5330            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5331                            line => $self->{line_prev},
5332                            column => $self->{column_prev}
5333                                + 1 * ($self->{nc} == -1)); ## XML5: no error
5334            $self->{ct}->{data} .= '?'; ## XML5: not appended
5335            $self->{state} = PI_DATA_STATE;
5336            ## Reprocess.
5337            redo A;
5338          }
5339        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5340          ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5341    
5342          if ($self->{nc} == 0x003E) { # >
5343            if ($self->{in_subset}) {
5344              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5345            } else {
5346              $self->{state} = DATA_STATE;
5347              $self->{s_kwd} = '';
5348            }
5349            
5350        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5351          $self->{line_prev} = $self->{line};
5352          $self->{column_prev} = $self->{column};
5353          $self->{column}++;
5354          $self->{nc}
5355              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5356        } else {
5357          $self->{set_nc}->($self);
5358        }
5359      
5360            return  ($self->{ct}); # pi
5361            redo A;
5362          } elsif ($self->{nc} == 0x003F) { # ?
5363            $self->{ct}->{data} .= '?';
5364            ## Stay in the state.
5365            
5366        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5367          $self->{line_prev} = $self->{line};
5368          $self->{column_prev} = $self->{column};
5369          $self->{column}++;
5370          $self->{nc}
5371              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5372        } else {
5373          $self->{set_nc}->($self);
5374        }
5375      
5376            redo A;
5377          } else {
5378            $self->{ct}->{data} .= '?'; ## XML5: not appended
5379            $self->{state} = PI_DATA_STATE;
5380            ## Reprocess.
5381            redo A;
5382          }
5383    
5384        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5385          if ($self->{nc} == 0x003C) { # <
5386            $self->{state} = DOCTYPE_TAG_STATE;
5387            
5388        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5389          $self->{line_prev} = $self->{line};
5390          $self->{column_prev} = $self->{column};
5391          $self->{column}++;
5392          $self->{nc}
5393              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5394        } else {
5395          $self->{set_nc}->($self);
5396        }
5397      
5398            redo A;
5399          } elsif ($self->{nc} == 0x0025) { # %
5400            ## XML5: Not defined yet.
5401    
5402            ## TODO:
5403            
5404        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5405          $self->{line_prev} = $self->{line};
5406          $self->{column_prev} = $self->{column};
5407          $self->{column}++;
5408          $self->{nc}
5409              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5410        } else {
5411          $self->{set_nc}->($self);
5412        }
5413      
5414            redo A;
5415          } elsif ($self->{nc} == 0x005D) { # ]
5416            delete $self->{in_subset};
5417            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5418            
5419        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5420          $self->{line_prev} = $self->{line};
5421          $self->{column_prev} = $self->{column};
5422          $self->{column}++;
5423          $self->{nc}
5424              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5425        } else {
5426          $self->{set_nc}->($self);
5427        }
5428      
5429            redo A;
5430          } elsif ($is_space->{$self->{nc}}) {
5431            ## Stay in the state.
5432            
5433        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5434          $self->{line_prev} = $self->{line};
5435          $self->{column_prev} = $self->{column};
5436          $self->{column}++;
5437          $self->{nc}
5438              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5439        } else {
5440          $self->{set_nc}->($self);
5441        }
5442      
5443            redo A;
5444          } elsif ($self->{nc} == -1) {
5445            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5446            delete $self->{in_subset};
5447            $self->{state} = DATA_STATE;
5448            $self->{s_kwd} = '';
5449            ## Reconsume.
5450            return  ({type => END_OF_DOCTYPE_TOKEN});
5451            redo A;
5452          } else {
5453            unless ($self->{internal_subset_tainted}) {
5454              ## XML5: No parse error.
5455              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5456              $self->{internal_subset_tainted} = 1;
5457            }
5458            ## Stay in the state.
5459            
5460        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5461          $self->{line_prev} = $self->{line};
5462          $self->{column_prev} = $self->{column};
5463          $self->{column}++;
5464          $self->{nc}
5465              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5466        } else {
5467          $self->{set_nc}->($self);
5468        }
5469      
5470            redo A;
5471          }
5472        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5473          if ($self->{nc} == 0x003E) { # >
5474            $self->{state} = DATA_STATE;
5475            $self->{s_kwd} = '';
5476            
5477        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5478          $self->{line_prev} = $self->{line};
5479          $self->{column_prev} = $self->{column};
5480          $self->{column}++;
5481          $self->{nc}
5482              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5483        } else {
5484          $self->{set_nc}->($self);
5485        }
5486      
5487            return  ({type => END_OF_DOCTYPE_TOKEN});
5488            redo A;
5489          } elsif ($self->{nc} == -1) {
5490            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5491            $self->{state} = DATA_STATE;
5492            $self->{s_kwd} = '';
5493            ## Reconsume.
5494            return  ({type => END_OF_DOCTYPE_TOKEN});
5495            redo A;
5496          } else {
5497            ## XML5: No parse error and stay in the state.
5498            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5499    
5500            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5501            
5502        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5503          $self->{line_prev} = $self->{line};
5504          $self->{column_prev} = $self->{column};
5505          $self->{column}++;
5506          $self->{nc}
5507              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5508        } else {
5509          $self->{set_nc}->($self);
5510        }
5511      
5512            redo A;
5513          }
5514        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5515          if ($self->{nc} == 0x003E) { # >
5516            $self->{state} = DATA_STATE;
5517            $self->{s_kwd} = '';
5518            
5519        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5520          $self->{line_prev} = $self->{line};
5521          $self->{column_prev} = $self->{column};
5522          $self->{column}++;
5523          $self->{nc}
5524              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5525        } else {
5526          $self->{set_nc}->($self);
5527        }
5528      
5529            return  ({type => END_OF_DOCTYPE_TOKEN});
5530            redo A;
5531          } elsif ($self->{nc} == -1) {
5532            $self->{state} = DATA_STATE;
5533            $self->{s_kwd} = '';
5534            ## Reconsume.
5535            return  ({type => END_OF_DOCTYPE_TOKEN});
5536            redo A;
5537          } else {
5538            ## Stay in the state.
5539            
5540        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5541          $self->{line_prev} = $self->{line};
5542          $self->{column_prev} = $self->{column};
5543          $self->{column}++;
5544          $self->{nc}
5545              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5546        } else {
5547          $self->{set_nc}->($self);
5548        }
5549      
5550            redo A;
5551          }
5552        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5553          if ($self->{nc} == 0x0021) { # !
5554            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5555            
5556        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5557          $self->{line_prev} = $self->{line};
5558          $self->{column_prev} = $self->{column};
5559          $self->{column}++;
5560          $self->{nc}
5561              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5562        } else {
5563          $self->{set_nc}->($self);
5564        }
5565      
5566            redo A;
5567          } elsif ($self->{nc} == 0x003F) { # ?
5568            $self->{state} = PI_STATE;
5569            
5570        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5571          $self->{line_prev} = $self->{line};
5572          $self->{column_prev} = $self->{column};
5573          $self->{column}++;
5574          $self->{nc}
5575              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5576        } else {
5577          $self->{set_nc}->($self);
5578        }
5579      
5580            redo A;
5581          } elsif ($self->{nc} == -1) {
5582            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5583            $self->{state} = DATA_STATE;
5584            $self->{s_kwd} = '';
5585            ## Reconsume.
5586            redo A;
5587          } else {
5588            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5589                            line => $self->{line_prev},
5590                            column => $self->{column_prev});
5591            $self->{state} = BOGUS_COMMENT_STATE;
5592            $self->{ct} = {type => COMMENT_TOKEN,
5593                           data => '',
5594                          }; ## NOTE: Will be discarded.
5595            
5596        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5597          $self->{line_prev} = $self->{line};
5598          $self->{column_prev} = $self->{column};
5599          $self->{column}++;
5600          $self->{nc}
5601              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5602        } else {
5603          $self->{set_nc}->($self);
5604        }
5605      
5606            redo A;
5607          }
5608        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5609          ## XML5: "DOCTYPE markup declaration state".
5610          
5611          if ($self->{nc} == 0x002D) { # -
5612            $self->{state} = MD_HYPHEN_STATE;
5613            
5614        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5615          $self->{line_prev} = $self->{line};
5616          $self->{column_prev} = $self->{column};
5617          $self->{column}++;
5618          $self->{nc}
5619              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5620        } else {
5621          $self->{set_nc}->($self);
5622        }
5623      
5624            redo A;
5625          } elsif ($self->{nc} == 0x0045 or # E
5626                   $self->{nc} == 0x0065) { # e
5627            $self->{state} = MD_E_STATE;
5628            $self->{kwd} = chr $self->{nc};
5629            
5630        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5631          $self->{line_prev} = $self->{line};
5632          $self->{column_prev} = $self->{column};
5633          $self->{column}++;
5634          $self->{nc}
5635              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5636        } else {
5637          $self->{set_nc}->($self);
5638        }
5639      
5640            redo A;
5641          } elsif ($self->{nc} == 0x0041 or # A
5642                   $self->{nc} == 0x0061) { # a
5643            $self->{state} = MD_ATTLIST_STATE;
5644            $self->{kwd} = chr $self->{nc};
5645            
5646        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5647          $self->{line_prev} = $self->{line};
5648          $self->{column_prev} = $self->{column};
5649          $self->{column}++;
5650          $self->{nc}
5651              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5652        } else {
5653          $self->{set_nc}->($self);
5654        }
5655      
5656            redo A;
5657          } elsif ($self->{nc} == 0x004E or # N
5658                   $self->{nc} == 0x006E) { # n
5659            $self->{state} = MD_NOTATION_STATE;
5660            $self->{kwd} = chr $self->{nc};
5661            
5662        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5663          $self->{line_prev} = $self->{line};
5664          $self->{column_prev} = $self->{column};
5665          $self->{column}++;
5666          $self->{nc}
5667              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5668        } else {
5669          $self->{set_nc}->($self);
5670        }
5671      
5672            redo A;
5673          } else {
5674            #
5675          }
5676          
5677          ## XML5: No parse error.
5678          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5679                          line => $self->{line_prev},
5680                          column => $self->{column_prev} - 1);
5681          ## Reconsume.
5682          $self->{state} = BOGUS_COMMENT_STATE;
5683          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5684          redo A;
5685        } elsif ($self->{state} == MD_E_STATE) {
5686          if ($self->{nc} == 0x004E or # N
5687              $self->{nc} == 0x006E) { # n
5688            $self->{state} = MD_ENTITY_STATE;
5689            $self->{kwd} .= chr $self->{nc};
5690            
5691        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5692          $self->{line_prev} = $self->{line};
5693          $self->{column_prev} = $self->{column};
5694          $self->{column}++;
5695          $self->{nc}
5696              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5697        } else {
5698          $self->{set_nc}->($self);
5699        }
5700      
5701            redo A;
5702          } elsif ($self->{nc} == 0x004C or # L
5703                   $self->{nc} == 0x006C) { # l
5704            ## XML5: <!ELEMENT> not supported.
5705            $self->{state} = MD_ELEMENT_STATE;
5706            $self->{kwd} .= chr $self->{nc};
5707            
5708        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5709          $self->{line_prev} = $self->{line};
5710          $self->{column_prev} = $self->{column};
5711          $self->{column}++;
5712          $self->{nc}
5713              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5714        } else {
5715          $self->{set_nc}->($self);
5716        }
5717      
5718            redo A;
5719          } else {
5720            ## XML5: No parse error.
5721            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5722                            line => $self->{line_prev},
5723                            column => $self->{column_prev} - 2
5724                                + 1 * ($self->{nc} == -1));
5725            ## Reconsume.
5726            $self->{state} = BOGUS_COMMENT_STATE;
5727            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5728            redo A;
5729          }
5730        } elsif ($self->{state} == MD_ENTITY_STATE) {
              ## Markup declaration keyword state for "<!ENTITY".
              ##
              ## |$self->{kwd}| holds the keyword characters accepted so far
              ## and its length selects the code point expected next.  The
              ## first table is the uppercase spelling, the second the
              ## lowercase one, so the keyword is matched case-insensitively.
              ## Slots 0 and 1 are |undef| placeholders; presumably |kwd|
              ## already contains the first two characters when this state
              ## is entered (that code is outside this excerpt).
5731          if ($self->{nc} == [
5732                undef,
5733                undef,
5734                0x0054, # T
5735                0x0049, # I
5736                0x0054, # T
5737              ]->[length $self->{kwd}] or
5738              $self->{nc} == [
5739                undef,
5740                undef,
5741                0x0074, # t
5742                0x0069, # i
5743                0x0074, # t
5744              ]->[length $self->{kwd}]) {
5745            ## Stay in the state.
5746            $self->{kwd} .= chr $self->{nc};
5747            
                ## Consume the next input character: read it from
                ## |char_buffer| when one is available, otherwise let the
                ## |set_nc| callback supply it.
5748        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5749          $self->{line_prev} = $self->{line};
5750          $self->{column_prev} = $self->{column};
5751          $self->{column}++;
5752          $self->{nc}
5753              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5754        } else {
5755          $self->{set_nc}->($self);
5756        }
5757      
5758            redo A;
5759          } elsif ((length $self->{kwd}) == 5 and
5760                   ($self->{nc} == 0x0059 or # Y
5761                    $self->{nc} == 0x0079)) { # y
                ## Final letter of the keyword.  Anything other than the
                ## exact uppercase spelling "ENTITY" is reported; the error
                ## column (column_prev - 4) is that of the keyword's first
                ## letter.
5762            if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5763              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5764                              text => 'ENTITY',
5765                              line => $self->{line_prev},
5766                              column => $self->{column_prev} - 4);
5767            }
                ## Start a general entity token.  |column_prev| is the column
                ## of the fifth keyword letter here, so |column_prev - 6| is
                ## the column of the "<" that opens the declaration.
5768            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5769                           line => $self->{line_prev},
5770                           column => $self->{column_prev} - 6};
5771            $self->{state} = DOCTYPE_MD_STATE;
5772            
5773        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5774          $self->{line_prev} = $self->{line};
5775          $self->{column_prev} = $self->{column};
5776          $self->{column}++;
5777          $self->{nc}
5778              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5779        } else {
5780          $self->{set_nc}->($self);
5781        }
5782      
5783            redo A;
5784          } else {
                ## Not the ENTITY keyword: report it and switch to the bogus
                ## comment state.  The column is moved back over the
                ## characters collected in |kwd|, with a one-column
                ## adjustment when the current character is EOF (-1).
5785            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5786                            line => $self->{line_prev},
5787                            column => $self->{column_prev} - 1
5788                                - (length $self->{kwd})
5789                                + 1 * ($self->{nc} == -1));
5790            $self->{state} = BOGUS_COMMENT_STATE;
5791            ## Reconsume.
5792            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5793            redo A;
5794          }
5795        } elsif ($self->{state} == MD_ELEMENT_STATE) {
              ## Markup declaration keyword state for "<!ELEMENT".
              ##
              ## |$self->{kwd}| holds the keyword characters accepted so far
              ## and its length selects the code point expected next
              ## (uppercase table first, lowercase table second).  Slots 0
              ## and 1 are |undef| placeholders; presumably |kwd| already
              ## contains the first two characters when this state is
              ## entered (that code is outside this excerpt).
5796          if ($self->{nc} == [
5797               undef,
5798               undef,
5799               0x0045, # E
5800               0x004D, # M
5801               0x0045, # E
5802               0x004E, # N
5803              ]->[length $self->{kwd}] or
5804              $self->{nc} == [
5805               undef,
5806               undef,
5807               0x0065, # e
5808               0x006D, # m
5809               0x0065, # e
5810               0x006E, # n
5811              ]->[length $self->{kwd}]) {
5812            ## Stay in the state.
5813            $self->{kwd} .= chr $self->{nc};
5814            
                ## Consume the next input character: read it from
                ## |char_buffer| when one is available, otherwise let the
                ## |set_nc| callback supply it.
5815        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5816          $self->{line_prev} = $self->{line};
5817          $self->{column_prev} = $self->{column};
5818          $self->{column}++;
5819          $self->{nc}
5820              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5821        } else {
5822          $self->{set_nc}->($self);
5823        }
5824      
5825            redo A;
5826          } elsif ((length $self->{kwd}) == 6 and
5827                   ($self->{nc} == 0x0054 or # T
5828                    $self->{nc} == 0x0074)) { # t
                ## Final letter of the keyword.  Anything other than the
                ## exact uppercase spelling "ELEMENT" is reported; the error
                ## column (column_prev - 5) is that of the keyword's first
                ## letter.
5829            if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5830              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5831                              text => 'ELEMENT',
5832                              line => $self->{line_prev},
5833                              column => $self->{column_prev} - 5);
5834            }
                ## Start an element declaration token.  |column_prev| is the
                ## column of the sixth keyword letter here, so the "<" that
                ## opens the declaration is seven columns back (the ENTITY
                ## keyword is one letter shorter and uses - 6 for the same
                ## position).
5835            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5836                           line => $self->{line_prev},
5837                           column => $self->{column_prev} - 7};
5838            $self->{state} = DOCTYPE_MD_STATE;
5839            
5840        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5841          $self->{line_prev} = $self->{line};
5842          $self->{column_prev} = $self->{column};
5843          $self->{column}++;
5844          $self->{nc}
5845              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5846        } else {
5847          $self->{set_nc}->($self);
5848        }
5849      
5850            redo A;
5851          } else {
                ## Not the ELEMENT keyword: report it and switch to the bogus
                ## comment state.  The column is moved back over the
                ## characters collected in |kwd|, with a one-column
                ## adjustment when the current character is EOF (-1).
5852            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5853                            line => $self->{line_prev},
5854                            column => $self->{column_prev} - 1
5855                                - (length $self->{kwd})
5856                                + 1 * ($self->{nc} == -1));
5857            $self->{state} = BOGUS_COMMENT_STATE;
5858            ## Reconsume.
5859            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5860            redo A;
5861          }
5862        } elsif ($self->{state} == MD_ATTLIST_STATE) {
              ## Markup declaration keyword state for "<!ATTLIST".
              ##
              ## |$self->{kwd}| holds the keyword characters accepted so far
              ## and its length selects the code point expected next
              ## (uppercase table first, lowercase table second).  Slot 0 is
              ## an |undef| placeholder; presumably |kwd| already contains
              ## the first character when this state is entered (that code
              ## is outside this excerpt).
5863          if ($self->{nc} == [
5864               undef,
5865               0x0054, # T
5866               0x0054, # T
5867               0x004C, # L
5868               0x0049, # I
5869               0x0053, # S
5870              ]->[length $self->{kwd}] or
5871              $self->{nc} == [
5872               undef,
5873               0x0074, # t
5874               0x0074, # t
5875               0x006C, # l
5876               0x0069, # i
5877               0x0073, # s
5878              ]->[length $self->{kwd}]) {
5879            ## Stay in the state.
5880            $self->{kwd} .= chr $self->{nc};
5881            
                ## Consume the next input character: read it from
                ## |char_buffer| when one is available, otherwise let the
                ## |set_nc| callback supply it.
5882        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5883          $self->{line_prev} = $self->{line};
5884          $self->{column_prev} = $self->{column};
5885          $self->{column}++;
5886          $self->{nc}
5887              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5888        } else {
5889          $self->{set_nc}->($self);
5890        }
5891      
5892            redo A;
5893          } elsif ((length $self->{kwd}) == 6 and
5894                   ($self->{nc} == 0x0054 or # T
5895                    $self->{nc} == 0x0074)) { # t
                ## Final letter of the keyword.  Anything other than the
                ## exact uppercase spelling "ATTLIST" is reported; the error
                ## column (column_prev - 5) is that of the keyword's first
                ## letter.
5896            if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
5897              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5898                              text => 'ATTLIST',
5899                              line => $self->{line_prev},
5900                              column => $self->{column_prev} - 5);
5901            }
                ## Start an attribute list declaration token with an empty
                ## list of attribute definitions.  |column_prev| is the
                ## column of the sixth keyword letter here, so the "<" that
                ## opens the declaration is seven columns back (the ENTITY
                ## keyword is one letter shorter and uses - 6 for the same
                ## position).
5902            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
5903                           attrdefs => [],
5904                           line => $self->{line_prev},
5905                           column => $self->{column_prev} - 7};
5906            $self->{state} = DOCTYPE_MD_STATE;
5907            
5908        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5909          $self->{line_prev} = $self->{line};
5910          $self->{column_prev} = $self->{column};
5911          $self->{column}++;
5912          $self->{nc}
5913              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5914        } else {
5915          $self->{set_nc}->($self);
5916        }
5917      
5918            redo A;
5919          } else {
                ## Not the ATTLIST keyword: report it and switch to the bogus
                ## comment state.  The column is moved back over the
                ## characters collected in |kwd|, with a one-column
                ## adjustment when the current character is EOF (-1).
5920            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5921                            line => $self->{line_prev},
5922                            column => $self->{column_prev} - 1
5923                                 - (length $self->{kwd})
5924                                 + 1 * ($self->{nc} == -1));
5925            $self->{state} = BOGUS_COMMENT_STATE;
5926            ## Reconsume.
5927            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5928            redo A;
5929          }
5930        } elsif ($self->{state} == MD_NOTATION_STATE) {
              ## Markup declaration keyword state for "<!NOTATION".
              ##
              ## |$self->{kwd}| holds the keyword characters accepted so far
              ## and its length selects the code point expected next
              ## (uppercase table first, lowercase table second).  Slot 0 is
              ## an |undef| placeholder; presumably |kwd| already contains
              ## the first character when this state is entered (that code
              ## is outside this excerpt).
5931          if ($self->{nc} == [
5932               undef,
5933               0x004F, # O
5934               0x0054, # T
5935               0x0041, # A
5936               0x0054, # T
5937               0x0049, # I
5938               0x004F, # O
5939              ]->[length $self->{kwd}] or
5940              $self->{nc} == [
5941               undef,
5942               0x006F, # o
5943               0x0074, # t
5944               0x0061, # a
5945               0x0074, # t
5946               0x0069, # i
5947               0x006F, # o
5948              ]->[length $self->{kwd}]) {
5949            ## Stay in the state.
5950            $self->{kwd} .= chr $self->{nc};
5951            
                ## Consume the next input character: read it from
                ## |char_buffer| when one is available, otherwise let the
                ## |set_nc| callback supply it.
5952        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5953          $self->{line_prev} = $self->{line};
5954          $self->{column_prev} = $self->{column};
5955          $self->{column}++;
5956          $self->{nc}
5957              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5958        } else {
5959          $self->{set_nc}->($self);
5960        }
5961      
5962            redo A;
5963          } elsif ((length $self->{kwd}) == 7 and
5964                   ($self->{nc} == 0x004E or # N
5965                    $self->{nc} == 0x006E)) { # n
                ## Final letter of the keyword.  Anything other than the
                ## exact uppercase spelling "NOTATION" is reported; the error
                ## column (column_prev - 6) is that of the keyword's first
                ## letter.
5966            if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
5967              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5968                              text => 'NOTATION',
5969                              line => $self->{line_prev},
5970                              column => $self->{column_prev} - 6);
5971            }
                ## Start a notation declaration token.  |column_prev| is the
                ## column of the seventh keyword letter here, so the "<" that
                ## opens the declaration is eight columns back (- 6 would be
                ## the keyword's first letter, as used for the error above).
5972            $self->{ct} = {type => NOTATION_TOKEN, name => '',
5973                           line => $self->{line_prev},
5974                           column => $self->{column_prev} - 8};
5975            $self->{state} = DOCTYPE_MD_STATE;
5976            
5977        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5978          $self->{line_prev} = $self->{line};
5979          $self->{column_prev} = $self->{column};
5980          $self->{column}++;
5981          $self->{nc}
5982              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5983        } else {
5984          $self->{set_nc}->($self);
5985        }
5986      
5987            redo A;
5988          } else {
                ## Not the NOTATION keyword: report it and switch to the
                ## bogus comment state.  The column is moved back over the
                ## characters collected in |kwd|, with a one-column
                ## adjustment when the current character is EOF (-1).
5989            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5990                            line => $self->{line_prev},
5991                            column => $self->{column_prev} - 1
5992                                - (length $self->{kwd})
5993                                + 1 * ($self->{nc} == -1));
5994            $self->{state} = BOGUS_COMMENT_STATE;
5995            ## Reconsume.
5996            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5997            redo A;
5998          }
5999        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6000          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6001          ## "DOCTYPE NOTATION state".
6002    
              ## State right after a complete markup declaration keyword
              ## (the ENTITY, ELEMENT, ATTLIST and NOTATION keyword states
              ## all switch here).  White space is expected before the
              ## declared name; every other character is either reported and
              ## handed on to a later state, or ends the declaration.
6003          if ($is_space->{$self->{nc}}) {
6004            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6005            $self->{state} = BEFORE_MD_NAME_STATE;
6006            
                ## Consume the next input character: read it from
                ## |char_buffer| when one is available, otherwise let the
                ## |set_nc| callback supply it.
6007        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6008          $self->{line_prev} = $self->{line};
6009          $self->{column_prev} = $self->{column};
6010          $self->{column}++;
6011          $self->{nc}
6012              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6013        } else {
6014          $self->{set_nc}->($self);
6015        }
6016      
6017            redo A;
6018          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6019                   $self->{nc} == 0x0025) { # %
                ## "%" directly after the ENTITY keyword: the missing space
                ## is reported, then the "%" is handled as in
                ## BEFORE_MD_NAME_STATE.
6020            ## XML5: Switch to the "DOCTYPE bogus comment state".
6021            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6022            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6023            
6024        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6025          $self->{line_prev} = $self->{line};
6026          $self->{column_prev} = $self->{column};
6027          $self->{column}++;
6028          $self->{nc}
6029              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6030        } else {
6031          $self->{set_nc}->($self);
6032        }
6033      
6034            redo A;
6035          } elsif ($self->{nc} == -1) {
                ## End of input inside the declaration.  No token is
                ## returned from this branch; the EOF is left for the
                ## internal subset state to process.
6036            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6037            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6038            ## Reconsume.
6039            redo A;
6040          } elsif ($self->{nc} == 0x003E) { # >
                ## Declaration closed before any name was given.  No token
                ## is returned from this branch.
6041            ## XML5: Switch to the "DOCTYPE bogus comment state".
6042            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6043            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6044            
6045        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6046          $self->{line_prev} = $self->{line};
6047          $self->{column_prev} = $self->{column};
6048          $self->{column}++;
6049          $self->{nc}
6050              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6051        } else {
6052          $self->{set_nc}->($self);
6053        }
6054      
6055            redo A;
6056          } else {
                ## Any other character: report the missing space and let
                ## BEFORE_MD_NAME_STATE process the same character (it is
                ## not consumed here).
6057            ## XML5: Switch to the "DOCTYPE bogus comment state".
6058            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6059            $self->{state} = BEFORE_MD_NAME_STATE;
6060            redo A;
6061          }
6062        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6063          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6064          ## before state", "DOCTYPE ATTLIST name before state".
6065    
              ## Skips white space in front of the declared name and starts
              ## collecting the name in |$self->{ct}->{name}| at the first
              ## character that is not white space, "%" (ENTITY only), ">"
              ## or EOF.
6066          if ($is_space->{$self->{nc}}) {
6067            ## Stay in the state.
6068            
                ## Consume the next input character: read it from
                ## |char_buffer| when one is available, otherwise let the
                ## |set_nc| callback supply it.
6069        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6070          $self->{line_prev} = $self->{line};
6071          $self->{column_prev} = $self->{column};
6072          $self->{column}++;
6073          $self->{nc}
6074              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6075        } else {
6076          $self->{set_nc}->($self);
6077        }
6078      
6079            redo A;
6080          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6081                   $self->{nc} == 0x0025) { # %
                ## "%" while the token is still a general entity: the next
                ## state decides whether this is a parameter entity
                ## declaration.  After that state has changed the token type
                ## to PARAMETER_ENTITY_TOKEN this branch no longer matches,
                ## so a second "%" becomes part of the name.
6082            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6083            
6084        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6085          $self->{line_prev} = $self->{line};
6086          $self->{column_prev} = $self->{column};
6087          $self->{column}++;
6088          $self->{nc}
6089              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6090        } else {
6091          $self->{set_nc}->($self);
6092        }
6093      
6094            redo A;
6095          } elsif ($self->{nc} == 0x003E) { # >
                ## Declaration closed before any name was given.  No token
                ## is returned from this branch.
6096            ## XML5: Same as "Anything else".
6097            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6098            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6099            
6100        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6101          $self->{line_prev} = $self->{line};
6102          $self->{column_prev} = $self->{column};
6103          $self->{column}++;
6104          $self->{nc}
6105              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6106        } else {
6107          $self->{set_nc}->($self);
6108        }
6109      
6110            redo A;
6111          } elsif ($self->{nc} == -1) {
                ## End of input inside the declaration.  No token is
                ## returned from this branch; the EOF is left for the
                ## internal subset state to process.
6112            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6113            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6114            ## Reconsume.
6115            redo A;
6116          } else {
                ## First character of the declared name.  No check of the
                ## character against name syntax is made here.
6117            ## XML5: [ATTLIST] Not defined yet.
6118            $self->{ct}->{name} .= chr $self->{nc};
6119            $self->{state} = MD_NAME_STATE;
6120            
6121        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6122          $self->{line_prev} = $self->{line};
6123          $self->{column_prev} = $self->{column};
6124          $self->{column}++;
6125          $self->{nc}
6126              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6127        } else {
6128          $self->{set_nc}->($self);
6129        }
6130      
6131            redo A;
6132          }
6133        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
              ## State after the "%" of an ENTITY declaration.  White space
              ## must follow; only then is the current token turned into a
              ## parameter entity token.
6134          if ($is_space->{$self->{nc}}) {
6135            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6136            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6137            $self->{state} = BEFORE_MD_NAME_STATE;
6138            
                ## Consume the next input character: read it from
                ## |char_buffer| when one is available, otherwise let the
                ## |set_nc| callback supply it.
6139        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6140          $self->{line_prev} = $self->{line};
6141          $self->{column_prev} = $self->{column};
6142          $self->{column}++;
6143          $self->{nc}
6144              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6145        } else {
6146          $self->{set_nc}->($self);
6147        }
6148      
6149            redo A;
6150          } elsif ($self->{nc} == 0x003E) { # >
                ## Declaration closed right after the "%".  No token is
                ## returned from this branch.
6151            ## XML5: Same as "Anything else".
6152            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6153            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6154            
6155        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6156          $self->{line_prev} = $self->{line};
6157          $self->{column_prev} = $self->{column};
6158          $self->{column}++;
6159          $self->{nc}
6160              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6161        } else {
6162          $self->{set_nc}->($self);
6163        }
6164      
6165            redo A;
6166          } elsif ($self->{nc} == -1) {
                ## End of input inside the declaration.  No token is
                ## returned from this branch; the EOF is left for the
                ## internal subset state to process.
6167            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6168            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6169            ## Reconsume.
6170            redo A;
6171          } else {
                ## "%" immediately followed by some other character: the
                ## declaration is abandoned.  The current token is replaced
                ## by a throw-away comment token and the character is left
                ## for the bogus comment state.
6172            ## XML5: No parse error.
6173            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6174            $self->{state} = BOGUS_COMMENT_STATE;
6175            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6176            ## Reconsume.
6177            redo A;
6178          }
6179        } elsif ($self->{state} == MD_NAME_STATE) {
6180          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6181          
              ## Collects the declared name in |$self->{ct}->{name}| until
              ## white space, ">" or EOF.  The ">" and EOF branches return
              ## the current token to the caller.
6182          if ($is_space->{$self->{nc}}) {
                ## The name is complete; the follow-up state depends on the
                ## kind of declaration.
6183            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6184              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6185            } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
                  ## ELEMENT declarations are sent to the ATTLIST state for
                  ## now (see the TODO below).
6186              ## TODO: ...
6187              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6188            } else { # ENTITY/NOTATION
6189              $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6190            }
6191            
                ## Consume the next input character: read it from
                ## |char_buffer| when one is available, otherwise let the
                ## |set_nc| callback supply it.
6192        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6193          $self->{line_prev} = $self->{line};
6194          $self->{column_prev} = $self->{column};
6195          $self->{column}++;
6196          $self->{nc}
6197              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6198        } else {
6199          $self->{set_nc}->($self);
6200        }
6201      
6202            redo A;
6203          } elsif ($self->{nc} == 0x003E) { # >
                ## Declaration closed right after the name.  Only an ATTLIST
                ## declaration is accepted without a parse error here; all
                ## other token types get 'no md def'.
6204            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6205              #
6206            } else {
6207              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6208            }
6209            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6210            
6211        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6212          $self->{line_prev} = $self->{line};
6213          $self->{column_prev} = $self->{column};
6214          $self->{column}++;
6215          $self->{nc}
6216              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6217        } else {
6218          $self->{set_nc}->($self);
6219        }
6220      
                ## Emit the token.  The |redo A| that follows is never
                ## reached because of the |return|.
6221            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6222            redo A;
6223          } elsif ($self->{nc} == -1) {
                ## End of input inside the name: the token collected so far
                ## is still returned, and the EOF is left for the internal
                ## subset state to process on the next call.
6224            ## XML5: [ATTLIST] No parse error.
6225            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6226            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6227            ## Reconsume.
6228            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6229            redo A;
6230          } else {
                ## Another name character.  No check of the character
                ## against name syntax is made here.
6231            ## XML5: [ATTLIST] Not defined yet.
6232            $self->{ct}->{name} .= chr $self->{nc};
6233            ## Stay in the state.
6234            
6235        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6236          $self->{line_prev} = $self->{line};
6237          $self->{column_prev} = $self->{column};
6238          $self->{column}++;
6239          $self->{nc}
6240              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6241        } else {
6242          $self->{set_nc}->($self);
6243        }
6244      
6245            redo A;
6246          }
6247        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
              ## State after the declared name, entered from MD_NAME_STATE
              ## for ATTLIST and ELEMENT tokens.  Skips white space, ends the
              ## declaration on ">" or EOF, and otherwise starts a new
              ## attribute definition.
6248          if ($is_space->{$self->{nc}}) {
6249            ## Stay in the state.
6250            
                ## Consume the next input character: read it from
                ## |char_buffer| when one is available, otherwise let the
                ## |set_nc| callback supply it.
6251        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6252          $self->{line_prev} = $self->{line};
6253          $self->{column_prev} = $self->{column};
6254          $self->{column}++;
6255          $self->{nc}
6256              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6257        } else {
6258          $self->{set_nc}->($self);
6259        }
6260      
6261            redo A;
6262          } elsif ($self->{nc} == 0x003E) { # >
                ## End of the declaration: consume the ">" and return the
                ## token.  The |redo A| after the |return| is never reached.
6263            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6264            
6265        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6266          $self->{line_prev} = $self->{line};
6267          $self->{column_prev} = $self->{column};
6268          $self->{column}++;
6269          $self->{nc}
6270              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6271        } else {
6272          $self->{set_nc}->($self);
6273        }
6274      
6275            return  ($self->{ct}); # ATTLIST
6276            redo A;
6277          } elsif ($self->{nc} == -1) {
                ## End of input inside the declaration: the token collected
                ## so far is still returned.  The EOF is not consumed, so the
                ## internal subset state sees it on the next call.
6278            ## XML5: No parse error.
6279            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6280            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6281            return  ($self->{ct});
6282            redo A;
6283          } else {
                ## First character of an attribute name.  A fresh attribute
                ## definition is started in |$self->{ca}|, positioned at the
                ## current line and column, with an empty |tokens| list.
6284            ## XML5: Not defined yet.
6285            $self->{ca} = {name => chr ($self->{nc}), # attrdef
6286                           tokens => [],
6287                           line => $self->{line}, column => $self->{column}};
6288            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6289            
6290        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6291          $self->{line_prev} = $self->{line};
6292          $self->{column_prev} = $self->{column};
6293          $self->{column}++;
6294          $self->{nc}
6295              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6296        } else {
6297          $self->{set_nc}->($self);
6298        }
6299      
6300            redo A;
6301          }
6302        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
              ## Collects the name of the attribute definition in
              ## |$self->{ca}->{name}| until white space, ">", "(" or EOF.
              ## NOTE(review): the ">" and EOF branches return the token
              ## without storing |$self->{ca}| anywhere in this state --
              ## confirm that dropping the unfinished definition is intended.
6303          if ($is_space->{$self->{nc}}) {
6304            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6305            
                ## Consume the next input character: read it from
                ## |char_buffer| when one is available, otherwise let the
                ## |set_nc| callback supply it.
6306        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6307          $self->{line_prev} = $self->{line};
6308          $self->{column_prev} = $self->{column};
6309          $self->{column}++;
6310          $self->{nc}
6311              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6312        } else {
6313          $self->{set_nc}->($self);
6314        }
6315      
6316            redo A;
6317          } elsif ($self->{nc} == 0x003E) { # >
                ## Declaration closed before the attribute type was given:
                ## report it, consume the ">" and return the token.  The
                ## |redo A| after the |return| is never reached.
6318            ## XML5: Same as "anything else".
6319            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6320            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6321            
6322        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6323          $self->{line_prev} = $self->{line};
6324          $self->{column_prev} = $self->{column};
6325          $self->{column}++;
6326          $self->{nc}
6327              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6328        } else {
6329          $self->{set_nc}->($self);
6330        }
6331      
6332            return  ($self->{ct}); # ATTLIST
6333            redo A;
6334          } elsif ($self->{nc} == 0x0028) { # (
                ## "(" directly after the attribute name: the missing space
                ## is reported and the allowed-token list begins.
6335            ## XML5: Same as "anything else".
6336            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6337            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6338            
6339        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6340          $self->{line_prev} = $self->{line};
6341          $self->{column_prev} = $self->{column};
6342          $self->{column}++;
6343          $self->{nc}
6344              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6345        } else {
6346          $self->{set_nc}->($self);
6347        }
6348      
6349            redo A;
6350          } elsif ($self->{nc} == -1) {
                ## End of input inside the declaration: the token collected
                ## so far is returned.
6351            ## XML5: No parse error.
6352            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6353            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
                ## Reconsume.  The EOF is deliberately not consumed, so that
                ## the internal subset state sees it on the next call, as in
                ## the EOF branches of the preceding declaration states.
6365            return  ($self->{ct}); # ATTLIST
6366            redo A;
6367          } else {
                ## Another character of the attribute name.  No check of the
                ## character against name syntax is made here.
6368            ## XML5: Not defined yet.
6369            $self->{ca}->{name} .= chr $self->{nc};
6370            ## Stay in the state.
6371            
6372        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6373          $self->{line_prev} = $self->{line};
6374          $self->{column_prev} = $self->{column};
6375          $self->{column}++;
6376          $self->{nc}
6377              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6378        } else {
6379          $self->{set_nc}->($self);
6380        }
6381      
6382            redo A;
6383          }
6384        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
              ## State after the name of an attribute definition.  Skips
              ## white space, then expects either "(" (start of an allowed
              ## token list) or the first character of the attribute type.
              ## NOTE(review): the ">" and EOF branches return the token
              ## without storing |$self->{ca}| anywhere in this state --
              ## confirm that dropping the unfinished definition is intended.
6385          if ($is_space->{$self->{nc}}) {
6386            ## Stay in the state.
6387            
                ## Consume the next input character: read it from
                ## |char_buffer| when one is available, otherwise let the
                ## |set_nc| callback supply it.
6388        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6389          $self->{line_prev} = $self->{line};
6390          $self->{column_prev} = $self->{column};
6391          $self->{column}++;
6392          $self->{nc}
6393              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6394        } else {
6395          $self->{set_nc}->($self);
6396        }
6397      
6398            redo A;
6399          } elsif ($self->{nc} == 0x003E) { # >
                ## Declaration closed before the attribute type was given:
                ## report it, consume the ">" and return the token.  The
                ## |redo A| after the |return| is never reached.
6400            ## XML5: Same as "anything else".
6401            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6402            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6403            
6404        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6405          $self->{line_prev} = $self->{line};
6406          $self->{column_prev} = $self->{column};
6407          $self->{column}++;
6408          $self->{nc}
6409              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6410        } else {
6411          $self->{set_nc}->($self);
6412        }
6413      
6414            return  ($self->{ct}); # ATTLIST
6415            redo A;
6416          } elsif ($self->{nc} == 0x0028) { # (
                ## Start of an allowed-token list.
6417            ## XML5: Same as "anything else".
6418            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6419            
6420        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6421          $self->{line_prev} = $self->{line};
6422          $self->{column_prev} = $self->{column};
6423          $self->{column}++;
6424          $self->{nc}
6425              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6426        } else {
6427          $self->{set_nc}->($self);
6428        }
6429      
6430            redo A;
6431          } elsif ($self->{nc} == -1) {
                ## End of input inside the declaration: the token collected
                ## so far is returned.
6432            ## XML5: No parse error.
6433            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6434            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
                ## Reconsume.  The EOF is deliberately not consumed, so that
                ## the internal subset state sees it on the next call, as in
                ## the EOF branches of the preceding declaration states.
6446            return  ($self->{ct});
6447            redo A;
6448          } else {
                ## First character of the attribute type; it starts
                ## |$self->{ca}->{type}|.
6449            ## XML5: Not defined yet.
6450            $self->{ca}->{type} = chr $self->{nc};
6451            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6452            
6453        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6454          $self->{line_prev} = $self->{line};
6455          $self->{column_prev} = $self->{column};
6456          $self->{column}++;
6457          $self->{nc}
6458              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6459        } else {
6460          $self->{set_nc}->($self);
6461        }
6462      
6463            redo A;
6464          }
6465        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
              ## NOTE: Attribute type state of an ATTLIST declaration.  Any
              ## character not matched by an earlier branch is appended to
              ## |$self->{ca}->{type}|.  White space switches to the "after
              ## attribute type" state.  U+0023, U+0022, U+0027 and U+0028
              ## found directly after the type raise a "no space" parse error
              ## before the state is switched.  U+003E and -1 in |nc|
              ## (presumably the end-of-input marker) raise a parse error,
              ## switch to the DOCTYPE internal subset state and return
              ## |$self->{ct}|.
6466          if ($is_space->{$self->{nc}}) {
6467            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6468            ## NOTE: Advance to the next input character: read it from |char_buffer| if any is left, else call |set_nc|.
6469        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6470          $self->{line_prev} = $self->{line};
6471          $self->{column_prev} = $self->{column};
6472          $self->{column}++;
6473          $self->{nc}
6474              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6475        } else {
6476          $self->{set_nc}->($self);
6477        }
6478      
6479            redo A;
6480          } elsif ($self->{nc} == 0x0023) { # #
6481            ## XML5: Same as "anything else".
6482            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6483            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6484            
6485        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6486          $self->{line_prev} = $self->{line};
6487          $self->{column_prev} = $self->{column};
6488          $self->{column}++;
6489          $self->{nc}
6490              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6491        } else {
6492          $self->{set_nc}->($self);
6493        }
6494      
6495            redo A;
6496          } elsif ($self->{nc} == 0x0022) { # "
6497            ## XML5: Same as "anything else".
6498            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6499            $self->{ca}->{value} = '';
6500            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6501            
6502        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6503          $self->{line_prev} = $self->{line};
6504          $self->{column_prev} = $self->{column};
6505          $self->{column}++;
6506          $self->{nc}
6507              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6508        } else {
6509          $self->{set_nc}->($self);
6510        }
6511      
6512            redo A;
6513          } elsif ($self->{nc} == 0x0027) { # '
6514            ## XML5: Same as "anything else".
6515            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6516            $self->{ca}->{value} = '';
6517            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6518            
6519        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6520          $self->{line_prev} = $self->{line};
6521          $self->{column_prev} = $self->{column};
6522          $self->{column}++;
6523          $self->{nc}
6524              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6525        } else {
6526          $self->{set_nc}->($self);
6527        }
6528      
6529            redo A;
6530          } elsif ($self->{nc} == 0x003E) { # >
6531            ## XML5: Same as "anything else".
6532            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6533            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6534            
6535        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6536          $self->{line_prev} = $self->{line};
6537          $self->{column_prev} = $self->{column};
6538          $self->{column}++;
6539          $self->{nc}
6540              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6541        } else {
6542          $self->{set_nc}->($self);
6543        }
6544      
6545            return  ($self->{ct}); # ATTLIST
6546            redo A; ## NOTE: Not reached; follows an unconditional |return|.
6547          } elsif ($self->{nc} == 0x0028) { # (
6548            ## XML5: Same as "anything else".
6549            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6550            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6551            
6552        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6553          $self->{line_prev} = $self->{line};
6554          $self->{column_prev} = $self->{column};
6555          $self->{column}++;
6556          $self->{nc}
6557              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6558        } else {
6559          $self->{set_nc}->($self);
6560        }
6561      
6562            redo A;
6563          } elsif ($self->{nc} == -1) {
6564            ## XML5: No parse error.
6565            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6566            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6567            
6568        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6569          $self->{line_prev} = $self->{line};
6570          $self->{column_prev} = $self->{column};
6571          $self->{column}++;
6572          $self->{nc}
6573              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6574        } else {
6575          $self->{set_nc}->($self);
6576        }
6577      
6578            return  ($self->{ct});
6579            redo A; ## NOTE: Not reached; follows an unconditional |return|.
6580          } else {
6581            ## XML5: Not defined yet.
6582            $self->{ca}->{type} .= chr $self->{nc};
6583            ## Stay in the state.
6584            
6585        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6586          $self->{line_prev} = $self->{line};
6587          $self->{column_prev} = $self->{column};
6588          $self->{column}++;
6589          $self->{nc}
6590              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6591        } else {
6592          $self->{set_nc}->($self);
6593        }
6594      
6595            redo A;
6596          }
6597        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
              ## NOTE: State entered once white space has followed the
              ## attribute type.  Further white space is skipped.  U+0028
              ## starts the allowed-token list, U+0023 leads to the "before
              ## attribute declaration" state, and U+0022 / U+0027 reset
              ## |$self->{ca}->{value}| and open a quoted value.  U+003E and
              ## -1 in |nc| (presumably the end-of-input marker) raise a parse
              ## error, switch to the DOCTYPE internal subset state and return
              ## |$self->{ct}|.  Anything else is reported as an unquoted
              ## value and reprocessed in the unquoted attribute value state
              ## without consuming a character.
6598          if ($is_space->{$self->{nc}}) {
6599            ## Stay in the state.
6600            ## NOTE: Advance to the next input character: read it from |char_buffer| if any is left, else call |set_nc|.
6601        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6602          $self->{line_prev} = $self->{line};
6603          $self->{column_prev} = $self->{column};
6604          $self->{column}++;
6605          $self->{nc}
6606              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6607        } else {
6608          $self->{set_nc}->($self);
6609        }
6610      
6611            redo A;
6612          } elsif ($self->{nc} == 0x0028) { # (
6613            ## XML5: Same as "anything else".
6614            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6615            
6616        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6617          $self->{line_prev} = $self->{line};
6618          $self->{column_prev} = $self->{column};
6619          $self->{column}++;
6620          $self->{nc}
6621              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6622        } else {
6623          $self->{set_nc}->($self);
6624        }
6625      
6626            redo A;
6627          } elsif ($self->{nc} == 0x0023) { # #
6628            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6629            
6630        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6631          $self->{line_prev} = $self->{line};
6632          $self->{column_prev} = $self->{column};
6633          $self->{column}++;
6634          $self->{nc}
6635              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6636        } else {
6637          $self->{set_nc}->($self);
6638        }
6639      
6640            redo A;
6641          } elsif ($self->{nc} == 0x0022) { # "
6642            ## XML5: Same as "anything else".
6643            $self->{ca}->{value} = '';
6644            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6645            
6646        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6647          $self->{line_prev} = $self->{line};
6648          $self->{column_prev} = $self->{column};
6649          $self->{column}++;
6650          $self->{nc}
6651              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6652        } else {
6653          $self->{set_nc}->($self);
6654        }
6655      
6656            redo A;
6657          } elsif ($self->{nc} == 0x0027) { # '
6658            ## XML5: Same as "anything else".
6659            $self->{ca}->{value} = '';
6660            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6661            
6662        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6663          $self->{line_prev} = $self->{line};
6664          $self->{column_prev} = $self->{column};
6665          $self->{column}++;
6666          $self->{nc}
6667              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6668        } else {
6669          $self->{set_nc}->($self);
6670        }
6671      
6672            redo A;
6673          } elsif ($self->{nc} == 0x003E) { # >
6674            ## XML5: Same as "anything else".
6675            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6676            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6677            
6678        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6679          $self->{line_prev} = $self->{line};
6680          $self->{column_prev} = $self->{column};
6681          $self->{column}++;
6682          $self->{nc}
6683              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6684        } else {
6685          $self->{set_nc}->($self);
6686        }
6687      
6688            return  ($self->{ct}); # ATTLIST
6689            redo A; ## NOTE: Not reached; follows an unconditional |return|.
6690          } elsif ($self->{nc} == -1) {
6691            ## XML5: No parse error.
6692            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6693            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6694            
6695        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6696          $self->{line_prev} = $self->{line};
6697          $self->{column_prev} = $self->{column};
6698          $self->{column}++;
6699          $self->{nc}
6700              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6701        } else {
6702          $self->{set_nc}->($self);
6703        }
6704      
6705            return  ($self->{ct});
6706            redo A; ## NOTE: Not reached; follows an unconditional |return|.
6707          } else {
6708            ## XML5: Switch to the "DOCTYPE bogus comment state".
6709            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6710            $self->{ca}->{value} = '';
6711            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6712            ## Reconsume.
6713            redo A;
6714          }
6715        } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
              ## NOTE: State expecting the start of one entry of the
              ## allowed-token list.  White space is skipped.  U+007C or
              ## U+0029 seen here means the entry is empty and raises a parse
              ## error; U+0029 additionally switches to the "after allowed
              ## tokens" state.  U+003E and -1 in |nc| (presumably the
              ## end-of-input marker) raise a parse error, switch to the
              ## DOCTYPE internal subset state and return |$self->{ct}|.  Any
              ## other character becomes the first character of a new element
              ## pushed onto |@{$self->{ca}->{tokens}}|.
6716          if ($is_space->{$self->{nc}}) {
6717            ## Stay in the state.
6718            ## NOTE: Advance to the next input character: read it from |char_buffer| if any is left, else call |set_nc|.
6719        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6720          $self->{line_prev} = $self->{line};
6721          $self->{column_prev} = $self->{column};
6722          $self->{column}++;
6723          $self->{nc}
6724              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6725        } else {
6726          $self->{set_nc}->($self);
6727        }
6728      
6729            redo A;
6730          } elsif ($self->{nc} == 0x007C) { # |
6731            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6732            ## Stay in the state.
6733            
6734        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6735          $self->{line_prev} = $self->{line};
6736          $self->{column_prev} = $self->{column};
6737          $self->{column}++;
6738          $self->{nc}
6739              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6740        } else {
6741          $self->{set_nc}->($self);
6742        }
6743      
6744            redo A;
6745          } elsif ($self->{nc} == 0x0029) { # )
6746            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6747            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6748            
6749        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6750          $self->{line_prev} = $self->{line};
6751          $self->{column_prev} = $self->{column};
6752          $self->{column}++;
6753          $self->{nc}
6754              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6755        } else {
6756          $self->{set_nc}->($self);
6757        }
6758      
6759            redo A;
6760          } elsif ($self->{nc} == 0x003E) { # >
6761            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6762            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6763            
6764        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6765          $self->{line_prev} = $self->{line};
6766          $self->{column_prev} = $self->{column};
6767          $self->{column}++;
6768          $self->{nc}
6769              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6770        } else {
6771          $self->{set_nc}->($self);
6772        }
6773      
6774            return  ($self->{ct}); # ATTLIST
6775            redo A; ## NOTE: Not reached; follows an unconditional |return|.
6776          } elsif ($self->{nc} == -1) {
6777            ## XML5: No parse error.
6778            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6779            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6780            
6781        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6782          $self->{line_prev} = $self->{line};
6783          $self->{column_prev} = $self->{column};
6784          $self->{column}++;
6785          $self->{nc}
6786              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6787        } else {
6788          $self->{set_nc}->($self);
6789        }
6790      
6791            return  ($self->{ct});
6792            redo A; ## NOTE: Not reached; follows an unconditional |return|.
6793          } else {
6794            push @{$self->{ca}->{tokens}}, chr $self->{nc};
6795            $self->{state} = ALLOWED_TOKEN_STATE;
6796            
6797        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6798          $self->{line_prev} = $self->{line};
6799          $self->{column_prev} = $self->{column};
6800          $self->{column}++;
6801          $self->{nc}
6802              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6803        } else {
6804          $self->{set_nc}->($self);
6805        }
6806      
6807            redo A;
6808          }
6809        } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
              ## NOTE: State inside one entry of the allowed-token list.  Any
              ## character not matched by an earlier branch is appended to the
              ## last element of |@{$self->{ca}->{tokens}}|.  White space
              ## switches to the "after allowed token" state, U+007C to the
              ## "before allowed token" state and U+0029 to the "after allowed
              ## tokens" state.  U+003E and -1 in |nc| (presumably the
              ## end-of-input marker) raise a parse error, switch to the
              ## DOCTYPE internal subset state and return |$self->{ct}|.
6810          if ($is_space->{$self->{nc}}) {
6811            $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6812            ## NOTE: Advance to the next input character: read it from |char_buffer| if any is left, else call |set_nc|.
6813        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6814          $self->{line_prev} = $self->{line};
6815          $self->{column_prev} = $self->{column};
6816          $self->{column}++;
6817          $self->{nc}
6818              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6819        } else {
6820          $self->{set_nc}->($self);
6821        }
6822      
6823            redo A;
6824          } elsif ($self->{nc} == 0x007C) { # |
6825            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6826            
6827        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6828          $self->{line_prev} = $self->{line};
6829          $self->{column_prev} = $self->{column};
6830          $self->{column}++;
6831          $self->{nc}
6832              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6833        } else {
6834          $self->{set_nc}->($self);
6835        }
6836      
6837            redo A;
6838          } elsif ($self->{nc} == 0x0029) { # )
6839            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6840            
6841        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6842          $self->{line_prev} = $self->{line};
6843          $self->{column_prev} = $self->{column};
6844          $self->{column}++;
6845          $self->{nc}
6846              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6847        } else {
6848          $self->{set_nc}->($self);
6849        }
6850      
6851            redo A;
6852          } elsif ($self->{nc} == 0x003E) { # >
6853            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6854            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6855            
6856        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6857          $self->{line_prev} = $self->{line};
6858          $self->{column_prev} = $self->{column};
6859          $self->{column}++;
6860          $self->{nc}
6861              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6862        } else {
6863          $self->{set_nc}->($self);
6864        }
6865      
6866            return  ($self->{ct}); # ATTLIST
6867            redo A; ## NOTE: Not reached; follows an unconditional |return|.
6868          } elsif ($self->{nc} == -1) {
6869            ## XML5: No parse error.
6870            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6871            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6872            
6873        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6874          $self->{line_prev} = $self->{line};
6875          $self->{column_prev} = $self->{column};
6876          $self->{column}++;
6877          $self->{nc}
6878              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6879        } else {
6880          $self->{set_nc}->($self);
6881        }
6882      
6883            return  ($self->{ct});
6884            redo A; ## NOTE: Not reached; follows an unconditional |return|.
6885          } else {
6886            $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6887            ## Stay in the state.
6888            
6889        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6890          $self->{line_prev} = $self->{line};
6891          $self->{column_prev} = $self->{column};
6892          $self->{column}++;
6893          $self->{nc}
6894              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6895        } else {
6896          $self->{set_nc}->($self);
6897        }
6898      
6899            redo A;
6900          }
6901        } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
              ## NOTE: State entered when white space has followed an entry of
              ## the allowed-token list.  Further white space is skipped.
              ## U+007C switches to the "before allowed token" state and
              ## U+0029 to the "after allowed tokens" state.  U+003E and -1 in
              ## |nc| (presumably the end-of-input marker) raise a parse
              ## error, switch to the DOCTYPE internal subset state and return
              ## |$self->{ct}|.  Any other character is a parse error reported
              ## at the previous line/column; a single space plus that
              ## character is appended to the last token and tokenization
              ## continues in the allowed token state.
6902          if ($is_space->{$self->{nc}}) {
6903            ## Stay in the state.
6904            ## NOTE: Advance to the next input character: read it from |char_buffer| if any is left, else call |set_nc|.
6905        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6906          $self->{line_prev} = $self->{line};
6907          $self->{column_prev} = $self->{column};
6908          $self->{column}++;
6909          $self->{nc}
6910              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6911        } else {
6912          $self->{set_nc}->($self);
6913        }
6914      
6915            redo A;
6916          } elsif ($self->{nc} == 0x007C) { # |
6917            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6918            
6919        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6920          $self->{line_prev} = $self->{line};
6921          $self->{column_prev} = $self->{column};
6922          $self->{column}++;
6923          $self->{nc}
6924              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6925        } else {
6926          $self->{set_nc}->($self);
6927        }
6928      
6929            redo A;
6930          } elsif ($self->{nc} == 0x0029) { # )
6931            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6932            
6933        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6934          $self->{line_prev} = $self->{line};
6935          $self->{column_prev} = $self->{column};
6936          $self->{column}++;
6937          $self->{nc}
6938              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6939        } else {
6940          $self->{set_nc}->($self);
6941        }
6942      
6943            redo A;
6944          } elsif ($self->{nc} == 0x003E) { # >
6945            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6946            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6947            
6948        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6949          $self->{line_prev} = $self->{line};
6950          $self->{column_prev} = $self->{column};
6951          $self->{column}++;
6952          $self->{nc}
6953              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6954        } else {
6955          $self->{set_nc}->($self);
6956        }
6957      
6958            return  ($self->{ct}); # ATTLIST
6959            redo A; ## NOTE: Not reached; follows an unconditional |return|.
6960          } elsif ($self->{nc} == -1) {
6961            ## XML5: No parse error.
6962            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6963            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6964            
6965        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6966          $self->{line_prev} = $self->{line};
6967          $self->{column_prev} = $self->{column};
6968          $self->{column}++;
6969          $self->{nc}
6970              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6971        } else {
6972          $self->{set_nc}->($self);
6973        }
6974      
6975            return  ($self->{ct});
6976            redo A; ## NOTE: Not reached; follows an unconditional |return|.
6977          } else {
6978            $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
6979                            line => $self->{line_prev},
6980                            column => $self->{column_prev});
6981            $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
6982            $self->{state} = ALLOWED_TOKEN_STATE;
6983            
6984        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6985          $self->{line_prev} = $self->{line};
6986          $self->{column_prev} = $self->{column};
6987          $self->{column}++;
6988          $self->{nc}
6989              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6990        } else {
6991          $self->{set_nc}->($self);
6992        }
6993      
6994            redo A;
6995          }
6996        } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
              ## NOTE: State entered after the U+0029 that closes the
              ## allowed-token list.  White space switches to the "before
              ## attribute default" state.  U+0023, U+0022 and U+0027 found
              ## without that white space raise a "no space before default
              ## value" parse error and are then dispatched to the declaration
              ## or quoted value states.  U+003E and -1 in |nc| (presumably
              ## the end-of-input marker) raise a parse error, switch to the
              ## DOCTYPE internal subset state and return |$self->{ct}|.
              ## Anything else is reported as an unquoted value and
              ## reprocessed in the unquoted attribute value state.
6997          if ($is_space->{$self->{nc}}) {
6998            $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
6999            ## NOTE: Advance to the next input character: read it from |char_buffer| if any is left, else call |set_nc|.
7000        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7001          $self->{line_prev} = $self->{line};
7002          $self->{column_prev} = $self->{column};
7003          $self->{column}++;
7004          $self->{nc}
7005              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7006        } else {
7007          $self->{set_nc}->($self);
7008        }
7009      
7010            redo A;
7011          } elsif ($self->{nc} == 0x0023) { # #
7012            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7013            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7014            
7015        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7016          $self->{line_prev} = $self->{line};
7017          $self->{column_prev} = $self->{column};
7018          $self->{column}++;
7019          $self->{nc}
7020              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7021        } else {
7022          $self->{set_nc}->($self);
7023        }
7024      
7025            redo A;
7026          } elsif ($self->{nc} == 0x0022) { # "
7027            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7028            $self->{ca}->{value} = '';
7029            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7030            
7031        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7032          $self->{line_prev} = $self->{line};
7033          $self->{column_prev} = $self->{column};
7034          $self->{column}++;
7035          $self->{nc}
7036              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7037        } else {
7038          $self->{set_nc}->($self);
7039        }
7040      
7041            redo A;
7042          } elsif ($self->{nc} == 0x0027) { # '
7043            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7044            $self->{ca}->{value} = '';
7045            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7046            
7047        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7048          $self->{line_prev} = $self->{line};
7049          $self->{column_prev} = $self->{column};
7050          $self->{column}++;
7051          $self->{nc}
7052              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7053        } else {
7054          $self->{set_nc}->($self);
7055        }
7056      
7057            redo A;
7058          } elsif ($self->{nc} == 0x003E) { # >
7059            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7060            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7061            
7062        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7063          $self->{line_prev} = $self->{line};
7064          $self->{column_prev} = $self->{column};
7065          $self->{column}++;
7066          $self->{nc}
7067              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7068        } else {
7069          $self->{set_nc}->($self);
7070        }
7071      
7072            return  ($self->{ct}); # ATTLIST
7073            redo A; ## NOTE: Not reached; follows an unconditional |return|.
7074          } elsif ($self->{nc} == -1) {
7075            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7076            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7077            
7078        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7079          $self->{line_prev} = $self->{line};
7080          $self->{column_prev} = $self->{column};
7081          $self->{column}++;
7082          $self->{nc}
7083              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7084        } else {
7085          $self->{set_nc}->($self);
7086        }
7087      
7088            return  ($self->{ct});
7089            redo A; ## NOTE: Not reached; follows an unconditional |return|.
7090          } else {
7091            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7092            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE; ## NOTE(review): Unlike the DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE branch, |$self->{ca}->{value}| is not reset to the empty string here -- confirm this is intended.
7093            ## Reconsume.
7094            redo A;
7095          }
7096        } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
              ## NOTE: State expecting the attribute default.  White space is
              ## skipped.  U+0023 switches to the "before attribute
              ## declaration" state; U+0022 / U+0027 reset
              ## |$self->{ca}->{value}| and open a quoted value.  U+003E and
              ## -1 in |nc| (presumably the end-of-input marker) raise a parse
              ## error, switch to the DOCTYPE internal subset state and return
              ## |$self->{ct}|.  Anything else is reported as an unquoted
              ## value and reprocessed in the unquoted attribute value state.
7097          if ($is_space->{$self->{nc}}) {
7098            ## Stay in the state.
7099            ## NOTE: Advance to the next input character: read it from |char_buffer| if any is left, else call |set_nc|.
7100        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7101          $self->{line_prev} = $self->{line};
7102          $self->{column_prev} = $self->{column};
7103          $self->{column}++;
7104          $self->{nc}
7105              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7106        } else {
7107          $self->{set_nc}->($self);
7108        }
7109      
7110            redo A;
7111          } elsif ($self->{nc} == 0x0023) { # #
7112            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7113            
7114        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7115          $self->{line_prev} = $self->{line};
7116          $self->{column_prev} = $self->{column};
7117          $self->{column}++;
7118          $self->{nc}
7119              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7120        } else {
7121          $self->{set_nc}->($self);
7122        }
7123      
7124            redo A;
7125          } elsif ($self->{nc} == 0x0022) { # "
7126            $self->{ca}->{value} = '';
7127            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7128            
7129        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7130          $self->{line_prev} = $self->{line};
7131          $self->{column_prev} = $self->{column};
7132          $self->{column}++;
7133          $self->{nc}
7134              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7135        } else {
7136          $self->{set_nc}->($self);
7137        }
7138      
7139            redo A;
7140          } elsif ($self->{nc} == 0x0027) { # '
7141            $self->{ca}->{value} = '';
7142            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7143            
7144        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7145          $self->{line_prev} = $self->{line};
7146          $self->{column_prev} = $self->{column};
7147          $self->{column}++;
7148          $self->{nc}
7149              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7150        } else {
7151          $self->{set_nc}->($self);
7152        }
7153      
7154            redo A;
7155          } elsif ($self->{nc} == 0x003E) { # >
7156            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7157            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7158            
7159        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7160          $self->{line_prev} = $self->{line};
7161          $self->{column_prev} = $self->{column};
7162          $self->{column}++;
7163          $self->{nc}
7164              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7165        } else {
7166          $self->{set_nc}->($self);
7167        }
7168      
7169            return  ($self->{ct}); # ATTLIST
7170            redo A; ## NOTE: Not reached; follows an unconditional |return|.
7171          } elsif ($self->{nc} == -1) {
7172            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7173            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7174            
7175        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7176          $self->{line_prev} = $self->{line};
7177          $self->{column_prev} = $self->{column};
7178          $self->{column}++;
7179          $self->{nc}
7180              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7181        } else {
7182          $self->{set_nc}->($self);
7183        }
7184      
7185            return  ($self->{ct});
7186            redo A; ## NOTE: Not reached; follows an unconditional |return|.
7187          } else {
7188            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7189            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE; ## NOTE(review): Unlike the DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE branch, |$self->{ca}->{value}| is not reset to the empty string here -- confirm this is intended.
7190            ## Reconsume.
7191            redo A;
7192          }
7193        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7194          if ($is_space->{$self->{nc}}) {
7195            ## XML5: No parse error.
7196            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7197            $self->{state} = BOGUS_MD_STATE;
7198            ## Reconsume.
7199            redo A;
7200          } elsif ($self->{nc} == 0x0022) { # "
7201            ## XML5: Same as "anything else".
7202            $self->{ca}->{value} = '';
7203            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7204            
7205        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7206          $self->{line_prev} = $self->{line};
7207          $self->{column_prev} = $self->{column};
7208          $self->{column}++;
7209          $self->{nc}
7210              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7211        } else {
7212          $self->{set_nc}->($self);
7213        }
7214      
7215            redo A;
7216          } elsif ($self->{nc} == 0x0027) { # '
7217            ## XML5: Same as "anything else".
7218            $self->{ca}->{value} = '';
7219            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7220            
7221        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7222          $self->{line_prev} = $self->{line};
7223          $self->{column_prev} = $self->{column};
7224          $self->{column}++;
7225          $self->{nc}
7226              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7227        } else {
7228          $self->{set_nc}->($self);
7229        }
7230      
7231            redo A;
7232          } elsif ($self->{nc} == 0x003E) { # >
7233            ## XML5: Same as "anything else".
7234            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7235            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7236            
7237        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7238          $self->{line_prev} = $self->{line};
7239          $self->{column_prev} = $self->{column};
7240          $self->{column}++;
7241          $self->{nc}
7242              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7243        } else {
7244          $self->{set_nc}->($self);
7245        }
7246      
7247            return  ($self->{ct}); # ATTLIST
7248            redo A;
7249          } elsif ($self->{nc} == -1) {
7250            ## XML5: No parse error.
7251            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7252            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7253            
7254        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7255          $self->{line_prev} = $self->{line};
7256          $self->{column_prev} = $self->{column};
7257          $self->{column}++;
7258          $self->{nc}
7259              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7260        } else {
7261          $self->{set_nc}->($self);
7262        }
7263      
7264            return  ($self->{ct});
7265            redo A;
7266          } else {
7267            $self->{ca}->{default} = chr $self->{nc};
7268            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7269            
7270        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7271          $self->{line_prev} = $self->{line};
7272          $self->{column_prev} = $self->{column};
7273          $self->{column}++;
7274          $self->{nc}
7275              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7276        } else {
7277          $self->{set_nc}->($self);
7278        }
7279      
7280            redo A;
7281          }
7282        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7283          if ($is_space->{$self->{nc}}) {
7284            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7285            
7286        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7287          $self->{line_prev} = $self->{line};
7288          $self->{column_prev} = $self->{column};
7289          $self->{column}++;
7290          $self->{nc}
7291              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7292        } else {
7293          $self->{set_nc}->($self);
7294        }
7295      
7296            redo A;
7297          } elsif ($self->{nc} == 0x0022) { # "
7298            ## XML5: Same as "anything else".
7299            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7300            $self->{ca}->{value} = '';
7301            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7302            
7303        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7304          $self->{line_prev} = $self->{line};
7305          $self->{column_prev} = $self->{column};
7306          $self->{column}++;
7307          $self->{nc}
7308              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7309        } else {
7310          $self->{set_nc}->($self);
7311        }
7312      
7313            redo A;
7314          } elsif ($self->{nc} == 0x0027) { # '
7315            ## XML5: Same as "anything else".
7316            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7317            $self->{ca}->{value} = '';
7318            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7319            
7320        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7321          $self->{line_prev} = $self->{line};
7322          $self->{column_prev} = $self->{column};
7323          $self->{column}++;
7324          $self->{nc}
7325              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7326        } else {
7327          $self->{set_nc}->($self);
7328        }
7329      
7330            redo A;
7331          } elsif ($self->{nc} == 0x003E) { # >
7332            ## XML5: Same as "anything else".
7333            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7334            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7335            
7336        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7337          $self->{line_prev} = $self->{line};
7338          $self->{column_prev} = $self->{column};
7339          $self->{column}++;
7340          $self->{nc}
7341              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7342        } else {
7343          $self->{set_nc}->($self);
7344        }
7345      
7346            return  ($self->{ct}); # ATTLIST
7347            redo A;
7348          } elsif ($self->{nc} == -1) {
7349            ## XML5: No parse error.
7350            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7351            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7352            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7353            
7354        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7355          $self->{line_prev} = $self->{line};
7356          $self->{column_prev} = $self->{column};
7357          $self->{column}++;
7358          $self->{nc}
7359              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7360        } else {
7361          $self->{set_nc}->($self);
7362        }
7363      
7364            return  ($self->{ct});
7365            redo A;
7366          } else {
7367            $self->{ca}->{default} .= chr $self->{nc};
7368            ## Stay in the state.
7369            
7370        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7371          $self->{line_prev} = $self->{line};
7372          $self->{column_prev} = $self->{column};
7373          $self->{column}++;
7374          $self->{nc}
7375              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7376        } else {
7377          $self->{set_nc}->($self);
7378        }
7379      
7380            redo A;
7381          }
7382        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7383          if ($is_space->{$self->{nc}}) {
7384            ## Stay in the state.
7385            
7386        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7387          $self->{line_prev} = $self->{line};
7388          $self->{column_prev} = $self->{column};
7389          $self->{column}++;
7390          $self->{nc}
7391              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7392        } else {
7393          $self->{set_nc}->($self);
7394        }
7395      
7396            redo A;
7397          } elsif ($self->{nc} == 0x0022) { # "
7398            $self->{ca}->{value} = '';
7399            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7400            
7401        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7402          $self->{line_prev} = $self->{line};
7403          $self->{column_prev} = $self->{column};
7404          $self->{column}++;
7405          $self->{nc}
7406              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7407        } else {
7408          $self->{set_nc}->($self);
7409        }
7410      
7411            redo A;
7412          } elsif ($self->{nc} == 0x0027) { # '
7413            $self->{ca}->{value} = '';
7414            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7415            
7416        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7417          $self->{line_prev} = $self->{line};
7418          $self->{column_prev} = $self->{column};
7419          $self->{column}++;
7420          $self->{nc}
7421              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7422        } else {
7423          $self->{set_nc}->($self);
7424        }
7425      
7426            redo A;
7427          } elsif ($self->{nc} == 0x003E) { # >
7428            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7429            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7430            
7431        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7432          $self->{line_prev} = $self->{line};
7433          $self->{column_prev} = $self->{column};
7434          $self->{column}++;
7435          $self->{nc}
7436              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7437        } else {
7438          $self->{set_nc}->($self);
7439        }
7440      
7441            return  ($self->{ct}); # ATTLIST
7442            redo A;
7443          } elsif ($self->{nc} == -1) {
7444            ## XML5: No parse error.
7445            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7446            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7447            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7448            
7449        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7450          $self->{line_prev} = $self->{line};
7451          $self->{column_prev} = $self->{column};
7452          $self->{column}++;
7453          $self->{nc}
7454              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7455        } else {
7456          $self->{set_nc}->($self);
7457        }
7458      
7459            return  ($self->{ct});
7460            redo A;
7461          } else {
7462            ## XML5: Not defined yet.
7463            if ($self->{ca}->{default} eq 'FIXED') {
7464              $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7465            } else {
7466              push @{$self->{ct}->{attrdefs}}, $self->{ca};
7467              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7468            }
7469            ## Reconsume.
7470            redo A;
7471          }
7472        } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7473          if ($is_space->{$self->{nc}} or
7474              $self->{nc} == -1 or
7475              $self->{nc} == 0x003E) { # >
7476            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7477            ## Reconsume.
7478            redo A;
7479          } else {
7480            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7481            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7482            ## Reconsume.
7483            redo A;
7484          }
7485        } elsif ($self->{state} == NDATA_STATE) {
7486          ## ASCII case-insensitive
7487          if ($self->{nc} == [
7488                undef,
7489                0x0044, # D
7490                0x0041, # A
7491                0x0054, # T
7492              ]->[length $self->{kwd}] or
7493              $self->{nc} == [
7494                undef,
7495                0x0064, # d
7496                0x0061, # a
7497                0x0074, # t
7498              ]->[length $self->{kwd}]) {
7499            
7500            ## Stay in the state.
7501            $self->{kwd} .= chr $self->{nc};
7502            
7503        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7504          $self->{line_prev} = $self->{line};
7505          $self->{column_prev} = $self->{column};
7506          $self->{column}++;
7507          $self->{nc}
7508              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7509        } else {
7510          $self->{set_nc}->($self);
7511        }
7512      
7513            redo A;
7514          } elsif ((length $self->{kwd}) == 4 and
7515                   ($self->{nc} == 0x0041 or # A
7516                    $self->{nc} == 0x0061)) { # a
7517            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7518              
7519              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7520                              text => 'NDATA',
7521                              line => $self->{line_prev},
7522                              column => $self->{column_prev} - 4);
7523            } else {
7524              
7525            }
7526            $self->{state} = AFTER_NDATA_STATE;
7527            
7528        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7529          $self->{line_prev} = $self->{line};
7530          $self->{column_prev} = $self->{column};
7531          $self->{column}++;
7532          $self->{nc}
7533              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7534        } else {
7535          $self->{set_nc}->($self);
7536        }
7537      
7538            redo A;
7539          } else {
7540            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7541                            line => $self->{line_prev},
7542                            column => $self->{column_prev} + 1
7543                                - length $self->{kwd});
7544            
7545            $self->{state} = BOGUS_MD_STATE;
7546          ## Reconsume.          ## Reconsume.
7547          redo A;          redo A;
7548        }        }
7549        } elsif ($self->{state} == AFTER_NDATA_STATE) {
7550          if ($is_space->{$self->{nc}}) {
7551            $self->{state} = BEFORE_NOTATION_NAME_STATE;
7552            
7553        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7554          $self->{line_prev} = $self->{line};
7555          $self->{column_prev} = $self->{column};
7556          $self->{column}++;
7557          $self->{nc}
7558              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7559        } else {
7560          $self->{set_nc}->($self);
7561        }
7562      
7563            redo A;
7564          } elsif ($self->{nc} == 0x003E) { # >
7565            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7566            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7567            
7568        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7569          $self->{line_prev} = $self->{line};
7570          $self->{column_prev} = $self->{column};
7571          $self->{column}++;
7572          $self->{nc}
7573              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7574        } else {
7575          $self->{set_nc}->($self);
7576        }
7577      
7578            return  ($self->{ct}); # ENTITY
7579            redo A;
7580          } elsif ($self->{nc} == -1) {
7581            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7582            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7583            
7584        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7585          $self->{line_prev} = $self->{line};
7586          $self->{column_prev} = $self->{column};
7587          $self->{column}++;
7588          $self->{nc}
7589              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7590        } else {
7591          $self->{set_nc}->($self);
7592        }
7593      
7594            return  ($self->{ct}); # ENTITY
7595            redo A;
7596          } else {
7597            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7598                            line => $self->{line_prev},
7599                            column => $self->{column_prev} + 1
7600                                - length $self->{kwd});
7601            $self->{state} = BOGUS_MD_STATE;
7602            ## Reconsume.
7603            redo A;
7604          }
7605        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7606          if ($is_space->{$self->{nc}}) {
7607            ## Stay in the state.
7608            
7609        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7610          $self->{line_prev} = $self->{line};
7611          $self->{column_prev} = $self->{column};
7612          $self->{column}++;
7613          $self->{nc}
7614              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7615        } else {
7616          $self->{set_nc}->($self);
7617        }
7618      
7619            redo A;
7620          } elsif ($self->{nc} == 0x003E) { # >
7621            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7622            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7623            
7624        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7625          $self->{line_prev} = $self->{line};
7626          $self->{column_prev} = $self->{column};
7627          $self->{column}++;
7628          $self->{nc}
7629              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7630        } else {
7631          $self->{set_nc}->($self);
7632        }
7633      
7634            return  ($self->{ct}); # ENTITY
7635            redo A;
7636          } elsif ($self->{nc} == -1) {
7637            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7638            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7639            
7640        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7641          $self->{line_prev} = $self->{line};
7642          $self->{column_prev} = $self->{column};
7643          $self->{column}++;
7644          $self->{nc}
7645              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7646        } else {
7647          $self->{set_nc}->($self);
7648        }
7649      
7650            return  ($self->{ct}); # ENTITY
7651            redo A;
7652          } else {
7653            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7654            $self->{state} = NOTATION_NAME_STATE;
7655            
7656        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7657          $self->{line_prev} = $self->{line};
7658          $self->{column_prev} = $self->{column};
7659          $self->{column}++;
7660          $self->{nc}
7661              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7662        } else {
7663          $self->{set_nc}->($self);
7664        }
7665      
7666            redo A;
7667          }
7668        } elsif ($self->{state} == NOTATION_NAME_STATE) {
7669          if ($is_space->{$self->{nc}}) {
7670            $self->{state} = AFTER_NOTATION_NAME_STATE;
7671            
7672        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7673          $self->{line_prev} = $self->{line};
7674          $self->{column_prev} = $self->{column};
7675          $self->{column}++;
7676          $self->{nc}
7677              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7678        } else {
7679          $self->{set_nc}->($self);
7680        }
7681      
7682            redo A;
7683          } elsif ($self->{nc} == 0x003E) { # >
7684            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7685            
7686        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7687          $self->{line_prev} = $self->{line};
7688          $self->{column_prev} = $self->{column};
7689          $self->{column}++;
7690          $self->{nc}
7691              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7692        } else {
7693          $self->{set_nc}->($self);
7694        }
7695      
7696            return  ($self->{ct}); # ENTITY
7697            redo A;
7698          } elsif ($self->{nc} == -1) {
7699            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7700            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7701            
7702        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7703          $self->{line_prev} = $self->{line};
7704          $self->{column_prev} = $self->{column};
7705          $self->{column}++;
7706          $self->{nc}
7707              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7708        } else {
7709          $self->{set_nc}->($self);
7710        }
7711      
7712            return  ($self->{ct}); # ENTITY
7713            redo A;
7714          } else {
7715            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7716            ## Stay in the state.
7717            
7718        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7719          $self->{line_prev} = $self->{line};
7720          $self->{column_prev} = $self->{column};
7721          $self->{column}++;
7722          $self->{nc}
7723              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7724        } else {
7725          $self->{set_nc}->($self);
7726        }
7727      
7728            redo A;
7729          }
7730        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7731          if ($self->{nc} == 0x0022) { # "
7732            $self->{state} = AFTER_NOTATION_NAME_STATE;
7733            
7734        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7735          $self->{line_prev} = $self->{line};
7736          $self->{column_prev} = $self->{column};
7737          $self->{column}++;
7738          $self->{nc}
7739              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7740        } else {
7741          $self->{set_nc}->($self);
7742        }
7743      
7744            redo A;
7745          } elsif ($self->{nc} == 0x0026) { # &
7746            $self->{prev_state} = $self->{state};
7747            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7748            $self->{entity_add} = 0x0022; # "
7749            
7750        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7751          $self->{line_prev} = $self->{line};
7752          $self->{column_prev} = $self->{column};
7753          $self->{column}++;
7754          $self->{nc}
7755              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7756        } else {
7757          $self->{set_nc}->($self);
7758        }
7759      
7760            redo A;
7761    ## TODO: %
7762          } elsif ($self->{nc} == -1) {
7763            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7764            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7765            ## Reconsume.
7766            return  ($self->{ct}); # ENTITY
7767            redo A;
7768          } else {
7769            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7770            
7771        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7772          $self->{line_prev} = $self->{line};
7773          $self->{column_prev} = $self->{column};
7774          $self->{column}++;
7775          $self->{nc}
7776              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7777        } else {
7778          $self->{set_nc}->($self);
7779        }
7780      
7781            redo A;
7782          }
7783        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7784          if ($self->{nc} == 0x0027) { # '
7785            $self->{state} = AFTER_NOTATION_NAME_STATE;
7786            
7787        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7788          $self->{line_prev} = $self->{line};
7789          $self->{column_prev} = $self->{column};
7790          $self->{column}++;
7791          $self->{nc}
7792              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7793        } else {
7794          $self->{set_nc}->($self);
7795        }
7796      
7797            redo A;
7798          } elsif ($self->{nc} == 0x0026) { # &
7799            $self->{prev_state} = $self->{state};
7800            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7801            $self->{entity_add} = 0x0027; # '
7802            
7803        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7804          $self->{line_prev} = $self->{line};
7805          $self->{column_prev} = $self->{column};
7806          $self->{column}++;
7807          $self->{nc}
7808              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7809        } else {
7810          $self->{set_nc}->($self);
7811        }
7812      
7813            redo A;
7814    ## TODO: %
7815          } elsif ($self->{nc} == -1) {
7816            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7817            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7818            ## Reconsume.
7819            return  ($self->{ct}); # ENTITY
7820            redo A;
7821          } else {
7822            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7823            
7824        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7825          $self->{line_prev} = $self->{line};
7826          $self->{column_prev} = $self->{column};
7827          $self->{column}++;
7828          $self->{nc}
7829              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7830        } else {
7831          $self->{set_nc}->($self);
7832        }
7833      
7834            redo A;
7835          }
7836        } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
7837          ## TODO: XMLize
7838    
7839          if ($is_space->{$self->{nc}} or
7840              {
7841                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7842                $self->{entity_add} => 1,
7843              }->{$self->{nc}}) {
7844            ## Don't consume
7845            ## No error
7846            ## Return nothing.
7847            #
7848          } elsif ($self->{nc} == 0x0023) { # #
7849            $self->{ca} = $self->{ct};
7850            $self->{state} = ENTITY_HASH_STATE;
7851            $self->{kwd} = '#';
7852            
7853        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7854          $self->{line_prev} = $self->{line};
7855          $self->{column_prev} = $self->{column};
7856          $self->{column}++;
7857          $self->{nc}
7858              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7859        } else {
7860          $self->{set_nc}->($self);
7861        }
7862      
7863            redo A;
7864          } elsif ((0x0041 <= $self->{nc} and
7865                    $self->{nc} <= 0x005A) or # A..Z
7866                   (0x0061 <= $self->{nc} and
7867                    $self->{nc} <= 0x007A)) { # a..z
7868            #
7869          } else {
7870            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
7871            ## Return nothing.
7872            #
7873          }
7874    
7875          $self->{ct}->{value} .= '&';
7876          $self->{state} = $self->{prev_state};
7877          ## Reconsume.
7878          redo A;
7879        } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {
7880          if ($is_space->{$self->{nc}}) {
7881            ## Stay in the state.
7882            
7883        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7884          $self->{line_prev} = $self->{line};
7885          $self->{column_prev} = $self->{column};
7886          $self->{column}++;
7887          $self->{nc}
7888              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7889        } else {
7890          $self->{set_nc}->($self);
7891        }
7892      
7893            redo A;
7894          } elsif ($self->{nc} == 0x003E) { # >
7895            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7896            
7897        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7898          $self->{line_prev} = $self->{line};
7899          $self->{column_prev} = $self->{column};
7900          $self->{column}++;
7901          $self->{nc}
7902              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7903        } else {
7904          $self->{set_nc}->($self);
7905        }
7906      
7907            return  ($self->{ct}); # ENTITY
7908            redo A;
7909          } elsif ($self->{nc} == -1) {
7910            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7911            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7912            
7913        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7914          $self->{line_prev} = $self->{line};
7915          $self->{column_prev} = $self->{column};
7916          $self->{column}++;
7917          $self->{nc}
7918              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7919        } else {
7920          $self->{set_nc}->($self);
7921        }
7922      
7923            return  ($self->{ct}); # ENTITY
7924            redo A;
7925          } else {
7926            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after notation name'); ## TODO: type
7927            $self->{state} = BOGUS_MD_STATE;
7928            ## Reconsume.
7929            redo A;
7930          }
7931        } elsif ($self->{state} == BOGUS_MD_STATE) {
7932          if ($self->{nc} == 0x003E) { # >
7933            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7934            
7935        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7936          $self->{line_prev} = $self->{line};
7937          $self->{column_prev} = $self->{column};
7938          $self->{column}++;
7939          $self->{nc}
7940              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7941        } else {
7942          $self->{set_nc}->($self);
7943        }
7944      
7945            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7946            redo A;
7947          } elsif ($self->{nc} == -1) {
7948            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7949            ## Reconsume.
7950            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7951            redo A;
7952          } else {
7953            ## Stay in the state.
7954            
7955        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7956          $self->{line_prev} = $self->{line};
7957          $self->{column_prev} = $self->{column};
7958          $self->{column}++;
7959          $self->{nc}
7960              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7961        } else {
7962          $self->{set_nc}->($self);
7963        }
7964      
7965            redo A;
7966          }
7967      } else {      } else {
7968        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
7969      }      }
# Line 4152  sub _get_next_token ($) { Line 7974  sub _get_next_token ($) {
7974    
7975  1;  1;
7976  ## $Date$  ## $Date$
7977                                    

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.19

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24