/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.4 by wakaba, Tue Oct 14 11:46:57 2008 UTC revision 1.32 by wakaba, Sat Sep 5 09:57:55 2009 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24    );    );
25        
26    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 33  BEGIN {
33        CHARACTER_TOKEN        CHARACTER_TOKEN
34        PI_TOKEN        PI_TOKEN
35        ABORT_TOKEN        ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42      )],      )],
43    );    );
44  }  }
45    
46    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48  ## Token types  ## Token types
49    
50  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
52  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
53  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
54  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
55  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
56  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
57  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65    ## XML5: XML5 has "empty tag token".  In this implementation, it is
66    ## represented as a start tag token with $self->{self_closing} flag
67    ## set to true.
68    
69    ## XML5: XML5 has "short end tag token".  In this implementation, it
70    ## is represented as an end tag token with $token->{tag_name} set
71    ## to an empty string.
72    
73  package Whatpm::HTML;  package Whatpm::HTML;
74    
# Line 77  sub COMMENT_START_STATE () { 14 } Line 105  sub COMMENT_START_STATE () { 14 }
105  sub COMMENT_START_DASH_STATE () { 15 }  sub COMMENT_START_DASH_STATE () { 15 }
106  sub COMMENT_STATE () { 16 }  sub COMMENT_STATE () { 16 }
107  sub COMMENT_END_STATE () { 17 }  sub COMMENT_END_STATE () { 17 }
108    sub COMMENT_END_BANG_STATE () { 102 }
109    sub COMMENT_END_SPACE_STATE () { 103 } ## LAST
110  sub COMMENT_END_DASH_STATE () { 18 }  sub COMMENT_END_DASH_STATE () { 18 }
111  sub BOGUS_COMMENT_STATE () { 19 }  sub BOGUS_COMMENT_STATE () { 19 }
112  sub DOCTYPE_STATE () { 20 }  sub DOCTYPE_STATE () { 20 }
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 144  sub HEXREF_HEX_STATE () { 48 }
144  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
145  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
146    
147    ## XML-only states
148    sub PI_STATE () { 51 }
149    sub PI_TARGET_STATE () { 52 }
150    sub PI_TARGET_AFTER_STATE () { 53 }
151    sub PI_DATA_STATE () { 54 }
152    sub PI_AFTER_STATE () { 55 }
153    sub PI_DATA_AFTER_STATE () { 56 }
154    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
155    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
156    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
157    sub DOCTYPE_TAG_STATE () { 60 }
158    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
159    sub MD_ATTLIST_STATE () { 62 }
160    sub MD_E_STATE () { 63 }
161    sub MD_ELEMENT_STATE () { 64 }
162    sub MD_ENTITY_STATE () { 65 }
163    sub MD_NOTATION_STATE () { 66 }
164    sub DOCTYPE_MD_STATE () { 67 }
165    sub BEFORE_MD_NAME_STATE () { 68 }
166    sub MD_NAME_STATE () { 69 }
167    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
168    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
169    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
170    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
171    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
172    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
173    sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
174    sub ALLOWED_TOKEN_STATE () { 77 }
175    sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
176    sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
177    sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
178    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
179    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
180    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
181    sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
182    sub BEFORE_NDATA_STATE () { 85 }
183    sub NDATA_STATE () { 86 }
184    sub AFTER_NDATA_STATE () { 87 }
185    sub BEFORE_NOTATION_NAME_STATE () { 88 }
186    sub NOTATION_NAME_STATE () { 89 }
187    sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
188    sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
189    sub ENTITY_VALUE_ENTITY_STATE () { 92 }
190    sub AFTER_ELEMENT_NAME_STATE () { 93 }
191    sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
192    sub CONTENT_KEYWORD_STATE () { 95 }
193    sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
194    sub CM_ELEMENT_NAME_STATE () { 97 }
195    sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
196    sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
197    sub AFTER_MD_DEF_STATE () { 100 }
198    sub BOGUS_MD_STATE () { 101 }
199    
200  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
201  ## list and descriptions)  ## list and descriptions)
202    
# Line 178  sub _initialize_tokenizer ($) { Line 261  sub _initialize_tokenizer ($) {
261    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
262    
263    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
264    #$self->{s_kwd}; # state keyword - initialized when used    $self->{s_kwd} = ''; # Data state keyword
265      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
266    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
267    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
268    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 208  sub _initialize_tokenizer ($) { Line 292  sub _initialize_tokenizer ($) {
292    
293  ## A token has:  ## A token has:
294  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
295  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
296  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
297  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
298    ##   ->{target} (PI_TOKEN)
299  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
300  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
301  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 218  sub _initialize_tokenizer ($) { Line 303  sub _initialize_tokenizer ($) {
303  ##        ->{name}  ##        ->{name}
304  ##        ->{value}  ##        ->{value}
305  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
306  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
307    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
308    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
309    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
310    ##   ->{has_internal_subset} == 1 or 0 (DOCTYPE_TOKEN)
311    
312  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
313  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
314  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 238  my $is_space = { Line 328  my $is_space = {
328    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
329    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
330    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
331    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
332    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
333    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
334  };  };
# Line 362  sub _get_next_token ($) { Line 452  sub _get_next_token ($) {
452          }          }
453        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
454          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
455            $self->{s_kwd} .= '-';            if ($self->{s_kwd} eq '<!-') {
             
           if ($self->{s_kwd} eq '<!--') {  
456                            
457              $self->{escape} = 1; # unless $self->{escape};              $self->{escape} = 1; # unless $self->{escape};
458              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
459              #              #
460            } elsif ($self->{s_kwd} eq '---') {            } elsif ($self->{s_kwd} eq '-') {
461                            
462              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
463              #              #
464              } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
465                
466                $self->{s_kwd} .= '-';
467                #
468            } else {            } else {
469                            
470                $self->{s_kwd} = '-';
471              #              #
472            }            }
473          }          }
# Line 420  sub _get_next_token ($) { Line 513  sub _get_next_token ($) {
513            if ($self->{s_kwd} eq '--') {            if ($self->{s_kwd} eq '--') {
514                            
515              delete $self->{escape};              delete $self->{escape};
516                #
517            } else {            } else {
518                            
519                #
520            }            }
521            } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
522              
523              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
524                              line => $self->{line_prev},
525                              column => $self->{column_prev} - 1);
526              #
527          } else {          } else {
528                        
529              #
530          }          }
531                    
532          $self->{s_kwd} = '';          $self->{s_kwd} = '';
533          #          #
534          } elsif ($self->{nc} == 0x005D) { # ]
535            if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
536              
537              $self->{s_kwd} .= ']';
538            } elsif ($self->{s_kwd} eq ']]') {
539              
540              #
541            } else {
542              
543              $self->{s_kwd} = '';
544            }
545            #
546        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
547                    
548          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 446  sub _get_next_token ($) { Line 560  sub _get_next_token ($) {
560                     data => chr $self->{nc},                     data => chr $self->{nc},
561                     line => $self->{line}, column => $self->{column},                     line => $self->{line}, column => $self->{column},
562                    };                    };
563        if ($self->{read_until}->($token->{data}, q[-!<>&],        if ($self->{read_until}->($token->{data}, q{-!<>&\]},
564                                  length $token->{data})) {                                  length $token->{data})) {
565          $self->{s_kwd} = '';          $self->{s_kwd} = '';
566        }        }
567    
568        ## Stay in the data state.        ## Stay in the data state.
569        if ($self->{content_model} == PCDATA_CONTENT_MODEL) {        if (not $self->{is_xml} and
570              $self->{content_model} == PCDATA_CONTENT_MODEL) {
571                    
572          $self->{state} = PCDATA_STATE;          $self->{state} = PCDATA_STATE;
573        } else {        } else {
# Line 473  sub _get_next_token ($) { Line 588  sub _get_next_token ($) {
588        return  ($token);        return  ($token);
589        redo A;        redo A;
590      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
591          ## XML5: "tag state".
592    
593        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
594          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
595                        
# Line 491  sub _get_next_token ($) { Line 608  sub _get_next_token ($) {
608            redo A;            redo A;
609          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
610                        
611            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
612            #            #
613          } else {          } else {
614                        
615              $self->{s_kwd} = '';
616            #            #
617          }          }
618    
# Line 583  sub _get_next_token ($) { Line 701  sub _get_next_token ($) {
701                            line => $self->{line_prev},                            line => $self->{line_prev},
702                            column => $self->{column_prev});                            column => $self->{column_prev});
703            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
704              $self->{s_kwd} = '';
705                        
706      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
707        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 602  sub _get_next_token ($) { Line 721  sub _get_next_token ($) {
721    
722            redo A;            redo A;
723          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
724                        if ($self->{is_xml}) {
725            $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',              
726                            line => $self->{line_prev},              $self->{state} = PI_STATE;
727                            column => $self->{column_prev});              
728            $self->{state} = BOGUS_COMMENT_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
729            $self->{ct} = {type => COMMENT_TOKEN, data => '',        $self->{line_prev} = $self->{line};
730                                      line => $self->{line_prev},        $self->{column_prev} = $self->{column};
731                                      column => $self->{column_prev},        $self->{column}++;
732                                     };        $self->{nc}
733            ## $self->{nc} is intentionally left as is            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
734            redo A;      } else {
735          } else {        $self->{set_nc}->($self);
736        }
737      
738                redo A;
739              } else {
740                
741                $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
742                                line => $self->{line_prev},
743                                column => $self->{column_prev});
744                $self->{state} = BOGUS_COMMENT_STATE;
745                $self->{ct} = {type => COMMENT_TOKEN, data => '',
746                               line => $self->{line_prev},
747                               column => $self->{column_prev},
748                              };
749                ## $self->{nc} is intentionally left as is
750                redo A;
751              }
752            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
753                        
754            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
755                            line => $self->{line_prev},                            line => $self->{line_prev},
756                            column => $self->{column_prev});                            column => $self->{column_prev});
757            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
758              $self->{s_kwd} = '';
759            ## reconsume            ## reconsume
760    
761            return  ({type => CHARACTER_TOKEN, data => '<',            return  ({type => CHARACTER_TOKEN, data => '<',
# Line 627  sub _get_next_token ($) { Line 764  sub _get_next_token ($) {
764                     });                     });
765    
766            redo A;            redo A;
767            } else {
768              ## XML5: "<:" is a parse error.
769              
770              $self->{ct} = {type => START_TAG_TOKEN,
771                                        tag_name => chr ($self->{nc}),
772                                        line => $self->{line_prev},
773                                        column => $self->{column_prev}};
774              $self->{state} = TAG_NAME_STATE;
775              
776        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
777          $self->{line_prev} = $self->{line};
778          $self->{column_prev} = $self->{column};
779          $self->{column}++;
780          $self->{nc}
781              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
782        } else {
783          $self->{set_nc}->($self);
784        }
785      
786              redo A;
787          }          }
788        } else {        } else {
789          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 635  sub _get_next_token ($) { Line 792  sub _get_next_token ($) {
792        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
793        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
794    
795          ## XML5: "end tag state".
796    
797        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
798        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
799          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
800            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
801            $self->{s_kwd} = '';            $self->{kwd} = '';
802            ## Reconsume.            ## Reconsume.
803            redo A;            redo A;
804          } else {          } else {
# Line 647  sub _get_next_token ($) { Line 806  sub _get_next_token ($) {
806            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
807                        
808            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
809              $self->{s_kwd} = '';
810            ## Reconsume.            ## Reconsume.
811            return  ({type => CHARACTER_TOKEN, data => '</',            return  ({type => CHARACTER_TOKEN, data => '</',
812                      line => $l, column => $c,                      line => $l, column => $c,
# Line 695  sub _get_next_token ($) { Line 855  sub _get_next_token ($) {
855        
856          redo A;          redo A;
857        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
858          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
859                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
860                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
861          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
862                    $self->{s_kwd} = '';
863            if ($self->{is_xml}) {
864              
865              ## XML5: No parse error.
866              
867              ## NOTE: This parser raises a parse error, since it supports
868              ## XML1, not XML5.
869    
870              ## NOTE: A short end tag token.
871              my $ct = {type => END_TAG_TOKEN,
872                        tag_name => '',
873                        line => $self->{line_prev},
874                        column => $self->{column_prev} - 1,
875                       };
876              
877        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
878          $self->{line_prev} = $self->{line};
879          $self->{column_prev} = $self->{column};
880          $self->{column}++;
881          $self->{nc}
882              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
883        } else {
884          $self->{set_nc}->($self);
885        }
886      
887              return  ($ct);
888            } else {
889              
890              
891      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
892        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
893        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 711  sub _get_next_token ($) { Line 898  sub _get_next_token ($) {
898        $self->{set_nc}->($self);        $self->{set_nc}->($self);
899      }      }
900        
901            }
902          redo A;          redo A;
903        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
904                    
905          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
906            $self->{s_kwd} = '';
907          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
908          # reconsume          # reconsume
909    
# Line 723  sub _get_next_token ($) { Line 912  sub _get_next_token ($) {
912                   });                   });
913    
914          redo A;          redo A;
915        } else {        } elsif (not $self->{is_xml} or
916                   $is_space->{$self->{nc}}) {
917                    
918          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
919                            line => $self->{line_prev}, # "<" of "</"
920                            column => $self->{column_prev} - 1);
921          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
922          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
923                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 738  sub _get_next_token ($) { Line 930  sub _get_next_token ($) {
930          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
931          ## "bogus comment state" entry.          ## "bogus comment state" entry.
932          redo A;          redo A;
933          } else {
934            ## XML5: "</:" is a parse error.
935            
936            $self->{ct} = {type => END_TAG_TOKEN,
937                           tag_name => chr ($self->{nc}),
938                           line => $l, column => $c};
939            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
940            
941        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
942          $self->{line_prev} = $self->{line};
943          $self->{column_prev} = $self->{column};
944          $self->{column}++;
945          $self->{nc}
946              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
947        } else {
948          $self->{set_nc}->($self);
949        }
950      
951            redo A;
952        }        }
953      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
954        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
955        if (length $ch) {        if (length $ch) {
956          my $CH = $ch;          my $CH = $ch;
957          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 748  sub _get_next_token ($) { Line 959  sub _get_next_token ($) {
959          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
960                        
961            ## Stay in the state.            ## Stay in the state.
962            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
963                        
964      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
965        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 764  sub _get_next_token ($) { Line 975  sub _get_next_token ($) {
975          } else {          } else {
976                        
977            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
978              $self->{s_kwd} = '';
979            ## Reconsume.            ## Reconsume.
980            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
981                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
982                      line => $self->{line_prev},                      line => $self->{line_prev},
983                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
984                     });                     });
985            redo A;            redo A;
986          }          }
# Line 782  sub _get_next_token ($) { Line 994  sub _get_next_token ($) {
994                        
995            ## Reconsume.            ## Reconsume.
996            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
997              $self->{s_kwd} = '';
998            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
999                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
1000                      line => $self->{line_prev},                      line => $self->{line_prev},
1001                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
1002                     });                     });
1003            redo A;            redo A;
1004          } else {          } else {
# Line 794  sub _get_next_token ($) { Line 1007  sub _get_next_token ($) {
1007                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
1008                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
1009                   line => $self->{line_prev},                   line => $self->{line_prev},
1010                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
1011            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
1012            ## Reconsume.            ## Reconsume.
1013            redo A;            redo A;
# Line 833  sub _get_next_token ($) { Line 1046  sub _get_next_token ($) {
1046            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1047          }          }
1048          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1049            $self->{s_kwd} = '';
1050                    
1051      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1052        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 885  sub _get_next_token ($) { Line 1099  sub _get_next_token ($) {
1099            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1100          }          }
1101          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1102            $self->{s_kwd} = '';
1103          # reconsume          # reconsume
1104    
1105          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 924  sub _get_next_token ($) { Line 1139  sub _get_next_token ($) {
1139          redo A;          redo A;
1140        }        }
1141      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1142          ## XML5: "Tag attribute name before state".
1143    
1144        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1145                    
1146          ## Stay in the state          ## Stay in the state
# Line 955  sub _get_next_token ($) { Line 1172  sub _get_next_token ($) {
1172            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1173          }          }
1174          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1175            $self->{s_kwd} = '';
1176                    
1177      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1178        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1022  sub _get_next_token ($) { Line 1240  sub _get_next_token ($) {
1240            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1241          }          }
1242          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1243            $self->{s_kwd} = '';
1244          # reconsume          # reconsume
1245    
1246          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1031  sub _get_next_token ($) { Line 1250  sub _get_next_token ($) {
1250          if ({          if ({
1251               0x0022 => 1, # "               0x0022 => 1, # "
1252               0x0027 => 1, # '               0x0027 => 1, # '
1253                 0x003C => 1, # <
1254               0x003D => 1, # =               0x003D => 1, # =
1255              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1256                        
1257              ## XML5: Not a parse error.
1258            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1259          } else {          } else {
1260                        
1261              ## XML5: ":" raises a parse error and is ignored.
1262          }          }
1263          $self->{ca}          $self->{ca}
1264              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 1057  sub _get_next_token ($) { Line 1279  sub _get_next_token ($) {
1279          redo A;          redo A;
1280        }        }
1281      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1282          ## XML5: "Tag attribute name state".
1283    
1284        my $before_leave = sub {        my $before_leave = sub {
1285          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
1286              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 1067  sub _get_next_token ($) { Line 1291  sub _get_next_token ($) {
1291                        
1292            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
1293              = $self->{ca};              = $self->{ca};
1294              $self->{ca}->{index} = ++$self->{ct}->{last_index};
1295          }          }
1296        }; # $before_leave        }; # $before_leave
1297    
# Line 1103  sub _get_next_token ($) { Line 1328  sub _get_next_token ($) {
1328        
1329          redo A;          redo A;
1330        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1331            if ($self->{is_xml}) {
1332              
1333              ## XML5: Not a parse error.
1334              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1335            } else {
1336              
1337            }
1338    
1339          $before_leave->();          $before_leave->();
1340          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1341                        
# Line 1117  sub _get_next_token ($) { Line 1350  sub _get_next_token ($) {
1350            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1351          }          }
1352          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1353            $self->{s_kwd} = '';
1354                    
1355      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1356        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1151  sub _get_next_token ($) { Line 1385  sub _get_next_token ($) {
1385        
1386          redo A;          redo A;
1387        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1388            if ($self->{is_xml}) {
1389              
1390              ## XML5: Not a parse error.
1391              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1392            } else {
1393              
1394            }
1395                    
1396          $before_leave->();          $before_leave->();
1397          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
# Line 1185  sub _get_next_token ($) { Line 1426  sub _get_next_token ($) {
1426            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1427          }          }
1428          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1429            $self->{s_kwd} = '';
1430          # reconsume          # reconsume
1431    
1432          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
1433    
1434          redo A;          redo A;
1435        } else {        } else {
1436          if ($self->{nc} == 0x0022 or # "          if ({
1437              $self->{nc} == 0x0027) { # '               0x0022 => 1, # "
1438                 0x0027 => 1, # '
1439                 0x003C => 1, # <
1440                }->{$self->{nc}}) {
1441                        
1442              ## XML5: Not a parse error.
1443            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1444          } else {          } else {
1445                        
# Line 1214  sub _get_next_token ($) { Line 1460  sub _get_next_token ($) {
1460          redo A;          redo A;
1461        }        }
1462      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1463          ## XML5: "Tag attribute name after state".
1464          
1465        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1466                    
1467          ## Stay in the state          ## Stay in the state
# Line 1245  sub _get_next_token ($) { Line 1493  sub _get_next_token ($) {
1493        
1494          redo A;          redo A;
1495        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1496            if ($self->{is_xml}) {
1497              
1498              ## XML5: Not a parse error.
1499              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1500            } else {
1501              
1502            }
1503    
1504          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1505                        
1506            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1261  sub _get_next_token ($) { Line 1517  sub _get_next_token ($) {
1517            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1518          }          }
1519          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1520            $self->{s_kwd} = '';
1521                    
1522      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1523        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1297  sub _get_next_token ($) { Line 1554  sub _get_next_token ($) {
1554        
1555          redo A;          redo A;
1556        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1557            if ($self->{is_xml}) {
1558              
1559              ## XML5: Not a parse error.
1560              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1561            } else {
1562              
1563            }
1564                    
1565          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1566                    
# Line 1328  sub _get_next_token ($) { Line 1592  sub _get_next_token ($) {
1592          } else {          } else {
1593            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1594          }          }
1595            $self->{s_kwd} = '';
1596          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1597          # reconsume          # reconsume
1598    
# Line 1335  sub _get_next_token ($) { Line 1600  sub _get_next_token ($) {
1600    
1601          redo A;          redo A;
1602        } else {        } else {
1603          if ($self->{nc} == 0x0022 or # "          if ($self->{is_xml}) {
1604              $self->{nc} == 0x0027) { # '            
1605              ## XML5: Not a parse error.
1606              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1607            } else {
1608              
1609            }
1610    
1611            if ({
1612                 0x0022 => 1, # "
1613                 0x0027 => 1, # '
1614                 0x003C => 1, # <
1615                }->{$self->{nc}}) {
1616                        
1617              ## XML5: Not a parse error.
1618            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1619          } else {          } else {
1620                        
# Line 1361  sub _get_next_token ($) { Line 1638  sub _get_next_token ($) {
1638          redo A;                  redo A;        
1639        }        }
1640      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1641          ## XML5: "Tag attribute value before state".
1642    
1643        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1644                    
1645          ## Stay in the state          ## Stay in the state
# Line 1429  sub _get_next_token ($) { Line 1708  sub _get_next_token ($) {
1708            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1709          }          }
1710          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1711            $self->{s_kwd} = '';
1712                    
1713      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1714        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1462  sub _get_next_token ($) { Line 1742  sub _get_next_token ($) {
1742            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1743          }          }
1744          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1745            $self->{s_kwd} = '';
1746          ## reconsume          ## reconsume
1747    
1748          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
1749    
1750          redo A;          redo A;
1751        } else {        } else {
1752          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1753                        
1754              ## XML5: Not a parse error.
1755            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1756            } elsif ($self->{is_xml}) {
1757              
1758              ## XML5: No parse error.
1759              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1760          } else {          } else {
1761                        
1762          }          }
# Line 1490  sub _get_next_token ($) { Line 1776  sub _get_next_token ($) {
1776          redo A;          redo A;
1777        }        }
1778      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1779          ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1780          ## ATTLIST attribute value double quoted state".
1781          
1782        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1783                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1784          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1785              ## XML5: "DOCTYPE ATTLIST name after state".
1786              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1787              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1788            } else {
1789              
1790              ## XML5: "Tag attribute name before state".
1791              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1792            }
1793                    
1794      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1795        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1507  sub _get_next_token ($) { Line 1804  sub _get_next_token ($) {
1804          redo A;          redo A;
1805        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1806                    
1807            ## XML5: Not defined yet.
1808    
1809          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1810          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1811          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1526  sub _get_next_token ($) { Line 1825  sub _get_next_token ($) {
1825      }      }
1826        
1827          redo A;          redo A;
1828          } elsif ($self->{is_xml} and
1829                   $is_space->{$self->{nc}}) {
1830            
1831            $self->{ca}->{value} .= ' ';
1832            ## Stay in the state.
1833            
1834        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1835          $self->{line_prev} = $self->{line};
1836          $self->{column_prev} = $self->{column};
1837          $self->{column}++;
1838          $self->{nc}
1839              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1840        } else {
1841          $self->{set_nc}->($self);
1842        }
1843      
1844            redo A;
1845        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1846          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1847          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1848                        
1849            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1850    
1851              $self->{state} = DATA_STATE;
1852              $self->{s_kwd} = '';
1853              ## reconsume
1854              return  ($self->{ct}); # start tag
1855              redo A;
1856          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1857            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1858            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1540  sub _get_next_token ($) { Line 1862  sub _get_next_token ($) {
1862              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1863                            
1864            }            }
1865    
1866              $self->{state} = DATA_STATE;
1867              $self->{s_kwd} = '';
1868              ## reconsume
1869              return  ($self->{ct}); # end tag
1870              redo A;
1871            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1872              ## XML5: No parse error above; not defined yet.
1873              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1874              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1875              ## Reconsume.
1876              return  ($self->{ct}); # ATTLIST
1877              redo A;
1878          } else {          } else {
1879            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1880          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
1881        } else {        } else {
1882                    ## XML5 [ATTLIST]: Not defined yet.
1883            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1884              
1885              ## XML5: Not a parse error.
1886              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1887            } else {
1888              
1889            }
1890          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1891          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1892                                q["&],                                qq["&<\x09\x0C\x20],
1893                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1894    
1895          ## Stay in the state          ## Stay in the state
# Line 1571  sub _get_next_token ($) { Line 1907  sub _get_next_token ($) {
1907          redo A;          redo A;
1908        }        }
1909      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1910          ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1911          ## ATTLIST attribute value single quoted state".
1912    
1913        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1914                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1915          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1916              ## XML5: "DOCTYPE ATTLIST name after state".
1917              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1918              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1919            } else {
1920              
1921              ## XML5: "Before attribute name state" (sic).
1922              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1923            }
1924                    
1925      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1926        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1588  sub _get_next_token ($) { Line 1935  sub _get_next_token ($) {
1935          redo A;          redo A;
1936        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1937                    
1938            ## XML5: Not defined yet.
1939    
1940          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1941          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1942          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1607  sub _get_next_token ($) { Line 1956  sub _get_next_token ($) {
1956      }      }
1957        
1958          redo A;          redo A;
1959          } elsif ($self->{is_xml} and
1960                   $is_space->{$self->{nc}}) {
1961            
1962            $self->{ca}->{value} .= ' ';
1963            ## Stay in the state.
1964            
1965        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1966          $self->{line_prev} = $self->{line};
1967          $self->{column_prev} = $self->{column};
1968          $self->{column}++;
1969          $self->{nc}
1970              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1971        } else {
1972          $self->{set_nc}->($self);
1973        }
1974      
1975            redo A;
1976        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1977          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1978          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1979                        
1980            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1981    
1982              $self->{state} = DATA_STATE;
1983              $self->{s_kwd} = '';
1984              ## reconsume
1985              return  ($self->{ct}); # start tag
1986              redo A;
1987          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1988            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1989            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1621  sub _get_next_token ($) { Line 1993  sub _get_next_token ($) {
1993              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1994                            
1995            }            }
1996    
1997              $self->{state} = DATA_STATE;
1998              $self->{s_kwd} = '';
1999              ## reconsume
2000              return  ($self->{ct}); # end tag
2001              redo A;
2002            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2003              ## XML5: No parse error above; not defined yet.
2004              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2005              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2006              ## Reconsume.
2007              return  ($self->{ct}); # ATTLIST
2008              redo A;
2009          } else {          } else {
2010            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2011          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
2012        } else {        } else {
2013                    ## XML5 [ATTLIST]: Not defined yet.
2014            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2015              
2016              ## XML5: Not a parse error.
2017              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2018            } else {
2019              
2020            }
2021          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2022          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2023                                q['&],                                qq['&<\x09\x0C\x20],
2024                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2025    
2026          ## Stay in the state          ## Stay in the state
# Line 1652  sub _get_next_token ($) { Line 2038  sub _get_next_token ($) {
2038          redo A;          redo A;
2039        }        }
2040      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2041          ## XML5: "Tag attribute value unquoted state".
2042    
2043        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2044                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2045          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;            
2046              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2047              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2048            } else {
2049              
2050              ## XML5: "Tag attribute name before state".
2051              $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2052            }
2053                    
2054      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2055        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1669  sub _get_next_token ($) { Line 2064  sub _get_next_token ($) {
2064          redo A;          redo A;
2065        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
2066                    
2067    
2068            ## XML5: Not defined yet.
2069    
2070          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
2071          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
2072          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1692  sub _get_next_token ($) { Line 2090  sub _get_next_token ($) {
2090          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2091                        
2092            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2093    
2094              $self->{state} = DATA_STATE;
2095              $self->{s_kwd} = '';
2096              
2097        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2098          $self->{line_prev} = $self->{line};
2099          $self->{column_prev} = $self->{column};
2100          $self->{column}++;
2101          $self->{nc}
2102              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2103        } else {
2104          $self->{set_nc}->($self);
2105        }
2106      
2107              return  ($self->{ct}); # start tag
2108              redo A;
2109          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2110            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2111            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1701  sub _get_next_token ($) { Line 2115  sub _get_next_token ($) {
2115              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2116                            
2117            }            }
2118          } else {  
2119            die "$0: $self->{ct}->{type}: Unknown token type";            $self->{state} = DATA_STATE;
2120          }            $self->{s_kwd} = '';
2121          $self->{state} = DATA_STATE;            
           
2122      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2123        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
2124        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 1716  sub _get_next_token ($) { Line 2129  sub _get_next_token ($) {
2129        $self->{set_nc}->($self);        $self->{set_nc}->($self);
2130      }      }
2131        
2132              return  ($self->{ct}); # end tag
2133          return  ($self->{ct}); # start tag or end tag            redo A;
2134            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2135          redo A;            push @{$self->{ct}->{attrdefs}}, $self->{ca};
2136              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2137              
2138        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2139          $self->{line_prev} = $self->{line};
2140          $self->{column_prev} = $self->{column};
2141          $self->{column}++;
2142          $self->{nc}
2143              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2144        } else {
2145          $self->{set_nc}->($self);
2146        }
2147      
2148              return  ($self->{ct}); # ATTLIST
2149              redo A;
2150            } else {
2151              die "$0: $self->{ct}->{type}: Unknown token type";
2152            }
2153        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');  
2154          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2155                        
2156              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2157            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2158    
2159              $self->{state} = DATA_STATE;
2160              $self->{s_kwd} = '';
2161              ## reconsume
2162              return  ($self->{ct}); # start tag
2163              redo A;
2164          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2165              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2166            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2167            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
2168                            
# Line 1734  sub _get_next_token ($) { Line 2171  sub _get_next_token ($) {
2171              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2172                            
2173            }            }
2174    
2175              $self->{state} = DATA_STATE;
2176              $self->{s_kwd} = '';
2177              ## reconsume
2178              return  ($self->{ct}); # end tag
2179              redo A;
2180            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2181              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2182              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2183              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2184              ## Reconsume.
2185              return  ($self->{ct}); # ATTLIST
2186              redo A;
2187          } else {          } else {
2188            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2189          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
2190        } else {        } else {
2191          if ({          if ({
2192               0x0022 => 1, # "               0x0022 => 1, # "
2193               0x0027 => 1, # '               0x0027 => 1, # '
2194               0x003D => 1, # =               0x003D => 1, # =
2195                 0x003C => 1, # <
2196              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2197                        
2198              ## XML5: Not a parse error.
2199            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2200          } else {          } else {
2201                        
2202          }          }
2203          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2204          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2205                                q["'=& >],                                qq["'=& \x09\x0C>],
2206                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2207    
2208          ## Stay in the state          ## Stay in the state
# Line 1806  sub _get_next_token ($) { Line 2252  sub _get_next_token ($) {
2252            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2253          }          }
2254          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2255            $self->{s_kwd} = '';
2256                    
2257      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2258        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1853  sub _get_next_token ($) { Line 2300  sub _get_next_token ($) {
2300            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2301          }          }
2302          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2303            $self->{s_kwd} = '';
2304          ## Reconsume.          ## Reconsume.
2305          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
2306          redo A;          redo A;
# Line 1864  sub _get_next_token ($) { Line 2312  sub _get_next_token ($) {
2312          redo A;          redo A;
2313        }        }
2314      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2315          ## XML5: "Empty tag state".
2316    
2317        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2318          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
2319                        
# Line 1883  sub _get_next_token ($) { Line 2333  sub _get_next_token ($) {
2333          }          }
2334    
2335          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2336            $self->{s_kwd} = '';
2337                    
2338      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2339        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1914  sub _get_next_token ($) { Line 2365  sub _get_next_token ($) {
2365          } else {          } else {
2366            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2367          }          }
2368            ## XML5: "Tag attribute name before state".
2369          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2370            $self->{s_kwd} = '';
2371          ## Reconsume.          ## Reconsume.
2372          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
2373          redo A;          redo A;
# Line 1927  sub _get_next_token ($) { Line 2380  sub _get_next_token ($) {
2380          redo A;          redo A;
2381        }        }
2382      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2383        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2384    
2385        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
2386        ## consumes characters one-by-one basis.        ## consumes characters one-by-one basis.
2387                
2388        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2389                    if ($self->{in_subset}) {
2390          $self->{state} = DATA_STATE;            
2391              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2392            } else {
2393              
2394              $self->{state} = DATA_STATE;
2395              $self->{s_kwd} = '';
2396            }
2397                    
2398      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2399        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1950  sub _get_next_token ($) { Line 2409  sub _get_next_token ($) {
2409          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
2410          redo A;          redo A;
2411        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2412                    if ($self->{in_subset}) {
2413          $self->{state} = DATA_STATE;            
2414              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2415            } else {
2416              
2417              $self->{state} = DATA_STATE;
2418              $self->{s_kwd} = '';
2419            }
2420          ## reconsume          ## reconsume
2421    
2422          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 1978  sub _get_next_token ($) { Line 2443  sub _get_next_token ($) {
2443          redo A;          redo A;
2444        }        }
2445      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2446        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
2447                
2448        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2449                    
# Line 2000  sub _get_next_token ($) { Line 2465  sub _get_next_token ($) {
2465          ## ASCII case-insensitive.          ## ASCII case-insensitive.
2466                    
2467          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
2468          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2469                    
2470      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2471        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2019  sub _get_next_token ($) { Line 2484  sub _get_next_token ($) {
2484                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2485                                                    
2486          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
2487          $self->{s_kwd} = '[';          $self->{kwd} = '[';
2488                    
2489      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2490        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2053  sub _get_next_token ($) { Line 2518  sub _get_next_token ($) {
2518                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2519                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
2520                                   };                                   };
2521          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2522                    
2523      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2524        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2089  sub _get_next_token ($) { Line 2554  sub _get_next_token ($) {
2554              0x0054, # T              0x0054, # T
2555              0x0059, # Y              0x0059, # Y
2556              0x0050, # P              0x0050, # P
2557            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2558            $self->{nc} == [            $self->{nc} == [
2559              undef,              undef,
2560              0x006F, # o              0x006F, # o
# Line 2097  sub _get_next_token ($) { Line 2562  sub _get_next_token ($) {
2562              0x0074, # t              0x0074, # t
2563              0x0079, # y              0x0079, # y
2564              0x0070, # p              0x0070, # p
2565            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2566                    
2567          ## Stay in the state.          ## Stay in the state.
2568          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2569                    
2570      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2571        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2113  sub _get_next_token ($) { Line 2578  sub _get_next_token ($) {
2578      }      }
2579        
2580          redo A;          redo A;
2581        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
2582                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2583                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2584                    if ($self->{is_xml} and
2585                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2586              
2587              ## XML5: case-sensitive.
2588              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2589                              text => 'DOCTYPE',
2590                              line => $self->{line_prev},
2591                              column => $self->{column_prev} - 5);
2592            } else {
2593              
2594            }
2595          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
2596          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
2597                                    quirks => 1,                                    quirks => 1,
# Line 2139  sub _get_next_token ($) { Line 2614  sub _get_next_token ($) {
2614                                    
2615          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2616                          line => $self->{line_prev},                          line => $self->{line_prev},
2617                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2618          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2619          ## Reconsume.          ## Reconsume.
2620          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2621                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2622                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2623                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2624                                   };                                   };
2625          redo A;          redo A;
2626        }        }
# Line 2156  sub _get_next_token ($) { Line 2631  sub _get_next_token ($) {
2631              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
2632              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
2633              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
2634            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
2635                    
2636          ## Stay in the state.          ## Stay in the state.
2637          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2638                    
2639      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2640        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2172  sub _get_next_token ($) { Line 2647  sub _get_next_token ($) {
2647      }      }
2648        
2649          redo A;          redo A;
2650        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
2651                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2652                    if ($self->{is_xml} and
2653                not $self->{tainted} and
2654                @{$self->{open_elements} or []} == 0) {
2655              
2656              $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2657                              line => $self->{line_prev},
2658                              column => $self->{column_prev} - 7);
2659              $self->{tainted} = 1;
2660            } else {
2661              
2662            }
2663    
2664          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
2665                                    data => '',                                    data => '',
2666                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 2196  sub _get_next_token ($) { Line 2682  sub _get_next_token ($) {
2682                    
2683          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2684                          line => $self->{line_prev},                          line => $self->{line_prev},
2685                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2686          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2687          ## Reconsume.          ## Reconsume.
2688          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2689                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2690                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2691                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2692                                   };                                   };
2693          redo A;          redo A;
2694        }        }
# Line 2223  sub _get_next_token ($) { Line 2709  sub _get_next_token ($) {
2709        
2710          redo A;          redo A;
2711        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2712          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2713          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2714              
2715              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2716            } else {
2717              
2718              $self->{state} = DATA_STATE;
2719              $self->{s_kwd} = '';
2720            }
2721                    
2722      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2723        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2242  sub _get_next_token ($) { Line 2734  sub _get_next_token ($) {
2734    
2735          redo A;          redo A;
2736        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2737          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2738          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2739              
2740              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2741            } else {
2742              
2743              $self->{state} = DATA_STATE;
2744              $self->{s_kwd} = '';
2745            }
2746          ## reconsume          ## reconsume
2747    
2748          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2285  sub _get_next_token ($) { Line 2783  sub _get_next_token ($) {
2783        
2784          redo A;          redo A;
2785        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2786          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2787          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2788              
2789              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2790            } else {
2791              
2792              $self->{state} = DATA_STATE;
2793              $self->{s_kwd} = '';
2794            }
2795                    
2796      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2797        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2304  sub _get_next_token ($) { Line 2808  sub _get_next_token ($) {
2808    
2809          redo A;          redo A;
2810        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2811          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2812          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2813              
2814              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2815            } else {
2816              
2817              $self->{state} = DATA_STATE;
2818              $self->{s_kwd} = '';
2819            }
2820          ## reconsume          ## reconsume
2821    
2822          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2331  sub _get_next_token ($) { Line 2841  sub _get_next_token ($) {
2841          redo A;          redo A;
2842        }        }
2843      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
2844          ## XML5: "Comment state" and "DOCTYPE comment state".
2845    
2846        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2847                    
2848          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
# Line 2347  sub _get_next_token ($) { Line 2859  sub _get_next_token ($) {
2859        
2860          redo A;          redo A;
2861        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2862          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2863          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2864              
2865              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2866            } else {
2867              
2868              $self->{state} = DATA_STATE;
2869              $self->{s_kwd} = '';
2870            }
2871          ## reconsume          ## reconsume
2872    
2873          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2377  sub _get_next_token ($) { Line 2895  sub _get_next_token ($) {
2895          redo A;          redo A;
2896        }        }
2897      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2898          ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2899    
2900        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2901                    
2902          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 2393  sub _get_next_token ($) { Line 2913  sub _get_next_token ($) {
2913        
2914          redo A;          redo A;
2915        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2916          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2917          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2918              
2919              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2920            } else {
2921              
2922              $self->{state} = DATA_STATE;
2923              $self->{s_kwd} = '';
2924            }
2925          ## reconsume          ## reconsume
2926    
2927          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2418  sub _get_next_token ($) { Line 2944  sub _get_next_token ($) {
2944        
2945          redo A;          redo A;
2946        }        }
2947      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE or
2948                 $self->{state} == COMMENT_END_BANG_STATE) {
2949          ## XML5: "Comment end state" and "DOCTYPE comment end state".
2950          ## (No comment end bang state.)
2951    
2952        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2953                    if ($self->{in_subset}) {
2954          $self->{state} = DATA_STATE;            
2955              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2956            } else {
2957              
2958              $self->{state} = DATA_STATE;
2959              $self->{s_kwd} = '';
2960            }
2961                    
2962      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2963        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2438  sub _get_next_token ($) { Line 2974  sub _get_next_token ($) {
2974    
2975          redo A;          redo A;
2976        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2977            if ($self->{state} == COMMENT_END_BANG_STATE) {
2978              
2979              $self->{ct}->{data} .= '--!'; # comment
2980              $self->{state} = COMMENT_END_DASH_STATE;
2981            } else {
2982              
2983              ## XML5: Not a parse error.
2984              $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2985                              line => $self->{line_prev},
2986                              column => $self->{column_prev});
2987              $self->{ct}->{data} .= '-'; # comment
2988              ## Stay in the state
2989            }
2990                    
2991          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2992                          line => $self->{line_prev},        $self->{line_prev} = $self->{line};
2993                          column => $self->{column_prev});        $self->{column_prev} = $self->{column};
2994          $self->{ct}->{data} .= '-'; # comment        $self->{column}++;
2995          ## Stay in the state        $self->{nc}
2996              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2997        } else {
2998          $self->{set_nc}->($self);
2999        }
3000      
3001            redo A;
3002          } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3003                   $is_space->{$self->{nc}}) {
3004            
3005            $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end space'); # XXX error type
3006            $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3007            $self->{state} = COMMENT_END_SPACE_STATE;
3008                    
3009      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3010        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2456  sub _get_next_token ($) { Line 3017  sub _get_next_token ($) {
3017      }      }
3018        
3019          redo A;          redo A;
3020        } elsif ($self->{nc} == -1) {        } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3021                   $self->{nc} == 0x0021) { # !
3022                    
3023            $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
3024            $self->{state} = COMMENT_END_BANG_STATE;
3025            
3026        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3027          $self->{line_prev} = $self->{line};
3028          $self->{column_prev} = $self->{column};
3029          $self->{column}++;
3030          $self->{nc}
3031              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3032        } else {
3033          $self->{set_nc}->($self);
3034        }
3035      
3036            redo A;
3037          } elsif ($self->{nc} == -1) {
3038          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3039          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3040          ## reconsume            
3041              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3042            } else {
3043              
3044              $self->{state} = DATA_STATE;
3045              $self->{s_kwd} = '';
3046            }
3047            ## Reconsume.
3048    
3049          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
3050    
3051          redo A;          redo A;
3052        } else {        } else {
3053                    
3054          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          if ($self->{state} == COMMENT_END_BANG_STATE) {
3055                          line => $self->{line_prev},            $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
3056                          column => $self->{column_prev});          } else {
3057          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment            $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3058            }
3059          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
3060                    
3061      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2485  sub _get_next_token ($) { Line 3070  sub _get_next_token ($) {
3070        
3071          redo A;          redo A;
3072        }        }
3073        } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
3074          ## XML5: No such state.
3075    
3076          if ($self->{nc} == 0x003E) { # >
3077            if ($self->{in_subset}) {
3078              
3079              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3080            } else {
3081              
3082              $self->{state} = DATA_STATE;
3083              $self->{s_kwd} = '';
3084            }
3085            
3086        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3087          $self->{line_prev} = $self->{line};
3088          $self->{column_prev} = $self->{column};
3089          $self->{column}++;
3090          $self->{nc}
3091              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3092        } else {
3093          $self->{set_nc}->($self);
3094        }
3095      
3096    
3097            return  ($self->{ct}); # comment
3098    
3099            redo A;
3100          } elsif ($is_space->{$self->{nc}}) {
3101            
3102            $self->{ct}->{data} .= chr ($self->{nc}); # comment
3103            ## Stay in the state.
3104            
3105        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106          $self->{line_prev} = $self->{line};
3107          $self->{column_prev} = $self->{column};
3108          $self->{column}++;
3109          $self->{nc}
3110              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3111        } else {
3112          $self->{set_nc}->($self);
3113        }
3114      
3115            redo A;
3116          } elsif ($self->{nc} == -1) {
3117            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3118            if ($self->{in_subset}) {
3119              
3120              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3121            } else {
3122              
3123              $self->{state} = DATA_STATE;
3124              $self->{s_kwd} = '';
3125            }
3126            ## Reconsume.
3127    
3128            return  ($self->{ct}); # comment
3129    
3130            redo A;
3131          } else {
3132            
3133            $self->{ct}->{data} .= chr ($self->{nc}); # comment
3134            $self->{state} = COMMENT_STATE;
3135            
3136        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3137          $self->{line_prev} = $self->{line};
3138          $self->{column_prev} = $self->{column};
3139          $self->{column}++;
3140          $self->{nc}
3141              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3142        } else {
3143          $self->{set_nc}->($self);
3144        }
3145      
3146            redo A;
3147          }
3148      } elsif ($self->{state} == DOCTYPE_STATE) {      } elsif ($self->{state} == DOCTYPE_STATE) {
3149        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3150                    
# Line 2501  sub _get_next_token ($) { Line 3161  sub _get_next_token ($) {
3161      }      }
3162        
3163          redo A;          redo A;
3164          } elsif ($self->{nc} == -1) {
3165            
3166            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3167            $self->{ct}->{quirks} = 1;
3168    
3169            $self->{state} = DATA_STATE;
3170            ## Reconsume.
3171            return  ($self->{ct}); # DOCTYPE (quirks)
3172    
3173            redo A;
3174        } else {        } else {
3175                    
3176            ## XML5: Switch to the bogus comment state.
3177          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3178          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3179          ## reconsume          ## reconsume
3180          redo A;          redo A;
3181        }        }
3182      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3183          ## XML5: "DOCTYPE root name before state".
3184    
3185        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3186                    
3187          ## Stay in the state          ## Stay in the state
# Line 2526  sub _get_next_token ($) { Line 3199  sub _get_next_token ($) {
3199          redo A;          redo A;
3200        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3201                    
3202            ## XML5: No parse error.
3203          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3204          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3205            $self->{s_kwd} = '';
3206                    
3207      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3208        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2543  sub _get_next_token ($) { Line 3218  sub _get_next_token ($) {
3218          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3219    
3220          redo A;          redo A;
3221          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3222            
3223            $self->{ct}->{name} # DOCTYPE
3224                = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3225            delete $self->{ct}->{quirks};
3226            $self->{state} = DOCTYPE_NAME_STATE;
3227            
3228        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3229          $self->{line_prev} = $self->{line};
3230          $self->{column_prev} = $self->{column};
3231          $self->{column}++;
3232          $self->{nc}
3233              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3234        } else {
3235          $self->{set_nc}->($self);
3236        }
3237      
3238            redo A;
3239        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3240                    
3241          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3242          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3243            $self->{s_kwd} = '';
3244          ## reconsume          ## reconsume
3245    
3246          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3247    
3248          redo A;          redo A;
3249          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3250            
3251            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3252            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3253            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3254            $self->{in_subset} = 1;
3255            
3256        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3257          $self->{line_prev} = $self->{line};
3258          $self->{column_prev} = $self->{column};
3259          $self->{column}++;
3260          $self->{nc}
3261              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3262        } else {
3263          $self->{set_nc}->($self);
3264        }
3265      
3266            return  ($self->{ct}); # DOCTYPE
3267            redo A;
3268        } else {        } else {
3269                    
3270          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 2571  sub _get_next_token ($) { Line 3284  sub _get_next_token ($) {
3284          redo A;          redo A;
3285        }        }
3286      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3287  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
3288    
3289          ## ISSUE: Redundant "First," in the spec.
3290    
3291        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3292                    
3293          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 2590  sub _get_next_token ($) { Line 3306  sub _get_next_token ($) {
3306        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3307                    
3308          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3309            $self->{s_kwd} = '';
3310                    
3311      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3312        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2605  sub _get_next_token ($) { Line 3322  sub _get_next_token ($) {
3322          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3323    
3324          redo A;          redo A;
3325          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3326            
3327            $self->{ct}->{name} # DOCTYPE
3328                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3329            delete $self->{ct}->{quirks};
3330            ## Stay in the state.
3331            
3332        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3333          $self->{line_prev} = $self->{line};
3334          $self->{column_prev} = $self->{column};
3335          $self->{column}++;
3336          $self->{nc}
3337              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3338        } else {
3339          $self->{set_nc}->($self);
3340        }
3341      
3342            redo A;
3343        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3344                    
3345          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3346          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3347            $self->{s_kwd} = '';
3348          ## reconsume          ## reconsume
3349    
3350          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3351          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3352    
3353          redo A;          redo A;
3354          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3355            
3356            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3357            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3358            $self->{in_subset} = 1;
3359            
3360        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3361          $self->{line_prev} = $self->{line};
3362          $self->{column_prev} = $self->{column};
3363          $self->{column}++;
3364          $self->{nc}
3365              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3366        } else {
3367          $self->{set_nc}->($self);
3368        }
3369      
3370            return  ($self->{ct}); # DOCTYPE
3371            redo A;
3372        } else {        } else {
3373                    
3374          $self->{ct}->{name}          $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3375            .= chr ($self->{nc}); # DOCTYPE          ## Stay in the state.
         ## Stay in the state  
3376                    
3377      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3378        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2634  sub _get_next_token ($) { Line 3387  sub _get_next_token ($) {
3387          redo A;          redo A;
3388        }        }
3389      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3390          ## XML5: Corresponding to XML5's "DOCTYPE root name after
3391          ## state", but implemented differently.
3392    
3393        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3394                    
3395          ## Stay in the state          ## Stay in the state
# Line 2650  sub _get_next_token ($) { Line 3406  sub _get_next_token ($) {
3406        
3407          redo A;          redo A;
3408        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3409            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3410              
3411              $self->{state} = DATA_STATE;
3412              $self->{s_kwd} = '';
3413            } else {
3414              
3415              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3416              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3417            }
3418                    
         $self->{state} = DATA_STATE;  
3419                    
3420      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3421        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2663  sub _get_next_token ($) { Line 3427  sub _get_next_token ($) {
3427        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3428      }      }
3429        
3430            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3431          redo A;          redo A;
3432        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3433            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3434              
3435              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3436              $self->{state} = DATA_STATE;
3437              $self->{s_kwd} = '';
3438              $self->{ct}->{quirks} = 1;
3439            } else {
3440              
3441              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3442              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3443            }
3444                    
3445          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          ## Reconsume.
3446          $self->{state} = DATA_STATE;          return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         ## reconsume  
   
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3447          redo A;          redo A;
3448        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
3449                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
3450            
3451          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
3452          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3453                    
3454      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3455        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2695  sub _get_next_token ($) { Line 3464  sub _get_next_token ($) {
3464          redo A;          redo A;
3465        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
3466                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
3467            
3468          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
3469          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3470                    
3471      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3472        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2709  sub _get_next_token ($) { Line 3479  sub _get_next_token ($) {
3479      }      }
3480        
3481          redo A;          redo A;
3482        } else {        } elsif ($self->{nc} == 0x0022 and # "
3483                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3484                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3485                    
3486          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');          $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3487          $self->{ct}->{quirks} = 1;          $self->{ct}->{value} = ''; # ENTITY
3488            
3489        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3490          $self->{line_prev} = $self->{line};
3491          $self->{column_prev} = $self->{column};
3492          $self->{column}++;
3493          $self->{nc}
3494              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3495        } else {
3496          $self->{set_nc}->($self);
3497        }
3498      
3499            redo A;
3500          } elsif ($self->{nc} == 0x0027 and # '
3501                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3502                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3503            
3504            $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3505            $self->{ct}->{value} = ''; # ENTITY
3506            
3507        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3508          $self->{line_prev} = $self->{line};
3509          $self->{column_prev} = $self->{column};
3510          $self->{column}++;
3511          $self->{nc}
3512              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3513        } else {
3514          $self->{set_nc}->($self);
3515        }
3516      
3517            redo A;
3518          } elsif ($self->{is_xml} and
3519                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3520                   $self->{nc} == 0x005B) { # [
3521            
3522            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3523            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3524            $self->{in_subset} = 1;
3525            
3526        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3527          $self->{line_prev} = $self->{line};
3528          $self->{column_prev} = $self->{column};
3529          $self->{column}++;
3530          $self->{nc}
3531              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3532        } else {
3533          $self->{set_nc}->($self);
3534        }
3535      
3536            return  ($self->{ct}); # DOCTYPE
3537            redo A;
3538          } else {
3539            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3540    
3541            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3542              
3543              $self->{ct}->{quirks} = 1;
3544              $self->{state} = BOGUS_DOCTYPE_STATE;
3545            } else {
3546              
3547              $self->{state} = BOGUS_MD_STATE;
3548            }
3549    
         $self->{state} = BOGUS_DOCTYPE_STATE;  
3550                    
3551      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3552        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2736  sub _get_next_token ($) { Line 3568  sub _get_next_token ($) {
3568              0x0042, # B              0x0042, # B
3569              0x004C, # L              0x004C, # L
3570              0x0049, # I              0x0049, # I
3571            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3572            $self->{nc} == [            $self->{nc} == [
3573              undef,              undef,
3574              0x0075, # u              0x0075, # u
3575              0x0062, # b              0x0062, # b
3576              0x006C, # l              0x006C, # l
3577              0x0069, # i              0x0069, # i
3578            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3579                    
3580          ## Stay in the state.          ## Stay in the state.
3581          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3582                    
3583      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3584        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2759  sub _get_next_token ($) { Line 3591  sub _get_next_token ($) {
3591      }      }
3592        
3593          redo A;          redo A;
3594        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3595                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
3596                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
3597                    if ($self->{is_xml} and
3598                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3599              
3600              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3601                              text => 'PUBLIC',
3602                              line => $self->{line_prev},
3603                              column => $self->{column_prev} - 4);
3604            } else {
3605              
3606            }
3607          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3608                    
3609      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2777  sub _get_next_token ($) { Line 3618  sub _get_next_token ($) {
3618        
3619          redo A;          redo A;
3620        } else {        } else {
3621                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3622                          line => $self->{line_prev},                          line => $self->{line_prev},
3623                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3624          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3625              
3626          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3627              $self->{state} = BOGUS_DOCTYPE_STATE;
3628            } else {
3629              
3630              $self->{state} = BOGUS_MD_STATE;
3631            }
3632          ## Reconsume.          ## Reconsume.
3633          redo A;          redo A;
3634        }        }
# Line 2795  sub _get_next_token ($) { Line 3640  sub _get_next_token ($) {
3640              0x0053, # S              0x0053, # S
3641              0x0054, # T              0x0054, # T
3642              0x0045, # E              0x0045, # E
3643            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3644            $self->{nc} == [            $self->{nc} == [
3645              undef,              undef,
3646              0x0079, # y              0x0079, # y
3647              0x0073, # s              0x0073, # s
3648              0x0074, # t              0x0074, # t
3649              0x0065, # e              0x0065, # e
3650            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3651                    
3652          ## Stay in the state.          ## Stay in the state.
3653          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3654                    
3655      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3656        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2818  sub _get_next_token ($) { Line 3663  sub _get_next_token ($) {
3663      }      }
3664        
3665          redo A;          redo A;
3666        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3667                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
3668                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
3669                    if ($self->{is_xml} and
3670                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3671              
3672              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3673                              text => 'SYSTEM',
3674                              line => $self->{line_prev},
3675                              column => $self->{column_prev} - 4);
3676            } else {
3677              
3678            }
3679          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3680                    
3681      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2836  sub _get_next_token ($) { Line 3690  sub _get_next_token ($) {
3690        
3691          redo A;          redo A;
3692        } else {        } else {
3693                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3694                          line => $self->{line_prev},                          line => $self->{line_prev},
3695                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3696          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3697              
3698          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3699              $self->{state} = BOGUS_DOCTYPE_STATE;
3700            } else {
3701              
3702              $self->{state} = BOGUS_MD_STATE;
3703            }
3704          ## Reconsume.          ## Reconsume.
3705          redo A;          redo A;
3706        }        }
# Line 2895  sub _get_next_token ($) { Line 3753  sub _get_next_token ($) {
3753        
3754          redo A;          redo A;
3755        } elsif ($self->{nc} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
           
3756          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3757            
3758          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3759              
3760              $self->{state} = DATA_STATE;
3761              $self->{s_kwd} = '';
3762              $self->{ct}->{quirks} = 1;
3763            } else {
3764              
3765              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3766            }
3767            
3768                    
3769      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3770        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2910  sub _get_next_token ($) { Line 3776  sub _get_next_token ($) {
3776        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3777      }      }
3778        
3779            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3780          redo A;          redo A;
3781        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3782            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3783              
3784              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3785              $self->{state} = DATA_STATE;
3786              $self->{s_kwd} = '';
3787              $self->{ct}->{quirks} = 1;
3788            } else {
3789              
3790              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3791              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3792            }
3793                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
3794          ## reconsume          ## reconsume
   
         $self->{ct}->{quirks} = 1;  
3795          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3796          redo A;          redo A;
3797        } else {        } elsif ($self->{is_xml} and
3798                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3799                   $self->{nc} == 0x005B) { # [
3800            
3801            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3802            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3803            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3804            $self->{in_subset} = 1;
3805                    
3806        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3807          $self->{line_prev} = $self->{line};
3808          $self->{column_prev} = $self->{column};
3809          $self->{column}++;
3810          $self->{nc}
3811              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3812        } else {
3813          $self->{set_nc}->($self);
3814        }
3815      
3816            return  ($self->{ct}); # DOCTYPE
3817            redo A;
3818          } else {
3819          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
         $self->{ct}->{quirks} = 1;  
3820    
3821          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3822              
3823              $self->{ct}->{quirks} = 1;
3824              $self->{state} = BOGUS_DOCTYPE_STATE;
3825            } else {
3826              
3827              $self->{state} = BOGUS_MD_STATE;
3828            }
3829    
3830                    
3831      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3832        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2962  sub _get_next_token ($) { Line 3857  sub _get_next_token ($) {
3857        
3858          redo A;          redo A;
3859        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3860          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3861    
3862          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3863              
3864              $self->{state} = DATA_STATE;
3865              $self->{s_kwd} = '';
3866              $self->{ct}->{quirks} = 1;
3867            } else {
3868              
3869              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3870            }
3871    
3872                    
3873      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3874        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2977  sub _get_next_token ($) { Line 3880  sub _get_next_token ($) {
3880        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3881      }      }
3882        
3883            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3884          redo A;          redo A;
3885        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3886          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3887    
3888          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3889          ## reconsume            
3890              $self->{state} = DATA_STATE;
3891          $self->{ct}->{quirks} = 1;            $self->{s_kwd} = '';
3892              $self->{ct}->{quirks} = 1;
3893            } else {
3894              
3895              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3896            }
3897            
3898            ## Reconsume.
3899          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3900          redo A;          redo A;
3901        } else {        } else {
3902                    
3903          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3904          $self->{read_until}->($self->{ct}->{pubid}, q[">],          $self->{read_until}->($self->{ct}->{pubid}, q[">],
3905                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3906    
# Line 3031  sub _get_next_token ($) { Line 3935  sub _get_next_token ($) {
3935        
3936          redo A;          redo A;
3937        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3938          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3939    
3940          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3941              
3942              $self->{state} = DATA_STATE;
3943              $self->{s_kwd} = '';
3944              $self->{ct}->{quirks} = 1;
3945            } else {
3946              
3947              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3948            }
3949    
3950                    
3951      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3952        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3046  sub _get_next_token ($) { Line 3958  sub _get_next_token ($) {
3958        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3959      }      }
3960        
3961            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3962          redo A;          redo A;
3963        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3964          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3965    
3966          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3967              
3968              $self->{state} = DATA_STATE;
3969              $self->{s_kwd} = '';
3970              $self->{ct}->{quirks} = 1;
3971            } else {
3972              
3973              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3974            }
3975          
3976          ## reconsume          ## reconsume
3977            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3978          redo A;          redo A;
3979        } else {        } else {
3980                    
3981          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3982          $self->{read_until}->($self->{ct}->{pubid}, q['>],          $self->{read_until}->($self->{ct}->{pubid}, q['>],
3983                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3984    
# Line 3101  sub _get_next_token ($) { Line 4014  sub _get_next_token ($) {
4014          redo A;          redo A;
4015        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
4016                    
4017          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4018          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4019                    
4020      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3117  sub _get_next_token ($) { Line 4030  sub _get_next_token ($) {
4030          redo A;          redo A;
4031        } elsif ($self->{nc} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
4032                    
4033          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4034          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4035                    
4036      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3132  sub _get_next_token ($) { Line 4045  sub _get_next_token ($) {
4045        
4046          redo A;          redo A;
4047        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4048            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4049              if ($self->{is_xml}) {
4050                
4051                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4052              } else {
4053                
4054              }
4055              $self->{state} = DATA_STATE;
4056              $self->{s_kwd} = '';
4057            } else {
4058              if ($self->{ct}->{type} == NOTATION_TOKEN) {
4059                
4060              } else {
4061                
4062                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');            
4063              }
4064              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4065            }
4066                    
         $self->{state} = DATA_STATE;  
4067                    
4068      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4069        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3145  sub _get_next_token ($) { Line 4075  sub _get_next_token ($) {
4075        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4076      }      }
4077        
4078            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
4079          redo A;          redo A;
4080        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4081            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4082              
4083              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4084              
4085              $self->{state} = DATA_STATE;
4086              $self->{s_kwd} = '';
4087              $self->{ct}->{quirks} = 1;
4088            } else {
4089              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4090              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4091            }
4092                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
4093          ## reconsume          ## reconsume
4094            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4095          $self->{ct}->{quirks} = 1;          redo A;
4096          } elsif ($self->{is_xml} and
4097                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4098                   $self->{nc} == 0x005B) { # [
4099            
4100            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4101            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4102            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4103            $self->{in_subset} = 1;
4104            
4105        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4106          $self->{line_prev} = $self->{line};
4107          $self->{column_prev} = $self->{column};
4108          $self->{column}++;
4109          $self->{nc}
4110              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4111        } else {
4112          $self->{set_nc}->($self);
4113        }
4114      
4115          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
4116          redo A;          redo A;
4117        } else {        } else {
           
4118          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
         $self->{ct}->{quirks} = 1;  
4119    
4120          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4121              
4122              $self->{ct}->{quirks} = 1;
4123              $self->{state} = BOGUS_DOCTYPE_STATE;
4124            } else {
4125              
4126              $self->{state} = BOGUS_MD_STATE;
4127            }
4128    
4129                    
4130      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4131        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3228  sub _get_next_token ($) { Line 4188  sub _get_next_token ($) {
4188        
4189          redo A;          redo A;
4190        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
4191          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
         $self->{state} = DATA_STATE;  
4192                    
4193      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4194        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3243  sub _get_next_token ($) { Line 4201  sub _get_next_token ($) {
4201      }      }
4202        
4203    
4204          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4205          return  ($self->{ct}); # DOCTYPE            
4206              $self->{state} = DATA_STATE;
4207              $self->{s_kwd} = '';
4208              $self->{ct}->{quirks} = 1;
4209            } else {
4210              
4211              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4212            }
4213    
4214            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4215          redo A;          redo A;
4216        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4217            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4218              
4219              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4220              $self->{state} = DATA_STATE;
4221              $self->{s_kwd} = '';
4222              $self->{ct}->{quirks} = 1;
4223            } else {
4224              
4225              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4226              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4227            }
4228                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
4229          ## reconsume          ## reconsume
4230            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4231            redo A;
4232          } elsif ($self->{is_xml} and
4233                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4234                   $self->{nc} == 0x005B) { # [
4235            
4236            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4237    
4238          $self->{ct}->{quirks} = 1;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4239            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4240            $self->{in_subset} = 1;
4241            
4242        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4243          $self->{line_prev} = $self->{line};
4244          $self->{column_prev} = $self->{column};
4245          $self->{column}++;
4246          $self->{nc}
4247              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4248        } else {
4249          $self->{set_nc}->($self);
4250        }
4251      
4252          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
4253          redo A;          redo A;
4254        } else {        } else {
           
4255          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
         $self->{ct}->{quirks} = 1;  
4256    
4257          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4258                        
4259              $self->{ct}->{quirks} = 1;
4260              $self->{state} = BOGUS_DOCTYPE_STATE;
4261            } else {
4262              
4263              $self->{state} = BOGUS_MD_STATE;
4264            }
4265    
4266                    
4267      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4268        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3293  sub _get_next_token ($) { Line 4292  sub _get_next_token ($) {
4292      }      }
4293        
4294          redo A;          redo A;
4295        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
           
4296          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4297    
4298          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4299              
4300              $self->{state} = DATA_STATE;
4301              $self->{s_kwd} = '';
4302              $self->{ct}->{quirks} = 1;
4303            } else {
4304              
4305              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4306            }
4307            
4308                    
4309      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4310        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3309  sub _get_next_token ($) { Line 4316  sub _get_next_token ($) {
4316        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4317      }      }
4318        
4319            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4320          redo A;          redo A;
4321        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4322          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4323    
4324          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4325              
4326              $self->{state} = DATA_STATE;
4327              $self->{s_kwd} = '';
4328              $self->{ct}->{quirks} = 1;
4329            } else {
4330              
4331              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4332            }
4333            
4334          ## reconsume          ## reconsume
4335            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4336          redo A;          redo A;
4337        } else {        } else {
4338                    
4339          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4340          $self->{read_until}->($self->{ct}->{sysid}, q[">],          $self->{read_until}->($self->{ct}->{sysid}, q[">],
4341                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4342    
# Line 3362  sub _get_next_token ($) { Line 4370  sub _get_next_token ($) {
4370      }      }
4371        
4372          redo A;          redo A;
4373        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4374                    
4375          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4376    
4377          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4378            $self->{s_kwd} = '';
4379                    
4380      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4381        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3384  sub _get_next_token ($) { Line 4393  sub _get_next_token ($) {
4393    
4394          redo A;          redo A;
4395        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4396          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4397    
4398          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4399          ## reconsume            
4400              $self->{state} = DATA_STATE;
4401          $self->{ct}->{quirks} = 1;            $self->{s_kwd} = '';
4402          return  ($self->{ct}); # DOCTYPE            $self->{ct}->{quirks} = 1;
4403            } else {
4404              
4405              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4406            }
4407    
4408            ## reconsume
4409            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4410          redo A;          redo A;
4411        } else {        } else {
4412                    
4413          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4414          $self->{read_until}->($self->{ct}->{sysid}, q['>],          $self->{read_until}->($self->{ct}->{sysid}, q['>],
4415                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4416    
# Line 3417  sub _get_next_token ($) { Line 4430  sub _get_next_token ($) {
4430        }        }
4431      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4432        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4433                    if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4434          ## Stay in the state            
4435              $self->{state} = BEFORE_NDATA_STATE;
4436            } else {
4437              
4438              ## Stay in the state
4439            }
4440                    
4441      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4442        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3432  sub _get_next_token ($) { Line 4450  sub _get_next_token ($) {
4450        
4451          redo A;          redo A;
4452        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4453            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4454              
4455              $self->{state} = DATA_STATE;
4456              $self->{s_kwd} = '';
4457            } else {
4458              
4459              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4460            }
4461    
4462                    
4463          $self->{state} = DATA_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4464          $self->{line_prev} = $self->{line};
4465          $self->{column_prev} = $self->{column};
4466          $self->{column}++;
4467          $self->{nc}
4468              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4469        } else {
4470          $self->{set_nc}->($self);
4471        }
4472      
4473            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4474            redo A;
4475          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4476                   ($self->{nc} == 0x004E or # N
4477                    $self->{nc} == 0x006E)) { # n
4478            
4479            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4480            $self->{state} = NDATA_STATE;
4481            $self->{kwd} = chr $self->{nc};
4482                    
4483      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4484        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3445  sub _get_next_token ($) { Line 4490  sub _get_next_token ($) {
4490        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4491      }      }
4492        
4493            redo A;
4494          } elsif ($self->{nc} == -1) {
4495            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4496              
4497              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4498              $self->{state} = DATA_STATE;
4499              $self->{s_kwd} = '';
4500              $self->{ct}->{quirks} = 1;
4501            } else {
4502              
4503              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4504              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4505            }
4506    
4507            ## reconsume
4508            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4509            redo A;
4510          } elsif ($self->{is_xml} and
4511                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4512                   $self->{nc} == 0x005B) { # [
4513            
4514            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4515            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4516            $self->{in_subset} = 1;
4517            
4518        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4519          $self->{line_prev} = $self->{line};
4520          $self->{column_prev} = $self->{column};
4521          $self->{column}++;
4522          $self->{nc}
4523              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4524        } else {
4525          $self->{set_nc}->($self);
4526        }
4527      
4528          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4529            redo A;
4530          } else {
4531            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4532    
4533            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4534              
4535              #$self->{ct}->{quirks} = 1;
4536              $self->{state} = BOGUS_DOCTYPE_STATE;
4537            } else {
4538              
4539              $self->{state} = BOGUS_MD_STATE;
4540            }
4541    
4542            
4543        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4544          $self->{line_prev} = $self->{line};
4545          $self->{column_prev} = $self->{column};
4546          $self->{column}++;
4547          $self->{nc}
4548              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4549        } else {
4550          $self->{set_nc}->($self);
4551        }
4552      
4553            redo A;
4554          }
4555        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4556          if ($is_space->{$self->{nc}}) {
4557            
4558            ## Stay in the state.
4559            
4560        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4561          $self->{line_prev} = $self->{line};
4562          $self->{column_prev} = $self->{column};
4563          $self->{column}++;
4564          $self->{nc}
4565              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4566        } else {
4567          $self->{set_nc}->($self);
4568        }
4569      
4570            redo A;
4571          } elsif ($self->{nc} == 0x003E) { # >
4572            
4573            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4574            
4575        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4576          $self->{line_prev} = $self->{line};
4577          $self->{column_prev} = $self->{column};
4578          $self->{column}++;
4579          $self->{nc}
4580              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4581        } else {
4582          $self->{set_nc}->($self);
4583        }
4584      
4585            return  ($self->{ct}); # ENTITY
4586            redo A;
4587          } elsif ($self->{nc} == 0x004E or # N
4588                   $self->{nc} == 0x006E) { # n
4589            
4590            $self->{state} = NDATA_STATE;
4591            $self->{kwd} = chr $self->{nc};
4592            
4593        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4594          $self->{line_prev} = $self->{line};
4595          $self->{column_prev} = $self->{column};
4596          $self->{column}++;
4597          $self->{nc}
4598              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4599        } else {
4600          $self->{set_nc}->($self);
4601        }
4602      
4603          redo A;          redo A;
4604        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4605                    
4606          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4607          $self->{state} = DATA_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4608          ## reconsume          ## reconsume
4609            return  ($self->{ct}); # ENTITY
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4610          redo A;          redo A;
4611        } else {        } else {
4612                    
4613          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4614          #$self->{ct}->{quirks} = 1;          $self->{state} = BOGUS_MD_STATE;
   
         $self->{state} = BOGUS_DOCTYPE_STATE;  
4615                    
4616      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4617        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3482  sub _get_next_token ($) { Line 4629  sub _get_next_token ($) {
4629        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4630                    
4631          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4632            $self->{s_kwd} = '';
4633                    
4634      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4635        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3497  sub _get_next_token ($) { Line 4645  sub _get_next_token ($) {
4645          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4646    
4647          redo A;          redo A;
4648          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4649            
4650            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4651            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4652            $self->{in_subset} = 1;
4653            
4654        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4655          $self->{line_prev} = $self->{line};
4656          $self->{column_prev} = $self->{column};
4657          $self->{column}++;
4658          $self->{nc}
4659              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4660        } else {
4661          $self->{set_nc}->($self);
4662        }
4663      
4664            return  ($self->{ct}); # DOCTYPE
4665            redo A;
4666        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4667                    
4668          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4669            $self->{s_kwd} = '';
4670          ## reconsume          ## reconsume
4671    
4672          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
# Line 3508  sub _get_next_token ($) { Line 4675  sub _get_next_token ($) {
4675        } else {        } else {
4676                    
4677          my $s = '';          my $s = '';
4678          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
4679    
4680          ## Stay in the state          ## Stay in the state
4681                    
# Line 3528  sub _get_next_token ($) { Line 4695  sub _get_next_token ($) {
4695        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
4696        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4697        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
4698    
4699          ## XML5: "CDATA state".
4700                
4701        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4702                    
# Line 3545  sub _get_next_token ($) { Line 4714  sub _get_next_token ($) {
4714        
4715          redo A;          redo A;
4716        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4717            if ($self->{is_xml}) {
4718              
4719              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4720            } else {
4721              
4722            }
4723    
4724          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4725                    $self->{s_kwd} = '';
4726      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {          ## Reconsume.
       $self->{line_prev} = $self->{line};  
       $self->{column_prev} = $self->{column};  
       $self->{column}++;  
       $self->{nc}  
           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);  
     } else {  
       $self->{set_nc}->($self);  
     }  
     
4727          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
4728                        
4729            return  ($self->{ct}); # character            return  ($self->{ct}); # character
# Line 3589  sub _get_next_token ($) { Line 4756  sub _get_next_token ($) {
4756    
4757        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
4758      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4759          ## XML5: "CDATA bracket state".
4760    
4761        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4762                    
4763          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 3606  sub _get_next_token ($) { Line 4775  sub _get_next_token ($) {
4775          redo A;          redo A;
4776        } else {        } else {
4777                    
4778            ## XML5: If EOF, "]" is not appended and changed to the data state.
4779          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
4780          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4781          ## Reconsume.          ## Reconsume.
4782          redo A;          redo A;
4783        }        }
4784      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4785          ## XML5: "CDATA end state".
4786    
4787        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4788          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4789            $self->{s_kwd} = '';
4790                    
4791      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4792        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3653  sub _get_next_token ($) { Line 4826  sub _get_next_token ($) {
4826                    
4827          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
4828          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
4829          ## Reconsume.          ## Reconsume. ## XML5: Emit.
4830          redo A;          redo A;
4831        }        }
4832      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 3662  sub _get_next_token ($) { Line 4835  sub _get_next_token ($) {
4835              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4836              $self->{entity_add} => 1,              $self->{entity_add} => 1,
4837            }->{$self->{nc}}) {            }->{$self->{nc}}) {
4838                    if ($self->{is_xml}) {
4839              
4840              $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4841                              line => $self->{line_prev},
4842                              column => $self->{column_prev}
4843                                  + ($self->{nc} == -1 ? 1 : 0));
4844            } else {
4845              
4846              ## No error
4847            }
4848          ## Don't consume          ## Don't consume
         ## No error  
4849          ## Return nothing.          ## Return nothing.
4850          #          #
4851        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
4852                    
4853          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
4854          $self->{s_kwd} = '#';          $self->{kwd} = '#';
4855                    
4856      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4857        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3683  sub _get_next_token ($) { Line 4864  sub _get_next_token ($) {
4864      }      }
4865        
4866          redo A;          redo A;
4867        } elsif ((0x0041 <= $self->{nc} and        } elsif ($self->{is_xml} or
4868                   (0x0041 <= $self->{nc} and
4869                  $self->{nc} <= 0x005A) or # A..Z                  $self->{nc} <= 0x005A) or # A..Z
4870                 (0x0061 <= $self->{nc} and                 (0x0061 <= $self->{nc} and
4871                  $self->{nc} <= 0x007A)) { # a..z                  $self->{nc} <= 0x007A)) { # a..z
4872                    
4873          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
4874          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
4875          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
4876          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
4877          $self->{entity__match} = 0;          $self->{entity__match} = 0;
4878                    
4879      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3721  sub _get_next_token ($) { Line 4903  sub _get_next_token ($) {
4903        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4904                    
4905          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4906            $self->{s_kwd} = '';
4907          ## Reconsume.          ## Reconsume.
4908          return  ({type => CHARACTER_TOKEN, data => '&',          return  ({type => CHARACTER_TOKEN, data => '&',
4909                    line => $self->{line_prev},                    line => $self->{line_prev},
# Line 3731  sub _get_next_token ($) { Line 4914  sub _get_next_token ($) {
4914                    
4915          $self->{ca}->{value} .= '&';          $self->{ca}->{value} .= '&';
4916          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4917            $self->{s_kwd} = '';
4918          ## Reconsume.          ## Reconsume.
4919          redo A;          redo A;
4920        }        }
4921      } elsif ($self->{state} == ENTITY_HASH_STATE) {      } elsif ($self->{state} == ENTITY_HASH_STATE) {
4922        if ($self->{nc} == 0x0078 or # x        if ($self->{nc} == 0x0078) { # x
4923            $self->{nc} == 0x0058) { # X          
4924            $self->{state} = HEXREF_X_STATE;
4925            $self->{kwd} .= chr $self->{nc};
4926            
4927        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4928          $self->{line_prev} = $self->{line};
4929          $self->{column_prev} = $self->{column};
4930          $self->{column}++;
4931          $self->{nc}
4932              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4933        } else {
4934          $self->{set_nc}->($self);
4935        }
4936      
4937            redo A;
4938          } elsif ($self->{nc} == 0x0058) { # X
4939                    
4940            if ($self->{is_xml}) {
4941              $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4942            }
4943          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
4944          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4945                    
4946      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4947        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3756  sub _get_next_token ($) { Line 4958  sub _get_next_token ($) {
4958                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
4959                    
4960          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
4961          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
4962                    
4963      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4964        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3781  sub _get_next_token ($) { Line 4983  sub _get_next_token ($) {
4983          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
4984                        
4985            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4986              $self->{s_kwd} = '';
4987            ## Reconsume.            ## Reconsume.
4988            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4989                      data => '&#',                      data => '&#',
# Line 3792  sub _get_next_token ($) { Line 4995  sub _get_next_token ($) {
4995                        
4996            $self->{ca}->{value} .= '&#';            $self->{ca}->{value} .= '&#';
4997            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4998              $self->{s_kwd} = '';
4999            ## Reconsume.            ## Reconsume.
5000            redo A;            redo A;
5001          }          }
# Line 3800  sub _get_next_token ($) { Line 5004  sub _get_next_token ($) {
5004        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
5005            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
5006                    
5007          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
5008          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
5009                    
5010          ## Stay in the state.          ## Stay in the state.
5011                    
# Line 3837  sub _get_next_token ($) { Line 5041  sub _get_next_token ($) {
5041          #          #
5042        }        }
5043    
5044        my $code = $self->{s_kwd};        my $code = $self->{kwd};
5045        my $l = $self->{line_prev};        my $l = $self->{line_prev};
5046        my $c = $self->{column_prev};        my $c = $self->{column_prev};
5047        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
5048              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5049              ($self->{is_xml} and $code == 0x0000)) {
5050                    
5051          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5052                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 3857  sub _get_next_token ($) { Line 5063  sub _get_next_token ($) {
5063        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
5064                    
5065          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
5066            $self->{s_kwd} = '';
5067          ## Reconsume.          ## Reconsume.
5068          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
5069                      has_reference => 1,
5070                    line => $l, column => $c,                    line => $l, column => $c,
5071                   });                   });
5072          redo A;          redo A;
# Line 3867  sub _get_next_token ($) { Line 5075  sub _get_next_token ($) {
5075          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
5076          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
5077          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
5078            $self->{s_kwd} = '';
5079          ## Reconsume.          ## Reconsume.
5080          redo A;          redo A;
5081        }        }
# Line 3877  sub _get_next_token ($) { Line 5086  sub _get_next_token ($) {
5086          # 0..9, A..F, a..f          # 0..9, A..F, a..f
5087                    
5088          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
5089          $self->{s_kwd} = 0;          $self->{kwd} = 0;
5090          ## Reconsume.          ## Reconsume.
5091          redo A;          redo A;
5092        } else {        } else {
# Line 3892  sub _get_next_token ($) { Line 5101  sub _get_next_token ($) {
5101          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
5102                        
5103            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
5104              $self->{s_kwd} = '';
5105            ## Reconsume.            ## Reconsume.
5106            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
5107                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
5108                      line => $self->{line_prev},                      line => $self->{line_prev},
5109                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
5110                     });                     });
5111            redo A;            redo A;
5112          } else {          } else {
5113                        
5114            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
5115            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
5116              $self->{s_kwd} = '';
5117            ## Reconsume.            ## Reconsume.
5118            redo A;            redo A;
5119          }          }
# Line 3911  sub _get_next_token ($) { Line 5122  sub _get_next_token ($) {
5122        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
5123          # 0..9          # 0..9
5124                    
5125          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
5126          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
5127          ## Stay in the state.          ## Stay in the state.
5128                    
5129      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3929  sub _get_next_token ($) { Line 5140  sub _get_next_token ($) {
5140        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
5141                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
5142                    
5143          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
5144          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
5145          ## Stay in the state.          ## Stay in the state.
5146                    
5147      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3947  sub _get_next_token ($) { Line 5158  sub _get_next_token ($) {
5158        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
5159                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
5160                    
5161          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
5162          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
5163          ## Stay in the state.          ## Stay in the state.
5164                    
5165      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3985  sub _get_next_token ($) { Line 5196  sub _get_next_token ($) {
5196          #          #
5197        }        }
5198    
5199        my $code = $self->{s_kwd};        my $code = $self->{kwd};
5200        my $l = $self->{line_prev};        my $l = $self->{line_prev};
5201        my $c = $self->{column_prev};        my $c = $self->{column_prev};
5202        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
5203              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5204              ($self->{is_xml} and $code == 0x0000)) {
5205                    
5206          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5207                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 4005  sub _get_next_token ($) { Line 5218  sub _get_next_token ($) {
5218        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
5219                    
5220          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
5221            $self->{s_kwd} = '';
5222          ## Reconsume.          ## Reconsume.
5223          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
5224                      has_reference => 1,
5225                    line => $l, column => $c,                    line => $l, column => $c,
5226                   });                   });
5227          redo A;          redo A;
# Line 4015  sub _get_next_token ($) { Line 5230  sub _get_next_token ($) {
5230          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
5231          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
5232          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
5233            $self->{s_kwd} = '';
5234          ## Reconsume.          ## Reconsume.
5235          redo A;          redo A;
5236        }        }
5237      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
5238        if (length $self->{s_kwd} < 30 and        if ((0x0041 <= $self->{nc} and # a
5239            ## NOTE: Some number greater than the maximum length of entity name             $self->{nc} <= 0x005A) or # x
5240            ((0x0041 <= $self->{nc} and # a            (0x0061 <= $self->{nc} and # a
5241              $self->{nc} <= 0x005A) or # x             $self->{nc} <= 0x007A) or # z
5242             (0x0061 <= $self->{nc} and # a            (0x0030 <= $self->{nc} and # 0
5243              $self->{nc} <= 0x007A) or # z             $self->{nc} <= 0x0039) or # 9
5244             (0x0030 <= $self->{nc} and # 0            $self->{nc} == 0x003B or # ;
5245              $self->{nc} <= 0x0039) or # 9            ($self->{is_xml} and
5246             $self->{nc} == 0x003B)) { # ;             not ($is_space->{$self->{nc}} or
5247                    {
5248                      0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5249                      $self->{entity_add} => 1,
5250                    }->{$self->{nc}}))) {
5251          our $EntityChar;          our $EntityChar;
5252          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
5253          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}} or
5254                $self->{ge}->{$self->{kwd}}) {
5255            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
5256                            if (defined $self->{ge}->{$self->{kwd}}) {
5257              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};                if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5258                    
5259                    $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5260                  } else {
5261                    if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5262                      
5263                      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5264                                      value => $self->{kwd});
5265                    } else {
5266                      
5267                    }
5268                    $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5269                  }
5270                } else {
5271                  if ($self->{is_xml}) {
5272                    
5273                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5274                                    value => $self->{kwd},
5275                                    level => {
5276                                              'amp;' => $self->{level}->{warn},
5277                                              'quot;' => $self->{level}->{warn},
5278                                              'lt;' => $self->{level}->{warn},
5279                                              'gt;' => $self->{level}->{warn},
5280                                              'apos;' => $self->{level}->{warn},
5281                                             }->{$self->{kwd}} ||
5282                                             $self->{level}->{must});
5283                  } else {
5284                    
5285                  }
5286                  $self->{entity__value} = $EntityChar->{$self->{kwd}};
5287                }
5288              $self->{entity__match} = 1;              $self->{entity__match} = 1;
5289                            
5290      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4049  sub _get_next_token ($) { Line 5300  sub _get_next_token ($) {
5300              #              #
5301            } else {            } else {
5302                            
5303              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
5304              $self->{entity__match} = -1;              $self->{entity__match} = -1;
5305              ## Stay in the state.              ## Stay in the state.
5306                            
# Line 4097  sub _get_next_token ($) { Line 5348  sub _get_next_token ($) {
5348          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
5349              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
5350                        
5351            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
5352            #            #
5353          } else {          } else {
5354                        
# Line 4109  sub _get_next_token ($) { Line 5360  sub _get_next_token ($) {
5360                    
5361          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5362                          line => $self->{line_prev},                          line => $self->{line_prev},
5363                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
5364          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
5365          #          #
5366        }        }
5367        
# Line 4127  sub _get_next_token ($) { Line 5378  sub _get_next_token ($) {
5378        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
5379                    
5380          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
5381            $self->{s_kwd} = '';
5382          ## Reconsume.          ## Reconsume.
5383          return  ({type => CHARACTER_TOKEN,          return  ({type => CHARACTER_TOKEN,
5384                    data => $data,                    data => $data,
5385                      has_reference => $has_ref,
5386                    line => $self->{line_prev},                    line => $self->{line_prev},
5387                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
5388                   });                   });
5389          redo A;          redo A;
5390        } else {        } else {
# Line 4139  sub _get_next_token ($) { Line 5392  sub _get_next_token ($) {
5392          $self->{ca}->{value} .= $data;          $self->{ca}->{value} .= $data;
5393          $self->{ca}->{has_reference} = 1 if $has_ref;          $self->{ca}->{has_reference} = 1 if $has_ref;
5394          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
5395            $self->{s_kwd} = '';
5396            ## Reconsume.
5397            redo A;
5398          }
5399    
5400        ## XML-only states
5401    
5402        } elsif ($self->{state} == PI_STATE) {
5403          ## XML5: "Pi state" and "DOCTYPE pi state".
5404    
5405          if ($is_space->{$self->{nc}} or
5406              $self->{nc} == 0x003F or # ?
5407              $self->{nc} == -1) {
5408            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5409            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
5410            ## "DOCTYPE pi state": Parse error, switch to the "data
5411            ## state".
5412            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5413                            line => $self->{line_prev},
5414                            column => $self->{column_prev}
5415                                - 1 * ($self->{nc} != -1));
5416            $self->{state} = BOGUS_COMMENT_STATE;
5417            ## Reconsume.
5418            $self->{ct} = {type => COMMENT_TOKEN,
5419                           data => '?',
5420                           line => $self->{line_prev},
5421                           column => $self->{column_prev}
5422                               - 1 * ($self->{nc} != -1),
5423                          };
5424            redo A;
5425          } else {
5426            ## XML5: "DOCTYPE pi state": Stay in the state.
5427            $self->{ct} = {type => PI_TOKEN,
5428                           target => chr $self->{nc},
5429                           data => '',
5430                           line => $self->{line_prev},
5431                           column => $self->{column_prev} - 1,
5432                          };
5433            $self->{state} = PI_TARGET_STATE;
5434            
5435        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5436          $self->{line_prev} = $self->{line};
5437          $self->{column_prev} = $self->{column};
5438          $self->{column}++;
5439          $self->{nc}
5440              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5441        } else {
5442          $self->{set_nc}->($self);
5443        }
5444      
5445            redo A;
5446          }
5447        } elsif ($self->{state} == PI_TARGET_STATE) {
5448          if ($is_space->{$self->{nc}}) {
5449            $self->{state} = PI_TARGET_AFTER_STATE;
5450            
5451        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5452          $self->{line_prev} = $self->{line};
5453          $self->{column_prev} = $self->{column};
5454          $self->{column}++;
5455          $self->{nc}
5456              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5457        } else {
5458          $self->{set_nc}->($self);
5459        }
5460      
5461            redo A;
5462          } elsif ($self->{nc} == -1) {
5463            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5464            if ($self->{in_subset}) {
5465              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5466            } else {
5467              $self->{state} = DATA_STATE;
5468              $self->{s_kwd} = '';
5469            }
5470            ## Reconsume.
5471            return  ($self->{ct}); # pi
5472            redo A;
5473          } elsif ($self->{nc} == 0x003F) { # ?
5474            $self->{state} = PI_AFTER_STATE;
5475            
5476        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5477          $self->{line_prev} = $self->{line};
5478          $self->{column_prev} = $self->{column};
5479          $self->{column}++;
5480          $self->{nc}
5481              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5482        } else {
5483          $self->{set_nc}->($self);
5484        }
5485      
5486            redo A;
5487          } else {
5488            ## XML5: typo ("tag name" -> "target")
5489            $self->{ct}->{target} .= chr $self->{nc}; # pi
5490            
5491        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5492          $self->{line_prev} = $self->{line};
5493          $self->{column_prev} = $self->{column};
5494          $self->{column}++;
5495          $self->{nc}
5496              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5497        } else {
5498          $self->{set_nc}->($self);
5499        }
5500      
5501            redo A;
5502          }
5503        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5504          if ($is_space->{$self->{nc}}) {
5505            ## Stay in the state.
5506            
5507        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5508          $self->{line_prev} = $self->{line};
5509          $self->{column_prev} = $self->{column};
5510          $self->{column}++;
5511          $self->{nc}
5512              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5513        } else {
5514          $self->{set_nc}->($self);
5515        }
5516      
5517            redo A;
5518          } else {
5519            $self->{state} = PI_DATA_STATE;
5520            ## Reprocess.
5521            redo A;
5522          }
5523        } elsif ($self->{state} == PI_DATA_STATE) {
5524          if ($self->{nc} == 0x003F) { # ?
5525            $self->{state} = PI_DATA_AFTER_STATE;
5526            
5527        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5528          $self->{line_prev} = $self->{line};
5529          $self->{column_prev} = $self->{column};
5530          $self->{column}++;
5531          $self->{nc}
5532              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5533        } else {
5534          $self->{set_nc}->($self);
5535        }
5536      
5537            redo A;
5538          } elsif ($self->{nc} == -1) {
5539            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5540            if ($self->{in_subset}) {
5541              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5542            } else {
5543              $self->{state} = DATA_STATE;
5544              $self->{s_kwd} = '';
5545            }
5546            ## Reprocess.
5547            return  ($self->{ct}); # pi
5548            redo A;
5549          } else {
5550            $self->{ct}->{data} .= chr $self->{nc}; # pi
5551            $self->{read_until}->($self->{ct}->{data}, q[?],
5552                                  length $self->{ct}->{data});
5553            ## Stay in the state.
5554            
5555        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5556          $self->{line_prev} = $self->{line};
5557          $self->{column_prev} = $self->{column};
5558          $self->{column}++;
5559          $self->{nc}
5560              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5561        } else {
5562          $self->{set_nc}->($self);
5563        }
5564      
5565            ## Reprocess.
5566            redo A;
5567          }
5568        } elsif ($self->{state} == PI_AFTER_STATE) {
5569          ## XML5: Part of "Pi after state".
5570    
5571          if ($self->{nc} == 0x003E) { # >
5572            if ($self->{in_subset}) {
5573              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5574            } else {
5575              $self->{state} = DATA_STATE;
5576              $self->{s_kwd} = '';
5577            }
5578            
5579        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5580          $self->{line_prev} = $self->{line};
5581          $self->{column_prev} = $self->{column};
5582          $self->{column}++;
5583          $self->{nc}
5584              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5585        } else {
5586          $self->{set_nc}->($self);
5587        }
5588      
5589            return  ($self->{ct}); # pi
5590            redo A;
5591          } elsif ($self->{nc} == 0x003F) { # ?
5592            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5593                            line => $self->{line_prev},
5594                            column => $self->{column_prev}); ## XML5: no error
5595            $self->{ct}->{data} .= '?';
5596            $self->{state} = PI_DATA_AFTER_STATE;
5597            
5598        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5599          $self->{line_prev} = $self->{line};
5600          $self->{column_prev} = $self->{column};
5601          $self->{column}++;
5602          $self->{nc}
5603              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5604        } else {
5605          $self->{set_nc}->($self);
5606        }
5607      
5608            redo A;
5609          } else {
5610            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5611                            line => $self->{line_prev},
5612                            column => $self->{column_prev}
5613                                + 1 * ($self->{nc} == -1)); ## XML5: no error
5614            $self->{ct}->{data} .= '?'; ## XML5: not appended
5615            $self->{state} = PI_DATA_STATE;
5616            ## Reprocess.
5617            redo A;
5618          }
5619        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
              ## Handles one character ($self->{nc}) in PI_DATA_AFTER_STATE:
              ##   ">"   - switch to DOCTYPE_INTERNAL_SUBSET_STATE when
              ##           $self->{in_subset} is true, otherwise to DATA_STATE
              ##           (clearing s_kwd); advance, then return the token
              ##           held in $self->{ct}.
              ##   "?"   - append "?" to the token data and stay here.
              ##   other - append "?" to the token data and switch to
              ##           PI_DATA_STATE without advancing, so the same
              ##           character is examined again in that state.
              ## NOTE(review): the "?" appended in the last two branches
              ## presumably stands for a "?" consumed before this state was
              ## entered -- confirm against the PI_DATA_STATE handler.
5620          ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5621    
5622          if ($self->{nc} == 0x003E) { # >
5623            if ($self->{in_subset}) {
5624              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5625            } else {
5626              $self->{state} = DATA_STATE;
5627              $self->{s_kwd} = '';
5628            }
5629            
            ## Advance: take the next code point from char_buffer when one is
            ## buffered, otherwise let the set_nc callback supply it.
5630        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5631          $self->{line_prev} = $self->{line};
5632          $self->{column_prev} = $self->{column};
5633          $self->{column}++;
5634          $self->{nc}
5635              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5636        } else {
5637          $self->{set_nc}->($self);
5638        }
5639      
5640            return  ($self->{ct}); # pi
                ## Not reached: follows the unconditional return above.
5641            redo A;
5642          } elsif ($self->{nc} == 0x003F) { # ?
5643            $self->{ct}->{data} .= '?';
5644            ## Stay in the state.
5645            
5646        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5647          $self->{line_prev} = $self->{line};
5648          $self->{column_prev} = $self->{column};
5649          $self->{column}++;
5650          $self->{nc}
5651              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5652        } else {
5653          $self->{set_nc}->($self);
5654        }
5655      
5656            redo A;
5657          } else {
5658            $self->{ct}->{data} .= '?'; ## XML5: not appended
5659            $self->{state} = PI_DATA_STATE;
5660            ## Reprocess.
5661            redo A;
5662          }
5663    
5664        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
              ## Handles one character in DOCTYPE_INTERNAL_SUBSET_STATE:
              ##   "<"   - switch to DOCTYPE_TAG_STATE.
              ##   "%"   - not implemented yet (see TODO); unless
              ##           stop_processing is already set or the document's
              ##           xml_standalone is true, report 'stop processing'
              ##           and set stop_processing.  The "%" is then skipped.
              ##   "]"   - clear in_subset and switch to
              ##           DOCTYPE_INTERNAL_SUBSET_AFTER_STATE.
              ##   space - skipped.
              ##   -1    - (presumably end of input -- confirm) report
              ##           'unclosed internal subset', clear in_subset, switch
              ##           to DATA_STATE and return an END_OF_DOCTYPE_TOKEN
              ##           without advancing.
              ##   other - report 'string in internal subset' once (guarded
              ##           by internal_subset_tainted) and skip the character.
5665          if ($self->{nc} == 0x003C) { # <
5666            $self->{state} = DOCTYPE_TAG_STATE;
5667            
            ## Advance: take the next code point from char_buffer when one is
            ## buffered, otherwise let the set_nc callback supply it.
5668        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5669          $self->{line_prev} = $self->{line};
5670          $self->{column_prev} = $self->{column};
5671          $self->{column}++;
5672          $self->{nc}
5673              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5674        } else {
5675          $self->{set_nc}->($self);
5676        }
5677      
5678            redo A;
5679          } elsif ($self->{nc} == 0x0025) { # %
5680            ## XML5: Not defined yet.
5681    
5682            ## TODO:
5683    
5684            if (not $self->{stop_processing} and
5685                not $self->{document}->xml_standalone) {
                  ## NOTE(review): `level' appears twice in this argument
                  ## list; if the handler builds a hash from its arguments
                  ## the later value (info) presumably wins -- confirm.
5686              $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5687                              level => $self->{level}->{info});
5688              $self->{stop_processing} = 1;
5689            }
5690    
5691            
5692        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5693          $self->{line_prev} = $self->{line};
5694          $self->{column_prev} = $self->{column};
5695          $self->{column}++;
5696          $self->{nc}
5697              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5698        } else {
5699          $self->{set_nc}->($self);
5700        }
5701      
5702            redo A;
5703          } elsif ($self->{nc} == 0x005D) { # ]
5704            delete $self->{in_subset};
5705            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5706            
5707        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5708          $self->{line_prev} = $self->{line};
5709          $self->{column_prev} = $self->{column};
5710          $self->{column}++;
5711          $self->{nc}
5712              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5713        } else {
5714          $self->{set_nc}->($self);
5715        }
5716      
5717            redo A;
5718          } elsif ($is_space->{$self->{nc}}) {
5719            ## Stay in the state.
5720            
5721        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5722          $self->{line_prev} = $self->{line};
5723          $self->{column_prev} = $self->{column};
5724          $self->{column}++;
5725          $self->{nc}
5726              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5727        } else {
5728          $self->{set_nc}->($self);
5729        }
5730      
5731            redo A;
5732          } elsif ($self->{nc} == -1) {
5733            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5734            delete $self->{in_subset};
5735            $self->{state} = DATA_STATE;
5736            $self->{s_kwd} = '';
5737            ## Reconsume.
5738            return  ({type => END_OF_DOCTYPE_TOKEN});
                ## Not reached: follows the unconditional return above.
5739            redo A;
5740          } else {
5741            unless ($self->{internal_subset_tainted}) {
5742              ## XML5: No parse error.
5743              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5744              $self->{internal_subset_tainted} = 1;
5745            }
5746            ## Stay in the state.
5747            
5748        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5749          $self->{line_prev} = $self->{line};
5750          $self->{column_prev} = $self->{column};
5751          $self->{column}++;
5752          $self->{nc}
5753              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5754        } else {
5755          $self->{set_nc}->($self);
5756        }
5757      
5758            redo A;
5759          }
5760        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
              ## Handles one character in DOCTYPE_INTERNAL_SUBSET_AFTER_STATE
              ## (the state the "]" branch of DOCTYPE_INTERNAL_SUBSET_STATE
              ## switches to):
              ##   ">"   - switch to DATA_STATE, advance, and return an
              ##           END_OF_DOCTYPE_TOKEN.
              ##   -1    - (presumably end of input -- confirm) report
              ##           'unclosed DOCTYPE', switch to DATA_STATE and return
              ##           an END_OF_DOCTYPE_TOKEN without advancing.
              ##   other - report 'string after internal subset', switch to
              ##           BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE, advance.
5761          if ($self->{nc} == 0x003E) { # >
5762            $self->{state} = DATA_STATE;
5763            $self->{s_kwd} = '';
5764            
            ## Advance: take the next code point from char_buffer when one is
            ## buffered, otherwise let the set_nc callback supply it.
5765        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5766          $self->{line_prev} = $self->{line};
5767          $self->{column_prev} = $self->{column};
5768          $self->{column}++;
5769          $self->{nc}
5770              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5771        } else {
5772          $self->{set_nc}->($self);
5773        }
5774      
5775            return  ({type => END_OF_DOCTYPE_TOKEN});
                ## Not reached: follows the unconditional return above.
5776            redo A;
5777          } elsif ($self->{nc} == -1) {
5778            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5779            $self->{state} = DATA_STATE;
5780            $self->{s_kwd} = '';
5781            ## Reconsume.
5782            return  ({type => END_OF_DOCTYPE_TOKEN});
                ## Not reached: follows the unconditional return above.
5783            redo A;
5784          } else {
5785            ## XML5: No parse error and stay in the state.
5786            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5787    
5788            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5789            
5790        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5791          $self->{line_prev} = $self->{line};
5792          $self->{column_prev} = $self->{column};
5793          $self->{column}++;
5794          $self->{nc}
5795              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5796        } else {
5797          $self->{set_nc}->($self);
5798        }
5799      
5800            redo A;
5801          }
5802        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
              ## Handles one character in
              ## BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE.  No parse error is
              ## reported from this handler; it skips characters until:
              ##   ">"   - switch to DATA_STATE, advance, and return an
              ##           END_OF_DOCTYPE_TOKEN.
              ##   -1    - (presumably end of input -- confirm) switch to
              ##           DATA_STATE and return an END_OF_DOCTYPE_TOKEN
              ##           without advancing.
5803          if ($self->{nc} == 0x003E) { # >
5804            $self->{state} = DATA_STATE;
5805            $self->{s_kwd} = '';
5806            
            ## Advance: take the next code point from char_buffer when one is
            ## buffered, otherwise let the set_nc callback supply it.
5807        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5808          $self->{line_prev} = $self->{line};
5809          $self->{column_prev} = $self->{column};
5810          $self->{column}++;
5811          $self->{nc}
5812              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5813        } else {
5814          $self->{set_nc}->($self);
5815        }
5816      
5817            return  ({type => END_OF_DOCTYPE_TOKEN});
                ## Not reached: follows the unconditional return above.
5818            redo A;
5819          } elsif ($self->{nc} == -1) {
5820            $self->{state} = DATA_STATE;
5821            $self->{s_kwd} = '';
5822            ## Reconsume.
5823            return  ({type => END_OF_DOCTYPE_TOKEN});
                ## Not reached: follows the unconditional return above.
5824            redo A;
5825          } else {
5826            ## Stay in the state.
5827            
5828        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5829          $self->{line_prev} = $self->{line};
5830          $self->{column_prev} = $self->{column};
5831          $self->{column}++;
5832          $self->{nc}
5833              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5834        } else {
5835          $self->{set_nc}->($self);
5836        }
5837      
5838            redo A;
5839          }
5840        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
              ## Handles the character after a "<" seen in
              ## DOCTYPE_INTERNAL_SUBSET_STATE:
              ##   "!"   - switch to DOCTYPE_MARKUP_DECLARATION_OPEN_STATE.
              ##   "?"   - switch to PI_STATE.
              ##   -1    - (presumably end of input -- confirm) report
              ##           'bare stago', switch to DATA_STATE and reprocess.
              ##   other - report 'bare stago' at the previous position,
              ##           start an empty COMMENT_TOKEN in $self->{ct},
              ##           switch to BOGUS_COMMENT_STATE and advance.
5841          if ($self->{nc} == 0x0021) { # !
5842            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5843            
            ## Advance: take the next code point from char_buffer when one is
            ## buffered, otherwise let the set_nc callback supply it.
5844        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5845          $self->{line_prev} = $self->{line};
5846          $self->{column_prev} = $self->{column};
5847          $self->{column}++;
5848          $self->{nc}
5849              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5850        } else {
5851          $self->{set_nc}->($self);
5852        }
5853      
5854            redo A;
5855          } elsif ($self->{nc} == 0x003F) { # ?
5856            $self->{state} = PI_STATE;
5857            
5858        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5859          $self->{line_prev} = $self->{line};
5860          $self->{column_prev} = $self->{column};
5861          $self->{column}++;
5862          $self->{nc}
5863              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5864        } else {
5865          $self->{set_nc}->($self);
5866        }
5867      
5868            redo A;
5869          } elsif ($self->{nc} == -1) {
                ## NOTE(review): unlike the -1 branch of
                ## DOCTYPE_INTERNAL_SUBSET_STATE, this branch neither deletes
                ## $self->{in_subset} nor returns an END_OF_DOCTYPE_TOKEN
                ## before switching to DATA_STATE -- confirm this is intended.
5870            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5871            $self->{state} = DATA_STATE;
5872            $self->{s_kwd} = '';
5873            ## Reconsume.
5874            redo A;
5875          } else {
5876            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5877                            line => $self->{line_prev},
5878                            column => $self->{column_prev});
5879            $self->{state} = BOGUS_COMMENT_STATE;
5880            $self->{ct} = {type => COMMENT_TOKEN,
5881                           data => '',
5882                          }; ## NOTE: Will be discarded.
5883            
5884        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5885          $self->{line_prev} = $self->{line};
5886          $self->{column_prev} = $self->{column};
5887          $self->{column}++;
5888          $self->{nc}
5889              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5890        } else {
5891          $self->{set_nc}->($self);
5892        }
5893      
5894            redo A;
5895          }
5896        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
              ## Handles the character after "<!" inside the internal subset
              ## and picks the keyword-matching state from its first letter:
              ##   "-"      - MD_HYPHEN_STATE.
              ##   "E"/"e"  - MD_E_STATE.
              ##   "A"/"a"  - MD_ATTLIST_STATE.
              ##   "N"/"n"  - MD_NOTATION_STATE.
              ## For the letter cases $self->{kwd} is started with the
              ## character as typed.  Any other character falls out of the
              ## if/elsif chain to the 'bogus comment' handling at the bottom,
              ## which does not advance.
5897          ## XML5: "DOCTYPE markup declaration state".
5898          
5899          if ($self->{nc} == 0x002D) { # -
5900            $self->{state} = MD_HYPHEN_STATE;
5901            
            ## Advance: take the next code point from char_buffer when one is
            ## buffered, otherwise let the set_nc callback supply it.
5902        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5903          $self->{line_prev} = $self->{line};
5904          $self->{column_prev} = $self->{column};
5905          $self->{column}++;
5906          $self->{nc}
5907              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5908        } else {
5909          $self->{set_nc}->($self);
5910        }
5911      
5912            redo A;
5913          } elsif ($self->{nc} == 0x0045 or # E
5914                   $self->{nc} == 0x0065) { # e
5915            $self->{state} = MD_E_STATE;
5916            $self->{kwd} = chr $self->{nc};
5917            
5918        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5919          $self->{line_prev} = $self->{line};
5920          $self->{column_prev} = $self->{column};
5921          $self->{column}++;
5922          $self->{nc}
5923              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5924        } else {
5925          $self->{set_nc}->($self);
5926        }
5927      
5928            redo A;
5929          } elsif ($self->{nc} == 0x0041 or # A
5930                   $self->{nc} == 0x0061) { # a
5931            $self->{state} = MD_ATTLIST_STATE;
5932            $self->{kwd} = chr $self->{nc};
5933            
5934        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5935          $self->{line_prev} = $self->{line};
5936          $self->{column_prev} = $self->{column};
5937          $self->{column}++;
5938          $self->{nc}
5939              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5940        } else {
5941          $self->{set_nc}->($self);
5942        }
5943      
5944            redo A;
5945          } elsif ($self->{nc} == 0x004E or # N
5946                   $self->{nc} == 0x006E) { # n
5947            $self->{state} = MD_NOTATION_STATE;
5948            $self->{kwd} = chr $self->{nc};
5949            
5950        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5951          $self->{line_prev} = $self->{line};
5952          $self->{column_prev} = $self->{column};
5953          $self->{column}++;
5954          $self->{nc}
5955              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5956        } else {
5957          $self->{set_nc}->($self);
5958        }
5959      
5960            redo A;
5961          } else {
5962            #
5963          }
5964          
              ## Reached only when none of the branches above matched (each of
              ## them ends in `redo A').
5965          ## XML5: No parse error.
5966          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5967                          line => $self->{line_prev},
5968                          column => $self->{column_prev} - 1);
5969          ## Reconsume.
5970          $self->{state} = BOGUS_COMMENT_STATE;
5971          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5972          redo A;
5973        } elsif ($self->{state} == MD_E_STATE) {
              ## Handles the second keyword letter after "<!E" / "<!e":
              ##   "N"/"n" - append to $self->{kwd}, go to MD_ENTITY_STATE.
              ##   "L"/"l" - append to $self->{kwd}, go to MD_ELEMENT_STATE.
              ##   other   - report 'bogus comment', start an empty
              ##             COMMENT_TOKEN in $self->{ct}, switch to
              ##             BOGUS_COMMENT_STATE and reprocess the character.
5974          if ($self->{nc} == 0x004E or # N
5975              $self->{nc} == 0x006E) { # n
5976            $self->{state} = MD_ENTITY_STATE;
5977            $self->{kwd} .= chr $self->{nc};
5978            
            ## Advance: take the next code point from char_buffer when one is
            ## buffered, otherwise let the set_nc callback supply it.
5979        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5980          $self->{line_prev} = $self->{line};
5981          $self->{column_prev} = $self->{column};
5982          $self->{column}++;
5983          $self->{nc}
5984              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5985        } else {
5986          $self->{set_nc}->($self);
5987        }
5988      
5989            redo A;
5990          } elsif ($self->{nc} == 0x004C or # L
5991                   $self->{nc} == 0x006C) { # l
5992            ## XML5: <!ELEMENT> not supported.
5993            $self->{state} = MD_ELEMENT_STATE;
5994            $self->{kwd} .= chr $self->{nc};
5995            
5996        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5997          $self->{line_prev} = $self->{line};
5998          $self->{column_prev} = $self->{column};
5999          $self->{column}++;
6000          $self->{nc}
6001              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6002        } else {
6003          $self->{set_nc}->($self);
6004        }
6005      
6006            redo A;
6007          } else {
6008            ## XML5: No parse error.
                ## The reported column is shifted right by one when nc is -1
                ## (the comparison contributes 1 when true).
6009            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6010                            line => $self->{line_prev},
6011                            column => $self->{column_prev} - 2
6012                                + 1 * ($self->{nc} == -1));
6013            ## Reconsume.
6014            $self->{state} = BOGUS_COMMENT_STATE;
6015            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6016            redo A;
6017          }
6018        } elsif ($self->{state} == MD_ENTITY_STATE) {
              ## Matches the remaining letters of the keyword ENTITY, accepting
              ## either case for each letter.  $self->{kwd} holds the letters
              ## accepted so far; its length indexes the two tables below
              ## (upper-case code points first, lower-case second) to find the
              ## letter expected next.
              ##   expected letter - append to kwd, advance, stay here.
              ##   "Y"/"y" with five letters held - keyword complete: report
              ##       'lowercase keyword' unless kwd is exactly 'ENTIT' and
              ##       the final letter is upper-case, start a
              ##       GENERAL_ENTITY_TOKEN in $self->{ct}, switch to
              ##       DOCTYPE_MD_STATE and advance.
              ##   other - report 'bogus comment', start an empty
              ##       COMMENT_TOKEN, switch to BOGUS_COMMENT_STATE, reprocess.
              ## NOTE(review): for a kwd length with no table entry the lookup
              ## yields undef, which `==' treats as 0; presumably nc is never
              ## 0 here -- confirm.
6019          if ($self->{nc} == [
6020                undef,
6021                undef,
6022                0x0054, # T
6023                0x0049, # I
6024                0x0054, # T
6025              ]->[length $self->{kwd}] or
6026              $self->{nc} == [
6027                undef,
6028                undef,
6029                0x0074, # t
6030                0x0069, # i
6031                0x0074, # t
6032              ]->[length $self->{kwd}]) {
6033            ## Stay in the state.
6034            $self->{kwd} .= chr $self->{nc};
6035            
            ## Advance: take the next code point from char_buffer when one is
            ## buffered, otherwise let the set_nc callback supply it.
6036        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6037          $self->{line_prev} = $self->{line};
6038          $self->{column_prev} = $self->{column};
6039          $self->{column}++;
6040          $self->{nc}
6041              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6042        } else {
6043          $self->{set_nc}->($self);
6044        }
6045      
6046            redo A;
6047          } elsif ((length $self->{kwd}) == 5 and
6048                   ($self->{nc} == 0x0059 or # Y
6049                    $self->{nc} == 0x0079)) { # y
6050            if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
6051              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6052                              text => 'ENTITY',
6053                              line => $self->{line_prev},
6054                              column => $self->{column_prev} - 4);
6055            }
6056            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
6057                           line => $self->{line_prev},
6058                           column => $self->{column_prev} - 6};
6059            $self->{state} = DOCTYPE_MD_STATE;
6060            
6061        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6062          $self->{line_prev} = $self->{line};
6063          $self->{column_prev} = $self->{column};
6064          $self->{column}++;
6065          $self->{nc}
6066              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6067        } else {
6068          $self->{set_nc}->($self);
6069        }
6070      
6071            redo A;
6072          } else {
                ## The reported column backs up over the letters held in kwd
                ## and is shifted right by one when nc is -1.
6073            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6074                            line => $self->{line_prev},
6075                            column => $self->{column_prev} - 1
6076                                - (length $self->{kwd})
6077                                + 1 * ($self->{nc} == -1));
6078            $self->{state} = BOGUS_COMMENT_STATE;
6079            ## Reconsume.
6080            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6081            redo A;
6082          }
6083        } elsif ($self->{state} == MD_ELEMENT_STATE) {
              ## Matches the remaining letters of the keyword ELEMENT, accepting
              ## either case for each letter.  The length of $self->{kwd}
              ## indexes the two tables below (upper-case code points first,
              ## lower-case second) to find the letter expected next.
              ##   expected letter - append to kwd, advance, stay here.
              ##   "T"/"t" with six letters held - keyword complete: report
              ##       'lowercase keyword' unless kwd is exactly 'ELEMEN' and
              ##       the final letter is upper-case, start an ELEMENT_TOKEN
              ##       in $self->{ct}, switch to DOCTYPE_MD_STATE and advance.
              ##   other - report 'bogus comment', start an empty
              ##       COMMENT_TOKEN, switch to BOGUS_COMMENT_STATE, reprocess.
6084          if ($self->{nc} == [
6085               undef,
6086               undef,
6087               0x0045, # E
6088               0x004D, # M
6089               0x0045, # E
6090               0x004E, # N
6091              ]->[length $self->{kwd}] or
6092              $self->{nc} == [
6093               undef,
6094               undef,
6095               0x0065, # e
6096               0x006D, # m
6097               0x0065, # e
6098               0x006E, # n
6099              ]->[length $self->{kwd}]) {
6100            ## Stay in the state.
6101            $self->{kwd} .= chr $self->{nc};
6102            
            ## Advance: take the next code point from char_buffer when one is
            ## buffered, otherwise let the set_nc callback supply it.
6103        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6104          $self->{line_prev} = $self->{line};
6105          $self->{column_prev} = $self->{column};
6106          $self->{column}++;
6107          $self->{nc}
6108              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6109        } else {
6110          $self->{set_nc}->($self);
6111        }
6112      
6113            redo A;
6114          } elsif ((length $self->{kwd}) == 6 and
6115                   ($self->{nc} == 0x0054 or # T
6116                    $self->{nc} == 0x0074)) { # t
6117            if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
6118              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6119                              text => 'ELEMENT',
6120                              line => $self->{line_prev},
6121                              column => $self->{column_prev} - 5);
6122            }
6123            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6124                           line => $self->{line_prev},
6125                           column => $self->{column_prev} - 7};
6126            $self->{state} = DOCTYPE_MD_STATE;
6127            
6128        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6129          $self->{line_prev} = $self->{line};
6130          $self->{column_prev} = $self->{column};
6131          $self->{column}++;
6132          $self->{nc}
6133              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6134        } else {
6135          $self->{set_nc}->($self);
6136        }
6137      
6138            redo A;
6139          } else {
                ## The reported column backs up over the letters held in kwd
                ## and is shifted right by one when nc is -1.
6140            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6141                            line => $self->{line_prev},
6142                            column => $self->{column_prev} - 1
6143                                - (length $self->{kwd})
6144                                + 1 * ($self->{nc} == -1));
6145            $self->{state} = BOGUS_COMMENT_STATE;
6146            ## Reconsume.
6147            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6148            redo A;
6149          }
6150        } elsif ($self->{state} == MD_ATTLIST_STATE) {
              ## Matches the remaining letters of the keyword ATTLIST, accepting
              ## either case for each letter.  The length of $self->{kwd}
              ## indexes the two tables below (upper-case code points first,
              ## lower-case second) to find the letter expected next.
              ##   expected letter - append to kwd, advance, stay here.
              ##   "T"/"t" with six letters held - keyword complete: report
              ##       'lowercase keyword' unless kwd is exactly 'ATTLIS' and
              ##       the final letter is upper-case, start an ATTLIST_TOKEN
              ##       (with an empty attrdefs list) in $self->{ct}, switch to
              ##       DOCTYPE_MD_STATE and advance.
              ##   other - report 'bogus comment', start an empty
              ##       COMMENT_TOKEN, switch to BOGUS_COMMENT_STATE, reprocess.
6151          if ($self->{nc} == [
6152               undef,
6153               0x0054, # T
6154               0x0054, # T
6155               0x004C, # L
6156               0x0049, # I
6157               0x0053, # S
6158              ]->[length $self->{kwd}] or
6159              $self->{nc} == [
6160               undef,
6161               0x0074, # t
6162               0x0074, # t
6163               0x006C, # l
6164               0x0069, # i
6165               0x0073, # s
6166              ]->[length $self->{kwd}]) {
6167            ## Stay in the state.
6168            $self->{kwd} .= chr $self->{nc};
6169            
            ## Advance: take the next code point from char_buffer when one is
            ## buffered, otherwise let the set_nc callback supply it.
6170        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6171          $self->{line_prev} = $self->{line};
6172          $self->{column_prev} = $self->{column};
6173          $self->{column}++;
6174          $self->{nc}
6175              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6176        } else {
6177          $self->{set_nc}->($self);
6178        }
6179      
6180            redo A;
6181          } elsif ((length $self->{kwd}) == 6 and
6182                   ($self->{nc} == 0x0054 or # T
6183                    $self->{nc} == 0x0074)) { # t
6184            if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6185              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6186                              text => 'ATTLIST',
6187                              line => $self->{line_prev},
6188                              column => $self->{column_prev} - 5);
6189            }
6190            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6191                           attrdefs => [],
6192                           line => $self->{line_prev},
6193                           column => $self->{column_prev} - 7};
6194            $self->{state} = DOCTYPE_MD_STATE;
6195            
6196        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6197          $self->{line_prev} = $self->{line};
6198          $self->{column_prev} = $self->{column};
6199          $self->{column}++;
6200          $self->{nc}
6201              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6202        } else {
6203          $self->{set_nc}->($self);
6204        }
6205      
6206            redo A;
6207          } else {
                ## The reported column backs up over the letters held in kwd
                ## and is shifted right by one when nc is -1.
6208            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6209                            line => $self->{line_prev},
6210                            column => $self->{column_prev} - 1
6211                                 - (length $self->{kwd})
6212                                 + 1 * ($self->{nc} == -1));
6213            $self->{state} = BOGUS_COMMENT_STATE;
6214            ## Reconsume.
6215            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6216            redo A;
6217          }
6218        } elsif ($self->{state} == MD_NOTATION_STATE) {
              ## Matches the remaining letters of the keyword NOTATION,
              ## accepting either case for each letter.  The length of
              ## $self->{kwd} indexes the two tables below (upper-case code
              ## points first, lower-case second) to find the letter expected
              ## next.
              ##   expected letter - append to kwd, advance, stay here.
              ##   "N"/"n" with seven letters held - keyword complete: report
              ##       'lowercase keyword' unless kwd is exactly 'NOTATIO' and
              ##       the final letter is upper-case, start a NOTATION_TOKEN
              ##       in $self->{ct}, switch to DOCTYPE_MD_STATE and advance.
              ##   other - report 'bogus comment', start an empty
              ##       COMMENT_TOKEN, switch to BOGUS_COMMENT_STATE, reprocess.
6219          if ($self->{nc} == [
6220               undef,
6221               0x004F, # O
6222               0x0054, # T
6223               0x0041, # A
6224               0x0054, # T
6225               0x0049, # I
6226               0x004F, # O
6227              ]->[length $self->{kwd}] or
6228              $self->{nc} == [
6229               undef,
6230               0x006F, # o
6231               0x0074, # t
6232               0x0061, # a
6233               0x0074, # t
6234               0x0069, # i
6235               0x006F, # o
6236              ]->[length $self->{kwd}]) {
6237            ## Stay in the state.
6238            $self->{kwd} .= chr $self->{nc};
6239            
            ## Advance: take the next code point from char_buffer when one is
            ## buffered, otherwise let the set_nc callback supply it.
6240        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6241          $self->{line_prev} = $self->{line};
6242          $self->{column_prev} = $self->{column};
6243          $self->{column}++;
6244          $self->{nc}
6245              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6246        } else {
6247          $self->{set_nc}->($self);
6248        }
6249      
6250            redo A;
6251          } elsif ((length $self->{kwd}) == 7 and
6252                   ($self->{nc} == 0x004E or # N
6253                    $self->{nc} == 0x006E)) { # n
6254            if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6255              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6256                              text => 'NOTATION',
6257                              line => $self->{line_prev},
6258                              column => $self->{column_prev} - 6);
6259            }
6260            $self->{ct} = {type => NOTATION_TOKEN, name => '',
6261                           line => $self->{line_prev},
6262                           column => $self->{column_prev} - 8};
6263            $self->{state} = DOCTYPE_MD_STATE;
6264            
6265        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6266          $self->{line_prev} = $self->{line};
6267          $self->{column_prev} = $self->{column};
6268          $self->{column}++;
6269          $self->{nc}
6270              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6271        } else {
6272          $self->{set_nc}->($self);
6273        }
6274      
6275            redo A;
6276          } else {
                ## The reported column backs up over the letters held in kwd
                ## and is shifted right by one when nc is -1.
6277            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6278                            line => $self->{line_prev},
6279                            column => $self->{column_prev} - 1
6280                                - (length $self->{kwd})
6281                                + 1 * ($self->{nc} == -1));
6282            $self->{state} = BOGUS_COMMENT_STATE;
6283            ## Reconsume.
6284            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6285            redo A;
6286          }
6287        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
              ## Handles the character that follows a completed ENTITY,
              ## ELEMENT, ATTLIST or NOTATION keyword (the keyword states
              ## switch here after storing the new token in $self->{ct}):
              ##   space - switch to BEFORE_MD_NAME_STATE and advance.
              ##   "%" while ct is a GENERAL_ENTITY_TOKEN - report
              ##           'no space before md name', switch to
              ##           DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE and advance.
              ##   -1    - (presumably end of input -- confirm) report
              ##           'unclosed md', switch to
              ##           DOCTYPE_INTERNAL_SUBSET_STATE and reprocess.
              ##   ">"   - report 'no md name', switch to
              ##           DOCTYPE_INTERNAL_SUBSET_STATE and advance.
              ##   other - report 'no space before md name', switch to
              ##           BEFORE_MD_NAME_STATE without advancing, so the same
              ##           character is examined again there.
6288          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6289          ## "DOCTYPE NOTATION state".
6290    
6291          if ($is_space->{$self->{nc}}) {
6292            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6293            $self->{state} = BEFORE_MD_NAME_STATE;
6294            
            ## Advance: take the next code point from char_buffer when one is
            ## buffered, otherwise let the set_nc callback supply it.
6295        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6296          $self->{line_prev} = $self->{line};
6297          $self->{column_prev} = $self->{column};
6298          $self->{column}++;
6299          $self->{nc}
6300              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6301        } else {
6302          $self->{set_nc}->($self);
6303        }
6304      
6305            redo A;
6306          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6307                   $self->{nc} == 0x0025) { # %
6308            ## XML5: Switch to the "DOCTYPE bogus comment state".
6309            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6310            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6311            
6312        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6313          $self->{line_prev} = $self->{line};
6314          $self->{column_prev} = $self->{column};
6315          $self->{column}++;
6316          $self->{nc}
6317              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6318        } else {
6319          $self->{set_nc}->($self);
6320        }
6321      
6322            redo A;
6323          } elsif ($self->{nc} == -1) {
6324            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6325            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6326            ## Reconsume.
6327            redo A;
6328          } elsif ($self->{nc} == 0x003E) { # >
6329            ## XML5: Switch to the "DOCTYPE bogus comment state".
6330            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6331            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6332            
6333        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6334          $self->{line_prev} = $self->{line};
6335          $self->{column_prev} = $self->{column};
6336          $self->{column}++;
6337          $self->{nc}
6338              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6339        } else {
6340          $self->{set_nc}->($self);
6341        }
6342      
6343            redo A;
6344          } else {
6345            ## XML5: Switch to the "DOCTYPE bogus comment state".
6346            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6347            $self->{state} = BEFORE_MD_NAME_STATE;
6348            redo A;
6349          }
6350        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6351          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6352          ## before state", "DOCTYPE ATTLIST name before state".
6353    
6354          if ($is_space->{$self->{nc}}) {
6355            ## Stay in the state.
6356            
6357        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6358          $self->{line_prev} = $self->{line};
6359          $self->{column_prev} = $self->{column};
6360          $self->{column}++;
6361          $self->{nc}
6362              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6363        } else {
6364          $self->{set_nc}->($self);
6365        }
6366      
6367            redo A;
6368          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6369                   $self->{nc} == 0x0025) { # %
6370            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6371            
6372        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6373          $self->{line_prev} = $self->{line};
6374          $self->{column_prev} = $self->{column};
6375          $self->{column}++;
6376          $self->{nc}
6377              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6378        } else {
6379          $self->{set_nc}->($self);
6380        }
6381      
6382            redo A;
6383          } elsif ($self->{nc} == 0x003E) { # >
6384            ## XML5: Same as "Anything else".
6385            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6386            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6387            
6388        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6389          $self->{line_prev} = $self->{line};
6390          $self->{column_prev} = $self->{column};
6391          $self->{column}++;
6392          $self->{nc}
6393              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6394        } else {
6395          $self->{set_nc}->($self);
6396        }
6397      
6398            redo A;
6399          } elsif ($self->{nc} == -1) {
6400            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6401            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6402            ## Reconsume.
6403            redo A;
6404          } else {
6405            ## XML5: [ATTLIST] Not defined yet.
6406            $self->{ct}->{name} .= chr $self->{nc};
6407            $self->{state} = MD_NAME_STATE;
6408            
6409        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6410          $self->{line_prev} = $self->{line};
6411          $self->{column_prev} = $self->{column};
6412          $self->{column}++;
6413          $self->{nc}
6414              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6415        } else {
6416          $self->{set_nc}->($self);
6417        }
6418      
6419            redo A;
6420          }
6421        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6422          if ($is_space->{$self->{nc}}) {
6423            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6424            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6425            $self->{state} = BEFORE_MD_NAME_STATE;
6426            
6427        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6428          $self->{line_prev} = $self->{line};
6429          $self->{column_prev} = $self->{column};
6430          $self->{column}++;
6431          $self->{nc}
6432              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6433        } else {
6434          $self->{set_nc}->($self);
6435        }
6436      
6437            redo A;
6438          } elsif ($self->{nc} == 0x003E) { # >
6439            ## XML5: Same as "Anything else".
6440            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6441            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6442            
6443        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6444          $self->{line_prev} = $self->{line};
6445          $self->{column_prev} = $self->{column};
6446          $self->{column}++;
6447          $self->{nc}
6448              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6449        } else {
6450          $self->{set_nc}->($self);
6451        }
6452      
6453            redo A;
6454          } elsif ($self->{nc} == -1) {
6455            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6456            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6457            ## Reconsume.
6458            redo A;
6459          } else {
6460            ## XML5: No parse error.
6461            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6462            $self->{state} = BOGUS_COMMENT_STATE;
6463            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6464            ## Reconsume.
6465            redo A;
6466          }
6467        } elsif ($self->{state} == MD_NAME_STATE) {
6468          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6469          
6470          if ($is_space->{$self->{nc}}) {
6471            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6472              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6473            } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6474              $self->{state} = AFTER_ELEMENT_NAME_STATE;
6475            } else { # ENTITY/NOTATION
6476              $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6477            }
6478            
6479        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6480          $self->{line_prev} = $self->{line};
6481          $self->{column_prev} = $self->{column};
6482          $self->{column}++;
6483          $self->{nc}
6484              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6485        } else {
6486          $self->{set_nc}->($self);
6487        }
6488      
6489            redo A;
6490          } elsif ($self->{nc} == 0x003E) { # >
6491            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6492              #
6493            } else {
6494              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6495            }
6496            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6497            
6498        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6499          $self->{line_prev} = $self->{line};
6500          $self->{column_prev} = $self->{column};
6501          $self->{column}++;
6502          $self->{nc}
6503              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6504        } else {
6505          $self->{set_nc}->($self);
6506        }
6507      
6508            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6509            redo A;
6510          } elsif ($self->{nc} == -1) {
6511            ## XML5: [ATTLIST] No parse error.
6512            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6513            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6514            ## Reconsume.
6515            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6516            redo A;
6517          } else {
6518            ## XML5: [ATTLIST] Not defined yet.
6519            $self->{ct}->{name} .= chr $self->{nc};
6520            ## Stay in the state.
6521            
6522        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6523          $self->{line_prev} = $self->{line};
6524          $self->{column_prev} = $self->{column};
6525          $self->{column}++;
6526          $self->{nc}
6527              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6528        } else {
6529          $self->{set_nc}->($self);
6530        }
6531      
6532            redo A;
6533          }
6534        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6535          if ($is_space->{$self->{nc}}) {
6536            ## Stay in the state.
6537            
6538        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6539          $self->{line_prev} = $self->{line};
6540          $self->{column_prev} = $self->{column};
6541          $self->{column}++;
6542          $self->{nc}
6543              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6544        } else {
6545          $self->{set_nc}->($self);
6546        }
6547      
6548            redo A;
6549          } elsif ($self->{nc} == 0x003E) { # >
6550            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6551            
6552        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6553          $self->{line_prev} = $self->{line};
6554          $self->{column_prev} = $self->{column};
6555          $self->{column}++;
6556          $self->{nc}
6557              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6558        } else {
6559          $self->{set_nc}->($self);
6560        }
6561      
6562            return  ($self->{ct}); # ATTLIST
6563            redo A;
6564          } elsif ($self->{nc} == -1) {
6565            ## XML5: No parse error.
6566            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6567            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6568            return  ($self->{ct});
6569            redo A;
6570          } else {
6571            ## XML5: Not defined yet.
6572            $self->{ca} = {name => chr ($self->{nc}), # attrdef
6573                           tokens => [],
6574                           line => $self->{line}, column => $self->{column}};
6575            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6576            
6577        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6578          $self->{line_prev} = $self->{line};
6579          $self->{column_prev} = $self->{column};
6580          $self->{column}++;
6581          $self->{nc}
6582              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6583        } else {
6584          $self->{set_nc}->($self);
6585        }
6586      
6587            redo A;
6588          }
6589        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6590          if ($is_space->{$self->{nc}}) {
6591            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6592            
6593        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6594          $self->{line_prev} = $self->{line};
6595          $self->{column_prev} = $self->{column};
6596          $self->{column}++;
6597          $self->{nc}
6598              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6599        } else {
6600          $self->{set_nc}->($self);
6601        }
6602      
6603            redo A;
6604          } elsif ($self->{nc} == 0x003E) { # >
6605            ## XML5: Same as "anything else".
6606            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6607            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6608            
6609        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6610          $self->{line_prev} = $self->{line};
6611          $self->{column_prev} = $self->{column};
6612          $self->{column}++;
6613          $self->{nc}
6614              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6615        } else {
6616          $self->{set_nc}->($self);
6617        }
6618      
6619            return  ($self->{ct}); # ATTLIST
6620            redo A;
6621          } elsif ($self->{nc} == 0x0028) { # (
6622            ## XML5: Same as "anything else".
6623            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6624            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6625            
6626        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6627          $self->{line_prev} = $self->{line};
6628          $self->{column_prev} = $self->{column};
6629          $self->{column}++;
6630          $self->{nc}
6631              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6632        } else {
6633          $self->{set_nc}->($self);
6634        }
6635      
6636            redo A;
6637          } elsif ($self->{nc} == -1) {
6638            ## XML5: No parse error.
6639            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6640            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6641            
6642        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6643          $self->{line_prev} = $self->{line};
6644          $self->{column_prev} = $self->{column};
6645          $self->{column}++;
6646          $self->{nc}
6647              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6648        } else {
6649          $self->{set_nc}->($self);
6650        }
6651      
6652            return  ($self->{ct}); # ATTLIST
6653            redo A;
6654          } else {
6655            ## XML5: Not defined yet.
6656            $self->{ca}->{name} .= chr $self->{nc};
6657            ## Stay in the state.
6658            
6659        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6660          $self->{line_prev} = $self->{line};
6661          $self->{column_prev} = $self->{column};
6662          $self->{column}++;
6663          $self->{nc}
6664              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6665        } else {
6666          $self->{set_nc}->($self);
6667        }
6668      
6669            redo A;
6670          }
6671        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6672          if ($is_space->{$self->{nc}}) {
6673            ## Stay in the state.
6674            
6675        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6676          $self->{line_prev} = $self->{line};
6677          $self->{column_prev} = $self->{column};
6678          $self->{column}++;
6679          $self->{nc}
6680              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6681        } else {
6682          $self->{set_nc}->($self);
6683        }
6684      
6685            redo A;
6686          } elsif ($self->{nc} == 0x003E) { # >
6687            ## XML5: Same as "anything else".
6688            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6689            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6690            
6691        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6692          $self->{line_prev} = $self->{line};
6693          $self->{column_prev} = $self->{column};
6694          $self->{column}++;
6695          $self->{nc}
6696              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6697        } else {
6698          $self->{set_nc}->($self);
6699        }
6700      
6701            return  ($self->{ct}); # ATTLIST
6702            redo A;
6703          } elsif ($self->{nc} == 0x0028) { # (
6704            ## XML5: Same as "anything else".
6705            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6706            
6707        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6708          $self->{line_prev} = $self->{line};
6709          $self->{column_prev} = $self->{column};
6710          $self->{column}++;
6711          $self->{nc}
6712              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6713        } else {
6714          $self->{set_nc}->($self);
6715        }
6716      
6717            redo A;
6718          } elsif ($self->{nc} == -1) {
6719            ## XML5: No parse error.
6720            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6721            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6722            
6723        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6724          $self->{line_prev} = $self->{line};
6725          $self->{column_prev} = $self->{column};
6726          $self->{column}++;
6727          $self->{nc}
6728              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6729        } else {
6730          $self->{set_nc}->($self);
6731        }
6732      
6733            return  ($self->{ct});
6734            redo A;
6735          } else {
6736            ## XML5: Not defined yet.
6737            $self->{ca}->{type} = chr $self->{nc};
6738            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6739            
6740        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6741          $self->{line_prev} = $self->{line};
6742          $self->{column_prev} = $self->{column};
6743          $self->{column}++;
6744          $self->{nc}
6745              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6746        } else {
6747          $self->{set_nc}->($self);
6748        }
6749      
6750            redo A;
6751          }
6752        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6753          if ($is_space->{$self->{nc}}) {
6754            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6755            
6756        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6757          $self->{line_prev} = $self->{line};
6758          $self->{column_prev} = $self->{column};
6759          $self->{column}++;
6760          $self->{nc}
6761              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6762        } else {
6763          $self->{set_nc}->($self);
6764        }
6765      
6766            redo A;
6767          } elsif ($self->{nc} == 0x0023) { # #
6768            ## XML5: Same as "anything else".
6769            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6770            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6771            
6772        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6773          $self->{line_prev} = $self->{line};
6774          $self->{column_prev} = $self->{column};
6775          $self->{column}++;
6776          $self->{nc}
6777              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6778        } else {
6779          $self->{set_nc}->($self);
6780        }
6781      
6782            redo A;
6783          } elsif ($self->{nc} == 0x0022) { # "
6784            ## XML5: Same as "anything else".
6785            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6786            $self->{ca}->{value} = '';
6787            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6788            
6789        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6790          $self->{line_prev} = $self->{line};
6791          $self->{column_prev} = $self->{column};
6792          $self->{column}++;
6793          $self->{nc}
6794              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6795        } else {
6796          $self->{set_nc}->($self);
6797        }
6798      
6799            redo A;
6800          } elsif ($self->{nc} == 0x0027) { # '
6801            ## XML5: Same as "anything else".
6802            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6803            $self->{ca}->{value} = '';
6804            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6805            
6806        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6807          $self->{line_prev} = $self->{line};
6808          $self->{column_prev} = $self->{column};
6809          $self->{column}++;
6810          $self->{nc}
6811              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6812        } else {
6813          $self->{set_nc}->($self);
6814        }
6815      
6816            redo A;
6817          } elsif ($self->{nc} == 0x003E) { # >
6818            ## XML5: Same as "anything else".
6819            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6820            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6821            
6822        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6823          $self->{line_prev} = $self->{line};
6824          $self->{column_prev} = $self->{column};
6825          $self->{column}++;
6826          $self->{nc}
6827              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6828        } else {
6829          $self->{set_nc}->($self);
6830        }
6831      
6832            return  ($self->{ct}); # ATTLIST
6833            redo A;
6834          } elsif ($self->{nc} == 0x0028) { # (
6835            ## XML5: Same as "anything else".
6836            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6837            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6838            
6839        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6840          $self->{line_prev} = $self->{line};
6841          $self->{column_prev} = $self->{column};
6842          $self->{column}++;
6843          $self->{nc}
6844              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6845        } else {
6846          $self->{set_nc}->($self);
6847        }
6848      
6849            redo A;
6850          } elsif ($self->{nc} == -1) {
6851            ## XML5: No parse error.
6852            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6853            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6854            
6855        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6856          $self->{line_prev} = $self->{line};
6857          $self->{column_prev} = $self->{column};
6858          $self->{column}++;
6859          $self->{nc}
6860              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6861        } else {
6862          $self->{set_nc}->($self);
6863        }
6864      
6865            return  ($self->{ct});
6866            redo A;
6867          } else {
6868            ## XML5: Not defined yet.
6869            $self->{ca}->{type} .= chr $self->{nc};
6870            ## Stay in the state.
6871            
6872        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6873          $self->{line_prev} = $self->{line};
6874          $self->{column_prev} = $self->{column};
6875          $self->{column}++;
6876          $self->{nc}
6877              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6878        } else {
6879          $self->{set_nc}->($self);
6880        }
6881      
6882            redo A;
6883          }
6884        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6885          if ($is_space->{$self->{nc}}) {
6886            ## Stay in the state.
6887            
6888        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6889          $self->{line_prev} = $self->{line};
6890          $self->{column_prev} = $self->{column};
6891          $self->{column}++;
6892          $self->{nc}
6893              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6894        } else {
6895          $self->{set_nc}->($self);
6896        }
6897      
6898            redo A;
6899          } elsif ($self->{nc} == 0x0028) { # (
6900            ## XML5: Same as "anything else".
6901            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6902            
6903        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6904          $self->{line_prev} = $self->{line};
6905          $self->{column_prev} = $self->{column};
6906          $self->{column}++;
6907          $self->{nc}
6908              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6909        } else {
6910          $self->{set_nc}->($self);
6911        }
6912      
6913            redo A;
6914          } elsif ($self->{nc} == 0x0023) { # #
6915            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6916            
6917        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6918          $self->{line_prev} = $self->{line};
6919          $self->{column_prev} = $self->{column};
6920          $self->{column}++;
6921          $self->{nc}
6922              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6923        } else {
6924          $self->{set_nc}->($self);
6925        }
6926      
6927            redo A;
6928          } elsif ($self->{nc} == 0x0022) { # "
6929            ## XML5: Same as "anything else".
6930            $self->{ca}->{value} = '';
6931            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6932            
6933        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6934          $self->{line_prev} = $self->{line};
6935          $self->{column_prev} = $self->{column};
6936          $self->{column}++;
6937          $self->{nc}
6938              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6939        } else {
6940          $self->{set_nc}->($self);
6941        }
6942      
6943            redo A;
6944          } elsif ($self->{nc} == 0x0027) { # '
6945            ## XML5: Same as "anything else".
6946            $self->{ca}->{value} = '';
6947            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6948            
6949        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6950          $self->{line_prev} = $self->{line};
6951          $self->{column_prev} = $self->{column};
6952          $self->{column}++;
6953          $self->{nc}
6954              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6955        } else {
6956          $self->{set_nc}->($self);
6957        }
6958      
6959            redo A;
6960          } elsif ($self->{nc} == 0x003E) { # >
6961            ## XML5: Same as "anything else".
6962            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6963            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6964            
6965        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6966          $self->{line_prev} = $self->{line};
6967          $self->{column_prev} = $self->{column};
6968          $self->{column}++;
6969          $self->{nc}
6970              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6971        } else {
6972          $self->{set_nc}->($self);
6973        }
6974      
6975            return  ($self->{ct}); # ATTLIST
6976            redo A;
6977          } elsif ($self->{nc} == -1) {
6978            ## XML5: No parse error.
6979            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6980            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6981            
6982        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6983          $self->{line_prev} = $self->{line};
6984          $self->{column_prev} = $self->{column};
6985          $self->{column}++;
6986          $self->{nc}
6987              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6988        } else {
6989          $self->{set_nc}->($self);
6990        }
6991      
6992            return  ($self->{ct});
6993            redo A;
6994          } else {
6995            ## XML5: Switch to the "DOCTYPE bogus comment state".
6996            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6997            $self->{ca}->{value} = '';
6998            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6999            ## Reconsume.
7000            redo A;
7001          }
7002        } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
7003          if ($is_space->{$self->{nc}}) {
7004            ## Stay in the state.
7005            
7006        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7007          $self->{line_prev} = $self->{line};
7008          $self->{column_prev} = $self->{column};
7009          $self->{column}++;
7010          $self->{nc}
7011              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7012        } else {
7013          $self->{set_nc}->($self);
7014        }
7015      
7016            redo A;
7017          } elsif ($self->{nc} == 0x007C) { # |
7018            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7019            ## Stay in the state.
7020            
7021        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7022          $self->{line_prev} = $self->{line};
7023          $self->{column_prev} = $self->{column};
7024          $self->{column}++;
7025          $self->{nc}
7026              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7027        } else {
7028          $self->{set_nc}->($self);
7029        }
7030      
7031            redo A;
7032          } elsif ($self->{nc} == 0x0029) { # )
7033            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7034            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7035            
7036        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7037          $self->{line_prev} = $self->{line};
7038          $self->{column_prev} = $self->{column};
7039          $self->{column}++;
7040          $self->{nc}
7041              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7042        } else {
7043          $self->{set_nc}->($self);
7044        }
7045      
7046            redo A;
7047          } elsif ($self->{nc} == 0x003E) { # >
7048            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7049            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7050            
7051        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7052          $self->{line_prev} = $self->{line};
7053          $self->{column_prev} = $self->{column};
7054          $self->{column}++;
7055          $self->{nc}
7056              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7057        } else {
7058          $self->{set_nc}->($self);
7059        }
7060      
7061            return  ($self->{ct}); # ATTLIST
7062            redo A;
7063          } elsif ($self->{nc} == -1) {
7064            ## XML5: No parse error.
7065            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7066            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7067            
7068        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7069          $self->{line_prev} = $self->{line};
7070          $self->{column_prev} = $self->{column};
7071          $self->{column}++;
7072          $self->{nc}
7073              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7074        } else {
7075          $self->{set_nc}->($self);
7076        }
7077      
7078            return  ($self->{ct});
7079            redo A;
7080          } else {
7081            push @{$self->{ca}->{tokens}}, chr $self->{nc};
7082            $self->{state} = ALLOWED_TOKEN_STATE;
7083            
7084        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7085          $self->{line_prev} = $self->{line};
7086          $self->{column_prev} = $self->{column};
7087          $self->{column}++;
7088          $self->{nc}
7089              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7090        } else {
7091          $self->{set_nc}->($self);
7092        }
7093      
7094            redo A;
7095          }
7096        } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
7097          if ($is_space->{$self->{nc}}) {
7098            $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
7099            
7100        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7101          $self->{line_prev} = $self->{line};
7102          $self->{column_prev} = $self->{column};
7103          $self->{column}++;
7104          $self->{nc}
7105              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7106        } else {
7107          $self->{set_nc}->($self);
7108        }
7109      
7110            redo A;
7111          } elsif ($self->{nc} == 0x007C) { # |
7112            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7113            
7114        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7115          $self->{line_prev} = $self->{line};
7116          $self->{column_prev} = $self->{column};
7117          $self->{column}++;
7118          $self->{nc}
7119              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7120        } else {
7121          $self->{set_nc}->($self);
7122        }
7123      
7124            redo A;
7125          } elsif ($self->{nc} == 0x0029) { # )
7126            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7127            
7128        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7129          $self->{line_prev} = $self->{line};
7130          $self->{column_prev} = $self->{column};
7131          $self->{column}++;
7132          $self->{nc}
7133              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7134        } else {
7135          $self->{set_nc}->($self);
7136        }
7137      
7138            redo A;
7139          } elsif ($self->{nc} == 0x003E) { # >
7140            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7141            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7142            
7143        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7144          $self->{line_prev} = $self->{line};
7145          $self->{column_prev} = $self->{column};
7146          $self->{column}++;
7147          $self->{nc}
7148              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7149        } else {
7150          $self->{set_nc}->($self);
7151        }
7152      
7153            return  ($self->{ct}); # ATTLIST
7154            redo A;
7155          } elsif ($self->{nc} == -1) {
7156            ## XML5: No parse error.
7157            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7158            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7159            
7160        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7161          $self->{line_prev} = $self->{line};
7162          $self->{column_prev} = $self->{column};
7163          $self->{column}++;
7164          $self->{nc}
7165              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7166        } else {
7167          $self->{set_nc}->($self);
7168        }
7169      
7170            return  ($self->{ct});
7171            redo A;
7172          } else {
7173            $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7174            ## Stay in the state.
7175            
7176        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7177          $self->{line_prev} = $self->{line};
7178          $self->{column_prev} = $self->{column};
7179          $self->{column}++;
7180          $self->{nc}
7181              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7182        } else {
7183          $self->{set_nc}->($self);
7184        }
7185      
7186            redo A;
7187          }
7188        } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7189          if ($is_space->{$self->{nc}}) {
7190            ## Stay in the state.
7191            
7192        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7193          $self->{line_prev} = $self->{line};
7194          $self->{column_prev} = $self->{column};
7195          $self->{column}++;
7196          $self->{nc}
7197              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7198        } else {
7199          $self->{set_nc}->($self);
7200        }
7201      
7202            redo A;
7203          } elsif ($self->{nc} == 0x007C) { # |
7204            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7205            
7206        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7207          $self->{line_prev} = $self->{line};
7208          $self->{column_prev} = $self->{column};
7209          $self->{column}++;
7210          $self->{nc}
7211              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7212        } else {
7213          $self->{set_nc}->($self);
7214        }
7215      
7216            redo A;
7217          } elsif ($self->{nc} == 0x0029) { # )
7218            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7219            
7220        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7221          $self->{line_prev} = $self->{line};
7222          $self->{column_prev} = $self->{column};
7223          $self->{column}++;
7224          $self->{nc}
7225              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7226        } else {
7227          $self->{set_nc}->($self);
7228        }
7229      
7230            redo A;
7231          } elsif ($self->{nc} == 0x003E) { # >
7232            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7233            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7234            
7235        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7236          $self->{line_prev} = $self->{line};
7237          $self->{column_prev} = $self->{column};
7238          $self->{column}++;
7239          $self->{nc}
7240              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7241        } else {
7242          $self->{set_nc}->($self);
7243        }
7244      
7245            return  ($self->{ct}); # ATTLIST
7246            redo A;
7247          } elsif ($self->{nc} == -1) {
7248            ## XML5: No parse error.
7249            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7250            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7251            
7252        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7253          $self->{line_prev} = $self->{line};
7254          $self->{column_prev} = $self->{column};
7255          $self->{column}++;
7256          $self->{nc}
7257              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7258        } else {
7259          $self->{set_nc}->($self);
7260        }
7261      
7262            return  ($self->{ct});
7263            redo A;
7264          } else {
7265            $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7266                            line => $self->{line_prev},
7267                            column => $self->{column_prev});
7268            $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7269            $self->{state} = ALLOWED_TOKEN_STATE;
7270            
7271        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7272          $self->{line_prev} = $self->{line};
7273          $self->{column_prev} = $self->{column};
7274          $self->{column}++;
7275          $self->{nc}
7276              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7277        } else {
7278          $self->{set_nc}->($self);
7279        }
7280      
7281            redo A;
7282          }
7283        } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7284          if ($is_space->{$self->{nc}}) {
7285            $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7286            
7287        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7288          $self->{line_prev} = $self->{line};
7289          $self->{column_prev} = $self->{column};
7290          $self->{column}++;
7291          $self->{nc}
7292              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7293        } else {
7294          $self->{set_nc}->($self);
7295        }
7296      
7297            redo A;
7298          } elsif ($self->{nc} == 0x0023) { # #
7299            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7300            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7301            
7302        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7303          $self->{line_prev} = $self->{line};
7304          $self->{column_prev} = $self->{column};
7305          $self->{column}++;
7306          $self->{nc}
7307              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7308        } else {
7309          $self->{set_nc}->($self);
7310        }
7311      
7312            redo A;
7313          } elsif ($self->{nc} == 0x0022) { # "
7314            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7315            $self->{ca}->{value} = '';
7316            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7317            
7318        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7319          $self->{line_prev} = $self->{line};
7320          $self->{column_prev} = $self->{column};
7321          $self->{column}++;
7322          $self->{nc}
7323              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7324        } else {
7325          $self->{set_nc}->($self);
7326        }
7327      
7328            redo A;
7329          } elsif ($self->{nc} == 0x0027) { # '
7330            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7331            $self->{ca}->{value} = '';
7332            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7333            
7334        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7335          $self->{line_prev} = $self->{line};
7336          $self->{column_prev} = $self->{column};
7337          $self->{column}++;
7338          $self->{nc}
7339              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7340        } else {
7341          $self->{set_nc}->($self);
7342        }
7343      
7344            redo A;
7345          } elsif ($self->{nc} == 0x003E) { # >
7346            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7347            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7348            
7349        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7350          $self->{line_prev} = $self->{line};
7351          $self->{column_prev} = $self->{column};
7352          $self->{column}++;
7353          $self->{nc}
7354              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7355        } else {
7356          $self->{set_nc}->($self);
7357        }
7358      
7359            return  ($self->{ct}); # ATTLIST
7360            redo A;
7361          } elsif ($self->{nc} == -1) {
7362            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7363            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7364            
7365        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7366          $self->{line_prev} = $self->{line};
7367          $self->{column_prev} = $self->{column};
7368          $self->{column}++;
7369          $self->{nc}
7370              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7371        } else {
7372          $self->{set_nc}->($self);
7373        }
7374      
7375            return  ($self->{ct});
7376            redo A;
7377          } else {
7378            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7379            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7380            ## Reconsume.
7381            redo A;
7382          }
7383        } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7384          if ($is_space->{$self->{nc}}) {
7385            ## Stay in the state.
7386            
7387        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7388          $self->{line_prev} = $self->{line};
7389          $self->{column_prev} = $self->{column};
7390          $self->{column}++;
7391          $self->{nc}
7392              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7393        } else {
7394          $self->{set_nc}->($self);
7395        }
7396      
7397            redo A;
7398          } elsif ($self->{nc} == 0x0023) { # #
7399            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7400            
7401        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7402          $self->{line_prev} = $self->{line};
7403          $self->{column_prev} = $self->{column};
7404          $self->{column}++;
7405          $self->{nc}
7406              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7407        } else {
7408          $self->{set_nc}->($self);
7409        }
7410      
7411            redo A;
7412          } elsif ($self->{nc} == 0x0022) { # "
7413            $self->{ca}->{value} = '';
7414            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7415            
7416        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7417          $self->{line_prev} = $self->{line};
7418          $self->{column_prev} = $self->{column};
7419          $self->{column}++;
7420          $self->{nc}
7421              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7422        } else {
7423          $self->{set_nc}->($self);
7424        }
7425      
7426            redo A;
7427          } elsif ($self->{nc} == 0x0027) { # '
7428            $self->{ca}->{value} = '';
7429            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7430            
7431        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7432          $self->{line_prev} = $self->{line};
7433          $self->{column_prev} = $self->{column};
7434          $self->{column}++;
7435          $self->{nc}
7436              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7437        } else {
7438          $self->{set_nc}->($self);
7439        }
7440      
7441            redo A;
7442          } elsif ($self->{nc} == 0x003E) { # >
7443            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7444            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7445            
7446        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7447          $self->{line_prev} = $self->{line};
7448          $self->{column_prev} = $self->{column};
7449          $self->{column}++;
7450          $self->{nc}
7451              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7452        } else {
7453          $self->{set_nc}->($self);
7454        }
7455      
7456            return  ($self->{ct}); # ATTLIST
7457            redo A;
7458          } elsif ($self->{nc} == -1) {
7459            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7460            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7461            
7462        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7463          $self->{line_prev} = $self->{line};
7464          $self->{column_prev} = $self->{column};
7465          $self->{column}++;
7466          $self->{nc}
7467              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7468        } else {
7469          $self->{set_nc}->($self);
7470        }
7471      
7472            return  ($self->{ct});
7473            redo A;
7474          } else {
7475            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7476            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7477            ## Reconsume.
7478            redo A;
7479          }
7480        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7481          if ($is_space->{$self->{nc}}) {
7482            ## XML5: No parse error.
7483            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7484            $self->{state} = BOGUS_MD_STATE;
7485            ## Reconsume.
7486            redo A;
7487          } elsif ($self->{nc} == 0x0022) { # "
7488            ## XML5: Same as "anything else".
7489            $self->{ca}->{value} = '';
7490            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7491            
7492        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7493          $self->{line_prev} = $self->{line};
7494          $self->{column_prev} = $self->{column};
7495          $self->{column}++;
7496          $self->{nc}
7497              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7498        } else {
7499          $self->{set_nc}->($self);
7500        }
7501      
7502            redo A;
7503          } elsif ($self->{nc} == 0x0027) { # '
7504            ## XML5: Same as "anything else".
7505            $self->{ca}->{value} = '';
7506            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7507            
7508        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7509          $self->{line_prev} = $self->{line};
7510          $self->{column_prev} = $self->{column};
7511          $self->{column}++;
7512          $self->{nc}
7513              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7514        } else {
7515          $self->{set_nc}->($self);
7516        }
7517      
7518            redo A;
7519          } elsif ($self->{nc} == 0x003E) { # >
7520            ## XML5: Same as "anything else".
7521            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7522            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7523            
7524        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7525          $self->{line_prev} = $self->{line};
7526          $self->{column_prev} = $self->{column};
7527          $self->{column}++;
7528          $self->{nc}
7529              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7530        } else {
7531          $self->{set_nc}->($self);
7532        }
7533      
7534            return  ($self->{ct}); # ATTLIST
7535            redo A;
7536          } elsif ($self->{nc} == -1) {
7537            ## XML5: No parse error.
7538            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7539            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7540            
7541        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7542          $self->{line_prev} = $self->{line};
7543          $self->{column_prev} = $self->{column};
7544          $self->{column}++;
7545          $self->{nc}
7546              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7547        } else {
7548          $self->{set_nc}->($self);
7549        }
7550      
7551            return  ($self->{ct});
7552            redo A;
7553          } else {
7554            $self->{ca}->{default} = chr $self->{nc};
7555            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7556            
7557        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7558          $self->{line_prev} = $self->{line};
7559          $self->{column_prev} = $self->{column};
7560          $self->{column}++;
7561          $self->{nc}
7562              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7563        } else {
7564          $self->{set_nc}->($self);
7565        }
7566      
7567            redo A;
7568          }
7569        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7570          if ($is_space->{$self->{nc}}) {
7571            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7572            
7573        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7574          $self->{line_prev} = $self->{line};
7575          $self->{column_prev} = $self->{column};
7576          $self->{column}++;
7577          $self->{nc}
7578              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7579        } else {
7580          $self->{set_nc}->($self);
7581        }
7582      
7583            redo A;
7584          } elsif ($self->{nc} == 0x0022) { # "
7585            ## XML5: Same as "anything else".
7586            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7587            $self->{ca}->{value} = '';
7588            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7589            
7590        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7591          $self->{line_prev} = $self->{line};
7592          $self->{column_prev} = $self->{column};
7593          $self->{column}++;
7594          $self->{nc}
7595              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7596        } else {
7597          $self->{set_nc}->($self);
7598        }
7599      
7600            redo A;
7601          } elsif ($self->{nc} == 0x0027) { # '
7602            ## XML5: Same as "anything else".
7603            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7604            $self->{ca}->{value} = '';
7605            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7606            
7607        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7608          $self->{line_prev} = $self->{line};
7609          $self->{column_prev} = $self->{column};
7610          $self->{column}++;
7611          $self->{nc}
7612              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7613        } else {
7614          $self->{set_nc}->($self);
7615        }
7616      
7617            redo A;
7618          } elsif ($self->{nc} == 0x003E) { # >
7619            ## XML5: Same as "anything else".
7620            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7621            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7622            
7623        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7624          $self->{line_prev} = $self->{line};
7625          $self->{column_prev} = $self->{column};
7626          $self->{column}++;
7627          $self->{nc}
7628              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7629        } else {
7630          $self->{set_nc}->($self);
7631        }
7632      
7633            return  ($self->{ct}); # ATTLIST
7634            redo A;
7635          } elsif ($self->{nc} == -1) {
7636            ## XML5: No parse error.
7637            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7638            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7639            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7640            
7641        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7642          $self->{line_prev} = $self->{line};
7643          $self->{column_prev} = $self->{column};
7644          $self->{column}++;
7645          $self->{nc}
7646              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7647        } else {
7648          $self->{set_nc}->($self);
7649        }
7650      
7651            return  ($self->{ct});
7652            redo A;
7653          } else {
7654            $self->{ca}->{default} .= chr $self->{nc};
7655            ## Stay in the state.
7656            
7657        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7658          $self->{line_prev} = $self->{line};
7659          $self->{column_prev} = $self->{column};
7660          $self->{column}++;
7661          $self->{nc}
7662              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7663        } else {
7664          $self->{set_nc}->($self);
7665        }
7666      
7667            redo A;
7668          }
7669        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7670          if ($is_space->{$self->{nc}}) {
7671            ## Stay in the state.
7672            
7673        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7674          $self->{line_prev} = $self->{line};
7675          $self->{column_prev} = $self->{column};
7676          $self->{column}++;
7677          $self->{nc}
7678              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7679        } else {
7680          $self->{set_nc}->($self);
7681        }
7682      
7683            redo A;
7684          } elsif ($self->{nc} == 0x0022) { # "
7685            $self->{ca}->{value} = '';
7686            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7687            
7688        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7689          $self->{line_prev} = $self->{line};
7690          $self->{column_prev} = $self->{column};
7691          $self->{column}++;
7692          $self->{nc}
7693              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7694        } else {
7695          $self->{set_nc}->($self);
7696        }
7697      
7698            redo A;
7699          } elsif ($self->{nc} == 0x0027) { # '
7700            $self->{ca}->{value} = '';
7701            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7702            
7703        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7704          $self->{line_prev} = $self->{line};
7705          $self->{column_prev} = $self->{column};
7706          $self->{column}++;
7707          $self->{nc}
7708              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7709        } else {
7710          $self->{set_nc}->($self);
7711        }
7712      
7713            redo A;
7714          } elsif ($self->{nc} == 0x003E) { # >
7715            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7716            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7717            
7718        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7719          $self->{line_prev} = $self->{line};
7720          $self->{column_prev} = $self->{column};
7721          $self->{column}++;
7722          $self->{nc}
7723              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7724        } else {
7725          $self->{set_nc}->($self);
7726        }
7727      
7728            return  ($self->{ct}); # ATTLIST
7729            redo A;
7730          } elsif ($self->{nc} == -1) {
7731            ## XML5: No parse error.
7732            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7733            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7734            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7735            
7736        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7737          $self->{line_prev} = $self->{line};
7738          $self->{column_prev} = $self->{column};
7739          $self->{column}++;
7740          $self->{nc}
7741              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7742        } else {
7743          $self->{set_nc}->($self);
7744        }
7745      
7746            return  ($self->{ct});
7747            redo A;
7748          } else {
7749            ## XML5: Not defined yet.
7750            if ($self->{ca}->{default} eq 'FIXED') {
7751              $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7752            } else {
7753              push @{$self->{ct}->{attrdefs}}, $self->{ca};
7754              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7755            }
7756            ## Reconsume.
7757            redo A;
7758          }
7759        } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7760          if ($is_space->{$self->{nc}} or
7761              $self->{nc} == -1 or
7762              $self->{nc} == 0x003E) { # >
7763            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7764            ## Reconsume.
7765            redo A;
7766          } else {
7767            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7768            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7769            ## Reconsume.
7770            redo A;
7771          }
7772        } elsif ($self->{state} == NDATA_STATE) {
7773          ## ASCII case-insensitive
7774          if ($self->{nc} == [
7775                undef,
7776                0x0044, # D
7777                0x0041, # A
7778                0x0054, # T
7779              ]->[length $self->{kwd}] or
7780              $self->{nc} == [
7781                undef,
7782                0x0064, # d
7783                0x0061, # a
7784                0x0074, # t
7785              ]->[length $self->{kwd}]) {
7786            
7787            ## Stay in the state.
7788            $self->{kwd} .= chr $self->{nc};
7789            
7790        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7791          $self->{line_prev} = $self->{line};
7792          $self->{column_prev} = $self->{column};
7793          $self->{column}++;
7794          $self->{nc}
7795              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7796        } else {
7797          $self->{set_nc}->($self);
7798        }
7799      
7800            redo A;
7801          } elsif ((length $self->{kwd}) == 4 and
7802                   ($self->{nc} == 0x0041 or # A
7803                    $self->{nc} == 0x0061)) { # a
7804            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7805              
7806              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7807                              text => 'NDATA',
7808                              line => $self->{line_prev},
7809                              column => $self->{column_prev} - 4);
7810            } else {
7811              
7812            }
7813            $self->{state} = AFTER_NDATA_STATE;
7814            
7815        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7816          $self->{line_prev} = $self->{line};
7817          $self->{column_prev} = $self->{column};
7818          $self->{column}++;
7819          $self->{nc}
7820              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7821        } else {
7822          $self->{set_nc}->($self);
7823        }
7824      
7825            redo A;
7826          } else {
7827            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7828                            line => $self->{line_prev},
7829                            column => $self->{column_prev} + 1
7830                                - length $self->{kwd});
7831            
7832            $self->{state} = BOGUS_MD_STATE;
7833            ## Reconsume.
7834            redo A;
7835          }
7836        } elsif ($self->{state} == AFTER_NDATA_STATE) {
7837          if ($is_space->{$self->{nc}}) {
7838            $self->{state} = BEFORE_NOTATION_NAME_STATE;
7839            
7840        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7841          $self->{line_prev} = $self->{line};
7842          $self->{column_prev} = $self->{column};
7843          $self->{column}++;
7844          $self->{nc}
7845              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7846        } else {
7847          $self->{set_nc}->($self);
7848        }
7849      
7850            redo A;
7851          } elsif ($self->{nc} == 0x003E) { # >
7852            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7853            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7854            
7855        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7856          $self->{line_prev} = $self->{line};
7857          $self->{column_prev} = $self->{column};
7858          $self->{column}++;
7859          $self->{nc}
7860              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7861        } else {
7862          $self->{set_nc}->($self);
7863        }
7864      
7865            return  ($self->{ct}); # ENTITY
7866            redo A;
7867          } elsif ($self->{nc} == -1) {
7868            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7869            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7870            
7871        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7872          $self->{line_prev} = $self->{line};
7873          $self->{column_prev} = $self->{column};
7874          $self->{column}++;
7875          $self->{nc}
7876              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7877        } else {
7878          $self->{set_nc}->($self);
7879        }
7880      
7881            return  ($self->{ct}); # ENTITY
7882            redo A;
7883          } else {
7884            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7885                            line => $self->{line_prev},
7886                            column => $self->{column_prev} + 1
7887                                - length $self->{kwd});
7888            $self->{state} = BOGUS_MD_STATE;
7889          ## Reconsume.          ## Reconsume.
7890          redo A;          redo A;
7891        }        }
7892        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7893          if ($is_space->{$self->{nc}}) {
7894            ## Stay in the state.
7895            
7896        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7897          $self->{line_prev} = $self->{line};
7898          $self->{column_prev} = $self->{column};
7899          $self->{column}++;
7900          $self->{nc}
7901              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7902        } else {
7903          $self->{set_nc}->($self);
7904        }
7905      
7906            redo A;
7907          } elsif ($self->{nc} == 0x003E) { # >
7908            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7909            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7910            
7911        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7912          $self->{line_prev} = $self->{line};
7913          $self->{column_prev} = $self->{column};
7914          $self->{column}++;
7915          $self->{nc}
7916              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7917        } else {
7918          $self->{set_nc}->($self);
7919        }
7920      
7921            return  ($self->{ct}); # ENTITY
7922            redo A;
7923          } elsif ($self->{nc} == -1) {
7924            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7925            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7926            
7927        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7928          $self->{line_prev} = $self->{line};
7929          $self->{column_prev} = $self->{column};
7930          $self->{column}++;
7931          $self->{nc}
7932              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7933        } else {
7934          $self->{set_nc}->($self);
7935        }
7936      
7937            return  ($self->{ct}); # ENTITY
7938            redo A;
7939          } else {
7940            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7941            $self->{state} = NOTATION_NAME_STATE;
7942            
7943        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7944          $self->{line_prev} = $self->{line};
7945          $self->{column_prev} = $self->{column};
7946          $self->{column}++;
7947          $self->{nc}
7948              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7949        } else {
7950          $self->{set_nc}->($self);
7951        }
7952      
7953            redo A;
7954          }
7955        } elsif ($self->{state} == NOTATION_NAME_STATE) {
7956          if ($is_space->{$self->{nc}}) {
7957            $self->{state} = AFTER_MD_DEF_STATE;
7958            
7959        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7960          $self->{line_prev} = $self->{line};
7961          $self->{column_prev} = $self->{column};
7962          $self->{column}++;
7963          $self->{nc}
7964              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7965        } else {
7966          $self->{set_nc}->($self);
7967        }
7968      
7969            redo A;
7970          } elsif ($self->{nc} == 0x003E) { # >
7971            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7972            
7973        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7974          $self->{line_prev} = $self->{line};
7975          $self->{column_prev} = $self->{column};
7976          $self->{column}++;
7977          $self->{nc}
7978              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7979        } else {
7980          $self->{set_nc}->($self);
7981        }
7982      
7983            return  ($self->{ct}); # ENTITY
7984            redo A;
7985          } elsif ($self->{nc} == -1) {
7986            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7987            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7988            
7989        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7990          $self->{line_prev} = $self->{line};
7991          $self->{column_prev} = $self->{column};
7992          $self->{column}++;
7993          $self->{nc}
7994              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7995        } else {
7996          $self->{set_nc}->($self);
7997        }
7998      
7999            return  ($self->{ct}); # ENTITY
8000            redo A;
8001          } else {
8002            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
8003            ## Stay in the state.
8004            
8005        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8006          $self->{line_prev} = $self->{line};
8007          $self->{column_prev} = $self->{column};
8008          $self->{column}++;
8009          $self->{nc}
8010              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8011        } else {
8012          $self->{set_nc}->($self);
8013        }
8014      
8015            redo A;
8016          }
8017        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
8018          if ($self->{nc} == 0x0022) { # "
8019            $self->{state} = AFTER_MD_DEF_STATE;
8020            
8021        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8022          $self->{line_prev} = $self->{line};
8023          $self->{column_prev} = $self->{column};
8024          $self->{column}++;
8025          $self->{nc}
8026              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8027        } else {
8028          $self->{set_nc}->($self);
8029        }
8030      
8031            redo A;
8032          } elsif ($self->{nc} == 0x0026) { # &
8033            $self->{prev_state} = $self->{state};
8034            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8035            $self->{entity_add} = 0x0022; # "
8036            
8037        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8038          $self->{line_prev} = $self->{line};
8039          $self->{column_prev} = $self->{column};
8040          $self->{column}++;
8041          $self->{nc}
8042              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8043        } else {
8044          $self->{set_nc}->($self);
8045        }
8046      
8047            redo A;
8048    ## TODO: %
8049          } elsif ($self->{nc} == -1) {
8050            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8051            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8052            ## Reconsume.
8053            return  ($self->{ct}); # ENTITY
8054            redo A;
8055          } else {
8056            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8057            
8058        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8059          $self->{line_prev} = $self->{line};
8060          $self->{column_prev} = $self->{column};
8061          $self->{column}++;
8062          $self->{nc}
8063              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8064        } else {
8065          $self->{set_nc}->($self);
8066        }
8067      
8068            redo A;
8069          }
8070        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
8071          if ($self->{nc} == 0x0027) { # '
8072            $self->{state} = AFTER_MD_DEF_STATE;
8073            
8074        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8075          $self->{line_prev} = $self->{line};
8076          $self->{column_prev} = $self->{column};
8077          $self->{column}++;
8078          $self->{nc}
8079              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8080        } else {
8081          $self->{set_nc}->($self);
8082        }
8083      
8084            redo A;
8085          } elsif ($self->{nc} == 0x0026) { # &
8086            $self->{prev_state} = $self->{state};
8087            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8088            $self->{entity_add} = 0x0027; # '
8089            
8090        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8091          $self->{line_prev} = $self->{line};
8092          $self->{column_prev} = $self->{column};
8093          $self->{column}++;
8094          $self->{nc}
8095              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8096        } else {
8097          $self->{set_nc}->($self);
8098        }
8099      
8100            redo A;
8101    ## TODO: %
8102          } elsif ($self->{nc} == -1) {
8103            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8104            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8105            ## Reconsume.
8106            return  ($self->{ct}); # ENTITY
8107            redo A;
8108          } else {
8109            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8110            
8111        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8112          $self->{line_prev} = $self->{line};
8113          $self->{column_prev} = $self->{column};
8114          $self->{column}++;
8115          $self->{nc}
8116              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8117        } else {
8118          $self->{set_nc}->($self);
8119        }
8120      
8121            redo A;
8122          }
8123        } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
8124          if ($is_space->{$self->{nc}} or
8125              {
8126                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
8127                $self->{entity_add} => 1,
8128              }->{$self->{nc}}) {
8129            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8130                            line => $self->{line_prev},
8131                            column => $self->{column_prev}
8132                                + ($self->{nc} == -1 ? 1 : 0));
8133            ## Don't consume
8134            ## Return nothing.
8135            #
8136          } elsif ($self->{nc} == 0x0023) { # #
8137            $self->{ca} = $self->{ct};
8138            $self->{state} = ENTITY_HASH_STATE;
8139            $self->{kwd} = '#';
8140            
8141        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8142          $self->{line_prev} = $self->{line};
8143          $self->{column_prev} = $self->{column};
8144          $self->{column}++;
8145          $self->{nc}
8146              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8147        } else {
8148          $self->{set_nc}->($self);
8149        }
8150      
8151            redo A;
8152          } else {
8153            #
8154          }
8155    
8156          $self->{ct}->{value} .= '&';
8157          $self->{state} = $self->{prev_state};
8158          ## Reconsume.
8159          redo A;
8160        } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
8161          if ($is_space->{$self->{nc}}) {
8162            $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8163            
8164        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8165          $self->{line_prev} = $self->{line};
8166          $self->{column_prev} = $self->{column};
8167          $self->{column}++;
8168          $self->{nc}
8169              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8170        } else {
8171          $self->{set_nc}->($self);
8172        }
8173      
8174            redo A;
8175          } elsif ($self->{nc} == 0x0028) { # (
8176            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8177            $self->{ct}->{content} = ['('];
8178            $self->{group_depth} = 1;
8179            
8180        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8181          $self->{line_prev} = $self->{line};
8182          $self->{column_prev} = $self->{column};
8183          $self->{column}++;
8184          $self->{nc}
8185              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8186        } else {
8187          $self->{set_nc}->($self);
8188        }
8189      
8190            redo A;
8191          } elsif ($self->{nc} == 0x003E) { # >
8192            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8193            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8194            
8195        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8196          $self->{line_prev} = $self->{line};
8197          $self->{column_prev} = $self->{column};
8198          $self->{column}++;
8199          $self->{nc}
8200              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8201        } else {
8202          $self->{set_nc}->($self);
8203        }
8204      
8205            return  ($self->{ct}); # ELEMENT
8206            redo A;
8207          } elsif ($self->{nc} == -1) {
8208            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8209            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8210            
8211        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8212          $self->{line_prev} = $self->{line};
8213          $self->{column_prev} = $self->{column};
8214          $self->{column}++;
8215          $self->{nc}
8216              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8217        } else {
8218          $self->{set_nc}->($self);
8219        }
8220      
8221            return  ($self->{ct}); # ELEMENT
8222            redo A;
8223          } else {
8224            $self->{ct}->{content} = [chr $self->{nc}];
8225            $self->{state} = CONTENT_KEYWORD_STATE;
8226            
8227        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8228          $self->{line_prev} = $self->{line};
8229          $self->{column_prev} = $self->{column};
8230          $self->{column}++;
8231          $self->{nc}
8232              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8233        } else {
8234          $self->{set_nc}->($self);
8235        }
8236      
8237            redo A;
8238          }
8239        } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8240          if ($is_space->{$self->{nc}}) {
8241            $self->{state} = AFTER_MD_DEF_STATE;
8242            
8243        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8244          $self->{line_prev} = $self->{line};
8245          $self->{column_prev} = $self->{column};
8246          $self->{column}++;
8247          $self->{nc}
8248              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8249        } else {
8250          $self->{set_nc}->($self);
8251        }
8252      
8253            redo A;
8254          } elsif ($self->{nc} == 0x003E) { # >
8255            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8256            
8257        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8258          $self->{line_prev} = $self->{line};
8259          $self->{column_prev} = $self->{column};
8260          $self->{column}++;
8261          $self->{nc}
8262              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8263        } else {
8264          $self->{set_nc}->($self);
8265        }
8266      
8267            return  ($self->{ct}); # ELEMENT
8268            redo A;
8269          } elsif ($self->{nc} == -1) {
8270            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8271            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8272            
8273        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8274          $self->{line_prev} = $self->{line};
8275          $self->{column_prev} = $self->{column};
8276          $self->{column}++;
8277          $self->{nc}
8278              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8279        } else {
8280          $self->{set_nc}->($self);
8281        }
8282      
8283            return  ($self->{ct}); # ELEMENT
8284            redo A;
8285          } else {
8286            $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8287            ## Stay in the state.
8288            
8289        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8290          $self->{line_prev} = $self->{line};
8291          $self->{column_prev} = $self->{column};
8292          $self->{column}++;
8293          $self->{nc}
8294              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8295        } else {
8296          $self->{set_nc}->($self);
8297        }
8298      
8299            redo A;
8300          }
8301        } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8302          if ($is_space->{$self->{nc}}) {
8303            ## Stay in the state.
8304            
8305        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8306          $self->{line_prev} = $self->{line};
8307          $self->{column_prev} = $self->{column};
8308          $self->{column}++;
8309          $self->{nc}
8310              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8311        } else {
8312          $self->{set_nc}->($self);
8313        }
8314      
8315            redo A;
8316          } elsif ($self->{nc} == 0x0028) { # (
8317            $self->{group_depth}++;
8318            push @{$self->{ct}->{content}}, chr $self->{nc};
8319            ## Stay in the state.
8320            
8321        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8322          $self->{line_prev} = $self->{line};
8323          $self->{column_prev} = $self->{column};
8324          $self->{column}++;
8325          $self->{nc}
8326              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8327        } else {
8328          $self->{set_nc}->($self);
8329        }
8330      
8331            redo A;
8332          } elsif ($self->{nc} == 0x007C or # |
8333                   $self->{nc} == 0x002C) { # ,
8334            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8335            ## Stay in the state.
8336            
8337        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8338          $self->{line_prev} = $self->{line};
8339          $self->{column_prev} = $self->{column};
8340          $self->{column}++;
8341          $self->{nc}
8342              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8343        } else {
8344          $self->{set_nc}->($self);
8345        }
8346      
8347            redo A;
8348          } elsif ($self->{nc} == 0x0029) { # )
8349            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8350            push @{$self->{ct}->{content}}, chr $self->{nc};
8351            $self->{group_depth}--;
8352            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8353            
8354        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8355          $self->{line_prev} = $self->{line};
8356          $self->{column_prev} = $self->{column};
8357          $self->{column}++;
8358          $self->{nc}
8359              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8360        } else {
8361          $self->{set_nc}->($self);
8362        }
8363      
8364            redo A;
8365          } elsif ($self->{nc} == 0x003E) { # >
8366            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8367            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8368            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8369            
8370        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8371          $self->{line_prev} = $self->{line};
8372          $self->{column_prev} = $self->{column};
8373          $self->{column}++;
8374          $self->{nc}
8375              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8376        } else {
8377          $self->{set_nc}->($self);
8378        }
8379      
8380            return  ($self->{ct}); # ELEMENT
8381            redo A;
8382          } elsif ($self->{nc} == -1) {
8383            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8384            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8385            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8386            
8387        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8388          $self->{line_prev} = $self->{line};
8389          $self->{column_prev} = $self->{column};
8390          $self->{column}++;
8391          $self->{nc}
8392              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8393        } else {
8394          $self->{set_nc}->($self);
8395        }
8396      
8397            return  ($self->{ct}); # ELEMENT
8398            redo A;
8399          } else {
8400            push @{$self->{ct}->{content}}, chr $self->{nc};
8401            $self->{state} = CM_ELEMENT_NAME_STATE;
8402            
8403        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8404          $self->{line_prev} = $self->{line};
8405          $self->{column_prev} = $self->{column};
8406          $self->{column}++;
8407          $self->{nc}
8408              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8409        } else {
8410          $self->{set_nc}->($self);
8411        }
8412      
8413            redo A;
8414          }
8415        } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8416          if ($is_space->{$self->{nc}}) {
8417            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8418            
8419        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8420          $self->{line_prev} = $self->{line};
8421          $self->{column_prev} = $self->{column};
8422          $self->{column}++;
8423          $self->{nc}
8424              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8425        } else {
8426          $self->{set_nc}->($self);
8427        }
8428      
8429            redo A;
8430          } elsif ($self->{nc} == 0x002A or # *
8431                   $self->{nc} == 0x002B or # +
8432                   $self->{nc} == 0x003F) { # ?
8433            push @{$self->{ct}->{content}}, chr $self->{nc};
8434            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8435            
8436        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8437          $self->{line_prev} = $self->{line};
8438          $self->{column_prev} = $self->{column};
8439          $self->{column}++;
8440          $self->{nc}
8441              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8442        } else {
8443          $self->{set_nc}->($self);
8444        }
8445      
8446            redo A;
8447          } elsif ($self->{nc} == 0x007C or # |
8448                   $self->{nc} == 0x002C) { # ,
8449            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8450            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8451            
8452        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8453          $self->{line_prev} = $self->{line};
8454          $self->{column_prev} = $self->{column};
8455          $self->{column}++;
8456          $self->{nc}
8457              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8458        } else {
8459          $self->{set_nc}->($self);
8460        }
8461      
8462            redo A;
8463          } elsif ($self->{nc} == 0x0029) { # )
8464            $self->{group_depth}--;
8465            push @{$self->{ct}->{content}}, chr $self->{nc};
8466            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8467            
8468        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8469          $self->{line_prev} = $self->{line};
8470          $self->{column_prev} = $self->{column};
8471          $self->{column}++;
8472          $self->{nc}
8473              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8474        } else {
8475          $self->{set_nc}->($self);
8476        }
8477      
8478            redo A;
8479          } elsif ($self->{nc} == 0x003E) { # >
8480            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8481            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8482            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8483            
8484        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8485          $self->{line_prev} = $self->{line};
8486          $self->{column_prev} = $self->{column};
8487          $self->{column}++;
8488          $self->{nc}
8489              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8490        } else {
8491          $self->{set_nc}->($self);
8492        }
8493      
8494            return  ($self->{ct}); # ELEMENT
8495            redo A;
8496          } elsif ($self->{nc} == -1) {
8497            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8498            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8499            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8500            
8501        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8502          $self->{line_prev} = $self->{line};
8503          $self->{column_prev} = $self->{column};
8504          $self->{column}++;
8505          $self->{nc}
8506              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8507        } else {
8508          $self->{set_nc}->($self);
8509        }
8510      
8511            return  ($self->{ct}); # ELEMENT
8512            redo A;
8513          } else {
8514            $self->{ct}->{content}->[-1] .= chr $self->{nc};
8515            ## Stay in the state.
8516            
8517        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8518          $self->{line_prev} = $self->{line};
8519          $self->{column_prev} = $self->{column};
8520          $self->{column}++;
8521          $self->{nc}
8522              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8523        } else {
8524          $self->{set_nc}->($self);
8525        }
8526      
8527            redo A;
8528          }
8529        } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8530          if ($is_space->{$self->{nc}}) {
8531            ## Stay in the state.
8532            
8533        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8534          $self->{line_prev} = $self->{line};
8535          $self->{column_prev} = $self->{column};
8536          $self->{column}++;
8537          $self->{nc}
8538              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8539        } else {
8540          $self->{set_nc}->($self);
8541        }
8542      
8543            redo A;
8544          } elsif ($self->{nc} == 0x007C or # |
8545                   $self->{nc} == 0x002C) { # ,
8546            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8547            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8548            
8549        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8550          $self->{line_prev} = $self->{line};
8551          $self->{column_prev} = $self->{column};
8552          $self->{column}++;
8553          $self->{nc}
8554              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8555        } else {
8556          $self->{set_nc}->($self);
8557        }
8558      
8559            redo A;
8560          } elsif ($self->{nc} == 0x0029) { # )
8561            $self->{group_depth}--;
8562            push @{$self->{ct}->{content}}, chr $self->{nc};
8563            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8564            
8565        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8566          $self->{line_prev} = $self->{line};
8567          $self->{column_prev} = $self->{column};
8568          $self->{column}++;
8569          $self->{nc}
8570              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8571        } else {
8572          $self->{set_nc}->($self);
8573        }
8574      
8575            redo A;
8576          } elsif ($self->{nc} == 0x003E) { # >
8577            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8578            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8579            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8580            
8581        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8582          $self->{line_prev} = $self->{line};
8583          $self->{column_prev} = $self->{column};
8584          $self->{column}++;
8585          $self->{nc}
8586              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8587        } else {
8588          $self->{set_nc}->($self);
8589        }
8590      
8591            return  ($self->{ct}); # ELEMENT
8592            redo A;
8593          } elsif ($self->{nc} == -1) {
8594            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8595            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8596            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8597            
8598        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8599          $self->{line_prev} = $self->{line};
8600          $self->{column_prev} = $self->{column};
8601          $self->{column}++;
8602          $self->{nc}
8603              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8604        } else {
8605          $self->{set_nc}->($self);
8606        }
8607      
8608            return  ($self->{ct}); # ELEMENT
8609            redo A;
8610          } else {
8611            $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8612            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8613            $self->{state} = BOGUS_MD_STATE;
8614            
8615        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8616          $self->{line_prev} = $self->{line};
8617          $self->{column_prev} = $self->{column};
8618          $self->{column}++;
8619          $self->{nc}
8620              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8621        } else {
8622          $self->{set_nc}->($self);
8623        }
8624      
8625            redo A;
8626          }
8627        } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8628          if ($is_space->{$self->{nc}}) {
8629            if ($self->{group_depth}) {
8630              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8631            } else {
8632              $self->{state} = AFTER_MD_DEF_STATE;
8633            }
8634            
8635        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8636          $self->{line_prev} = $self->{line};
8637          $self->{column_prev} = $self->{column};
8638          $self->{column}++;
8639          $self->{nc}
8640              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8641        } else {
8642          $self->{set_nc}->($self);
8643        }
8644      
8645            redo A;
8646          } elsif ($self->{nc} == 0x002A or # *
8647                   $self->{nc} == 0x002B or # +
8648                   $self->{nc} == 0x003F) { # ?
8649            push @{$self->{ct}->{content}}, chr $self->{nc};
8650            if ($self->{group_depth}) {
8651              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8652            } else {
8653              $self->{state} = AFTER_MD_DEF_STATE;
8654            }
8655            
8656        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8657          $self->{line_prev} = $self->{line};
8658          $self->{column_prev} = $self->{column};
8659          $self->{column}++;
8660          $self->{nc}
8661              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8662        } else {
8663          $self->{set_nc}->($self);
8664        }
8665      
8666            redo A;
8667          } elsif ($self->{nc} == 0x0029) { # )
8668            if ($self->{group_depth}) {
8669              $self->{group_depth}--;
8670              push @{$self->{ct}->{content}}, chr $self->{nc};
8671              ## Stay in the state.
8672              
8673        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8674          $self->{line_prev} = $self->{line};
8675          $self->{column_prev} = $self->{column};
8676          $self->{column}++;
8677          $self->{nc}
8678              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8679        } else {
8680          $self->{set_nc}->($self);
8681        }
8682      
8683              redo A;
8684            } else {
8685              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8686              $self->{state} = BOGUS_MD_STATE;
8687              ## Reconsume.
8688              redo A;
8689            }
8690          } elsif ($self->{nc} == 0x003E) { # >
8691            if ($self->{group_depth}) {
8692              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8693              push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8694            }
8695            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8696            
8697        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8698          $self->{line_prev} = $self->{line};
8699          $self->{column_prev} = $self->{column};
8700          $self->{column}++;
8701          $self->{nc}
8702              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8703        } else {
8704          $self->{set_nc}->($self);
8705        }
8706      
8707            return  ($self->{ct}); # ELEMENT
8708            redo A;
8709          } elsif ($self->{nc} == -1) {
8710            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8711            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8712            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8713            
8714        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8715          $self->{line_prev} = $self->{line};
8716          $self->{column_prev} = $self->{column};
8717          $self->{column}++;
8718          $self->{nc}
8719              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8720        } else {
8721          $self->{set_nc}->($self);
8722        }
8723      
8724            return  ($self->{ct}); # ELEMENT
8725            redo A;
8726          } else {
8727            if ($self->{group_depth}) {
8728              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8729            } else {
8730              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8731              $self->{state} = BOGUS_MD_STATE;
8732            }
8733            ## Reconsume.
8734            redo A;
8735          }
8736        } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8737          if ($is_space->{$self->{nc}}) {
8738            ## Stay in the state.
8739            
8740        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8741          $self->{line_prev} = $self->{line};
8742          $self->{column_prev} = $self->{column};
8743          $self->{column}++;
8744          $self->{nc}
8745              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8746        } else {
8747          $self->{set_nc}->($self);
8748        }
8749      
8750            redo A;
8751          } elsif ($self->{nc} == 0x003E) { # >
8752            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8753            
8754        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8755          $self->{line_prev} = $self->{line};
8756          $self->{column_prev} = $self->{column};
8757          $self->{column}++;
8758          $self->{nc}
8759              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8760        } else {
8761          $self->{set_nc}->($self);
8762        }
8763      
8764            return  ($self->{ct}); # ENTITY/ELEMENT
8765            redo A;
8766          } elsif ($self->{nc} == -1) {
8767            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8768            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8769            
8770        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8771          $self->{line_prev} = $self->{line};
8772          $self->{column_prev} = $self->{column};
8773          $self->{column}++;
8774          $self->{nc}
8775              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8776        } else {
8777          $self->{set_nc}->($self);
8778        }
8779      
8780            return  ($self->{ct}); # ENTITY/ELEMENT
8781            redo A;
8782          } else {
8783            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8784            $self->{state} = BOGUS_MD_STATE;
8785            ## Reconsume.
8786            redo A;
8787          }
8788        } elsif ($self->{state} == BOGUS_MD_STATE) {
8789          if ($self->{nc} == 0x003E) { # >
8790            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8791            
8792        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8793          $self->{line_prev} = $self->{line};
8794          $self->{column_prev} = $self->{column};
8795          $self->{column}++;
8796          $self->{nc}
8797              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8798        } else {
8799          $self->{set_nc}->($self);
8800        }
8801      
8802            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8803            redo A;
8804          } elsif ($self->{nc} == -1) {
8805            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8806            ## Reconsume.
8807            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8808            redo A;
8809          } else {
8810            ## Stay in the state.
8811            
8812        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8813          $self->{line_prev} = $self->{line};
8814          $self->{column_prev} = $self->{column};
8815          $self->{column}++;
8816          $self->{nc}
8817              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8818        } else {
8819          $self->{set_nc}->($self);
8820        }
8821      
8822            redo A;
8823          }
8824      } else {      } else {
8825        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
8826      }      }
# Line 4152  sub _get_next_token ($) { Line 8831  sub _get_next_token ($) {
8831    
8832  1;  1;
8833  ## $Date$  ## $Date$
8834                                    

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.32

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24