/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.4 by wakaba, Tue Oct 14 11:46:57 2008 UTC revision 1.20 by wakaba, Sun Oct 19 08:20:29 2008 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24    );    );
25        
26    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 33  BEGIN {
33        CHARACTER_TOKEN        CHARACTER_TOKEN
34        PI_TOKEN        PI_TOKEN
35        ABORT_TOKEN        ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42      )],      )],
43    );    );
44  }  }
45    
46    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48  ## Token types  ## Token types
49    
50  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
52  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
53  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
54  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
55  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
56  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
57  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65    ## XML5: XML5 has "empty tag token".  In this implementation, it is
66    ## represented as a start tag token with $self->{self_closing} flag
67    ## set to true.
68    
69    ## XML5: XML5 has "short end tag token".  In this implementation, it
70    ## is represented as an end tag token with $token->{tag_name} flag set
71    ## to an empty string.
72    
73  package Whatpm::HTML;  package Whatpm::HTML;
74    
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 142  sub HEXREF_HEX_STATE () { 48 }
142  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
143  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145    ## XML-only states
146    sub PI_STATE () { 51 }
147    sub PI_TARGET_STATE () { 52 }
148    sub PI_TARGET_AFTER_STATE () { 53 }
149    sub PI_DATA_STATE () { 54 }
150    sub PI_AFTER_STATE () { 55 }
151    sub PI_DATA_AFTER_STATE () { 56 }
152    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155    sub DOCTYPE_TAG_STATE () { 60 }
156    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157    sub MD_ATTLIST_STATE () { 62 }
158    sub MD_E_STATE () { 63 }
159    sub MD_ELEMENT_STATE () { 64 }
160    sub MD_ENTITY_STATE () { 65 }
161    sub MD_NOTATION_STATE () { 66 }
162    sub DOCTYPE_MD_STATE () { 67 }
163    sub BEFORE_MD_NAME_STATE () { 68 }
164    sub MD_NAME_STATE () { 69 }
165    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171    sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172    sub ALLOWED_TOKEN_STATE () { 77 }
173    sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174    sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175    sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179    sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180    sub BEFORE_NDATA_STATE () { 85 }
181    sub NDATA_STATE () { 86 }
182    sub AFTER_NDATA_STATE () { 87 }
183    sub BEFORE_NOTATION_NAME_STATE () { 88 }
184    sub NOTATION_NAME_STATE () { 89 }
185    sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186    sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187    sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188    sub AFTER_ELEMENT_NAME_STATE () { 93 }
189    sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190    sub CONTENT_KEYWORD_STATE () { 95 }
191    sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192    sub CM_ELEMENT_NAME_STATE () { 97 }
193    sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194    sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195    sub AFTER_MD_DEF_STATE () { 100 }
196    sub BOGUS_MD_STATE () { 101 }
197    
198  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
199  ## list and descriptions)  ## list and descriptions)
200    
# Line 178  sub _initialize_tokenizer ($) { Line 259  sub _initialize_tokenizer ($) {
259    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
260    
261    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
262    #$self->{s_kwd}; # state keyword - initialized when used    $self->{s_kwd} = ''; # Data state keyword
263      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
265    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
266    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 208  sub _initialize_tokenizer ($) { Line 290  sub _initialize_tokenizer ($) {
290    
291  ## A token has:  ## A token has:
292  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
295  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296    ##   ->{target} (PI_TOKEN)
297  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
298  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
299  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 218  sub _initialize_tokenizer ($) { Line 301  sub _initialize_tokenizer ($) {
301  ##        ->{name}  ##        ->{name}
302  ##        ->{value}  ##        ->{value}
303  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
304  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
305    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308    ##   ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
309    
310  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
311  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
312  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 238  my $is_space = { Line 326  my $is_space = {
326    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
327    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
328    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
329    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
330    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
331    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
332  };  };
# Line 362  sub _get_next_token ($) { Line 450  sub _get_next_token ($) {
450          }          }
451        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
452          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453            $self->{s_kwd} .= '-';            if ($self->{s_kwd} eq '<!-') {
             
           if ($self->{s_kwd} eq '<!--') {  
454                            
455              $self->{escape} = 1; # unless $self->{escape};              $self->{escape} = 1; # unless $self->{escape};
456              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
457              #              #
458            } elsif ($self->{s_kwd} eq '---') {            } elsif ($self->{s_kwd} eq '-') {
459                            
460              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
461              #              #
462              } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
463                
464                $self->{s_kwd} .= '-';
465                #
466            } else {            } else {
467                            
468                $self->{s_kwd} = '-';
469              #              #
470            }            }
471          }          }
# Line 420  sub _get_next_token ($) { Line 511  sub _get_next_token ($) {
511            if ($self->{s_kwd} eq '--') {            if ($self->{s_kwd} eq '--') {
512                            
513              delete $self->{escape};              delete $self->{escape};
514                #
515            } else {            } else {
516                            
517                #
518            }            }
519            } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
520              
521              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
522                              line => $self->{line_prev},
523                              column => $self->{column_prev} - 1);
524              #
525          } else {          } else {
526                        
527              #
528          }          }
529                    
530          $self->{s_kwd} = '';          $self->{s_kwd} = '';
531          #          #
532          } elsif ($self->{nc} == 0x005D) { # ]
533            if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
534              
535              $self->{s_kwd} .= ']';
536            } elsif ($self->{s_kwd} eq ']]') {
537              
538              #
539            } else {
540              
541              $self->{s_kwd} = '';
542            }
543            #
544        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
545                    
546          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 446  sub _get_next_token ($) { Line 558  sub _get_next_token ($) {
558                     data => chr $self->{nc},                     data => chr $self->{nc},
559                     line => $self->{line}, column => $self->{column},                     line => $self->{line}, column => $self->{column},
560                    };                    };
561        if ($self->{read_until}->($token->{data}, q[-!<>&],        if ($self->{read_until}->($token->{data}, q{-!<>&\]},
562                                  length $token->{data})) {                                  length $token->{data})) {
563          $self->{s_kwd} = '';          $self->{s_kwd} = '';
564        }        }
565    
566        ## Stay in the data state.        ## Stay in the data state.
567        if ($self->{content_model} == PCDATA_CONTENT_MODEL) {        if (not $self->{is_xml} and
568              $self->{content_model} == PCDATA_CONTENT_MODEL) {
569                    
570          $self->{state} = PCDATA_STATE;          $self->{state} = PCDATA_STATE;
571        } else {        } else {
# Line 473  sub _get_next_token ($) { Line 586  sub _get_next_token ($) {
586        return  ($token);        return  ($token);
587        redo A;        redo A;
588      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
589          ## XML5: "tag state".
590    
591        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
593                        
# Line 491  sub _get_next_token ($) { Line 606  sub _get_next_token ($) {
606            redo A;            redo A;
607          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
608                        
609            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
610            #            #
611          } else {          } else {
612                        
613              $self->{s_kwd} = '';
614            #            #
615          }          }
616    
# Line 583  sub _get_next_token ($) { Line 699  sub _get_next_token ($) {
699                            line => $self->{line_prev},                            line => $self->{line_prev},
700                            column => $self->{column_prev});                            column => $self->{column_prev});
701            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
702              $self->{s_kwd} = '';
703                        
704      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 602  sub _get_next_token ($) { Line 719  sub _get_next_token ($) {
719    
720            redo A;            redo A;
721          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
722                        if ($self->{is_xml}) {
723            $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',              
724                            line => $self->{line_prev},              $self->{state} = PI_STATE;
725                            column => $self->{column_prev});              
726            $self->{state} = BOGUS_COMMENT_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727            $self->{ct} = {type => COMMENT_TOKEN, data => '',        $self->{line_prev} = $self->{line};
728                                      line => $self->{line_prev},        $self->{column_prev} = $self->{column};
729                                      column => $self->{column_prev},        $self->{column}++;
730                                     };        $self->{nc}
731            ## $self->{nc} is intentionally left as is            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732            redo A;      } else {
733          } else {        $self->{set_nc}->($self);
734        }
735      
736                redo A;
737              } else {
738                
739                $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740                                line => $self->{line_prev},
741                                column => $self->{column_prev});
742                $self->{state} = BOGUS_COMMENT_STATE;
743                $self->{ct} = {type => COMMENT_TOKEN, data => '',
744                               line => $self->{line_prev},
745                               column => $self->{column_prev},
746                              };
747                ## $self->{nc} is intentionally left as is
748                redo A;
749              }
750            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751                        
752            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753                            line => $self->{line_prev},                            line => $self->{line_prev},
754                            column => $self->{column_prev});                            column => $self->{column_prev});
755            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
756              $self->{s_kwd} = '';
757            ## reconsume            ## reconsume
758    
759            return  ({type => CHARACTER_TOKEN, data => '<',            return  ({type => CHARACTER_TOKEN, data => '<',
# Line 627  sub _get_next_token ($) { Line 762  sub _get_next_token ($) {
762                     });                     });
763    
764            redo A;            redo A;
765            } else {
766              ## XML5: "<:" is a parse error.
767              
768              $self->{ct} = {type => START_TAG_TOKEN,
769                                        tag_name => chr ($self->{nc}),
770                                        line => $self->{line_prev},
771                                        column => $self->{column_prev}};
772              $self->{state} = TAG_NAME_STATE;
773              
774        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775          $self->{line_prev} = $self->{line};
776          $self->{column_prev} = $self->{column};
777          $self->{column}++;
778          $self->{nc}
779              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780        } else {
781          $self->{set_nc}->($self);
782        }
783      
784              redo A;
785          }          }
786        } else {        } else {
787          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 635  sub _get_next_token ($) { Line 790  sub _get_next_token ($) {
790        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
791        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792    
793          ## XML5: "end tag state".
794    
795        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
798            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799            $self->{s_kwd} = '';            $self->{kwd} = '';
800            ## Reconsume.            ## Reconsume.
801            redo A;            redo A;
802          } else {          } else {
# Line 647  sub _get_next_token ($) { Line 804  sub _get_next_token ($) {
804            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
805                        
806            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
807              $self->{s_kwd} = '';
808            ## Reconsume.            ## Reconsume.
809            return  ({type => CHARACTER_TOKEN, data => '</',            return  ({type => CHARACTER_TOKEN, data => '</',
810                      line => $l, column => $c,                      line => $l, column => $c,
# Line 695  sub _get_next_token ($) { Line 853  sub _get_next_token ($) {
853        
854          redo A;          redo A;
855        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
856          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
858                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
859          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
860                    $self->{s_kwd} = '';
861            if ($self->{is_xml}) {
862              
863              ## XML5: No parse error.
864              
865              ## NOTE: This parser raises a parse error, since it supports
866              ## XML1, not XML5.
867    
868              ## NOTE: A short end tag token.
869              my $ct = {type => END_TAG_TOKEN,
870                        tag_name => '',
871                        line => $self->{line_prev},
872                        column => $self->{column_prev} - 1,
873                       };
874              
875      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
877        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 711  sub _get_next_token ($) { Line 882  sub _get_next_token ($) {
882        $self->{set_nc}->($self);        $self->{set_nc}->($self);
883      }      }
884        
885              return  ($ct);
886            } else {
887              
888              
889        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890          $self->{line_prev} = $self->{line};
891          $self->{column_prev} = $self->{column};
892          $self->{column}++;
893          $self->{nc}
894              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895        } else {
896          $self->{set_nc}->($self);
897        }
898      
899            }
900          redo A;          redo A;
901        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
902                    
903          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
904            $self->{s_kwd} = '';
905          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
906          # reconsume          # reconsume
907    
# Line 723  sub _get_next_token ($) { Line 910  sub _get_next_token ($) {
910                   });                   });
911    
912          redo A;          redo A;
913        } else {        } elsif (not $self->{is_xml} or
914                   $is_space->{$self->{nc}}) {
915                    
916          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917                            line => $self->{line_prev}, # "<" of "</"
918                            column => $self->{column_prev} - 1);
919          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
920          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
921                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 738  sub _get_next_token ($) { Line 928  sub _get_next_token ($) {
928          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
929          ## "bogus comment state" entry.          ## "bogus comment state" entry.
930          redo A;          redo A;
931          } else {
932            ## XML5: "</:" is a parse error.
933            
934            $self->{ct} = {type => END_TAG_TOKEN,
935                           tag_name => chr ($self->{nc}),
936                           line => $l, column => $c};
937            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938            
939        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940          $self->{line_prev} = $self->{line};
941          $self->{column_prev} = $self->{column};
942          $self->{column}++;
943          $self->{nc}
944              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945        } else {
946          $self->{set_nc}->($self);
947        }
948      
949            redo A;
950        }        }
951      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953        if (length $ch) {        if (length $ch) {
954          my $CH = $ch;          my $CH = $ch;
955          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 748  sub _get_next_token ($) { Line 957  sub _get_next_token ($) {
957          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
958                        
959            ## Stay in the state.            ## Stay in the state.
960            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
961                        
962      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 764  sub _get_next_token ($) { Line 973  sub _get_next_token ($) {
973          } else {          } else {
974                        
975            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
976              $self->{s_kwd} = '';
977            ## Reconsume.            ## Reconsume.
978            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
979                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
980                      line => $self->{line_prev},                      line => $self->{line_prev},
981                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
982                     });                     });
983            redo A;            redo A;
984          }          }
# Line 782  sub _get_next_token ($) { Line 992  sub _get_next_token ($) {
992                        
993            ## Reconsume.            ## Reconsume.
994            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
995              $self->{s_kwd} = '';
996            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
997                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
998                      line => $self->{line_prev},                      line => $self->{line_prev},
999                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
1000                     });                     });
1001            redo A;            redo A;
1002          } else {          } else {
# Line 794  sub _get_next_token ($) { Line 1005  sub _get_next_token ($) {
1005                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
1006                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
1007                   line => $self->{line_prev},                   line => $self->{line_prev},
1008                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
1009            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
1010            ## Reconsume.            ## Reconsume.
1011            redo A;            redo A;
# Line 833  sub _get_next_token ($) { Line 1044  sub _get_next_token ($) {
1044            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1045          }          }
1046          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1047            $self->{s_kwd} = '';
1048                    
1049      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 885  sub _get_next_token ($) { Line 1097  sub _get_next_token ($) {
1097            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1098          }          }
1099          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1100            $self->{s_kwd} = '';
1101          # reconsume          # reconsume
1102    
1103          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 924  sub _get_next_token ($) { Line 1137  sub _get_next_token ($) {
1137          redo A;          redo A;
1138        }        }
1139      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140          ## XML5: "Tag attribute name before state".
1141    
1142        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1143                    
1144          ## Stay in the state          ## Stay in the state
# Line 955  sub _get_next_token ($) { Line 1170  sub _get_next_token ($) {
1170            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1171          }          }
1172          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1173            $self->{s_kwd} = '';
1174                    
1175      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1176        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1022  sub _get_next_token ($) { Line 1238  sub _get_next_token ($) {
1238            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1239          }          }
1240          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1241            $self->{s_kwd} = '';
1242          # reconsume          # reconsume
1243    
1244          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1034  sub _get_next_token ($) { Line 1251  sub _get_next_token ($) {
1251               0x003D => 1, # =               0x003D => 1, # =
1252              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1253                        
1254              ## XML5: Not a parse error.
1255            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1256          } else {          } else {
1257                        
1258              ## XML5: ":" raises a parse error and is ignored.
1259          }          }
1260          $self->{ca}          $self->{ca}
1261              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 1057  sub _get_next_token ($) { Line 1276  sub _get_next_token ($) {
1276          redo A;          redo A;
1277        }        }
1278      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1279          ## XML5: "Tag attribute name state".
1280    
1281        my $before_leave = sub {        my $before_leave = sub {
1282          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
1283              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 1067  sub _get_next_token ($) { Line 1288  sub _get_next_token ($) {
1288                        
1289            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
1290              = $self->{ca};              = $self->{ca};
1291              $self->{ca}->{index} = ++$self->{ct}->{last_index};
1292          }          }
1293        }; # $before_leave        }; # $before_leave
1294    
# Line 1103  sub _get_next_token ($) { Line 1325  sub _get_next_token ($) {
1325        
1326          redo A;          redo A;
1327        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1328            if ($self->{is_xml}) {
1329              
1330              ## XML5: Not a parse error.
1331              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1332            } else {
1333              
1334            }
1335    
1336          $before_leave->();          $before_leave->();
1337          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338                        
# Line 1117  sub _get_next_token ($) { Line 1347  sub _get_next_token ($) {
1347            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1348          }          }
1349          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1350            $self->{s_kwd} = '';
1351                    
1352      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1353        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1151  sub _get_next_token ($) { Line 1382  sub _get_next_token ($) {
1382        
1383          redo A;          redo A;
1384        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1385            if ($self->{is_xml}) {
1386              
1387              ## XML5: Not a parse error.
1388              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1389            } else {
1390              
1391            }
1392                    
1393          $before_leave->();          $before_leave->();
1394          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
# Line 1185  sub _get_next_token ($) { Line 1423  sub _get_next_token ($) {
1423            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1424          }          }
1425          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1426            $self->{s_kwd} = '';
1427          # reconsume          # reconsume
1428    
1429          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1194  sub _get_next_token ($) { Line 1433  sub _get_next_token ($) {
1433          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1434              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1435                        
1436              ## XML5: Not a parse error.
1437            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1438          } else {          } else {
1439                        
# Line 1214  sub _get_next_token ($) { Line 1454  sub _get_next_token ($) {
1454          redo A;          redo A;
1455        }        }
1456      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1457          ## XML5: "Tag attribute name after state".
1458          
1459        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1460                    
1461          ## Stay in the state          ## Stay in the state
# Line 1245  sub _get_next_token ($) { Line 1487  sub _get_next_token ($) {
1487        
1488          redo A;          redo A;
1489        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1490            if ($self->{is_xml}) {
1491              
1492              ## XML5: Not a parse error.
1493              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494            } else {
1495              
1496            }
1497    
1498          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499                        
1500            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1261  sub _get_next_token ($) { Line 1511  sub _get_next_token ($) {
1511            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1512          }          }
1513          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1514            $self->{s_kwd} = '';
1515                    
1516      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1517        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1297  sub _get_next_token ($) { Line 1548  sub _get_next_token ($) {
1548        
1549          redo A;          redo A;
1550        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1551            if ($self->{is_xml}) {
1552              
1553              ## XML5: Not a parse error.
1554              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1555            } else {
1556              
1557            }
1558                    
1559          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1560                    
# Line 1328  sub _get_next_token ($) { Line 1586  sub _get_next_token ($) {
1586          } else {          } else {
1587            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1588          }          }
1589            $self->{s_kwd} = '';
1590          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1591          # reconsume          # reconsume
1592    
# Line 1335  sub _get_next_token ($) { Line 1594  sub _get_next_token ($) {
1594    
1595          redo A;          redo A;
1596        } else {        } else {
1597            if ($self->{is_xml}) {
1598              
1599              ## XML5: Not a parse error.
1600              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1601            } else {
1602              
1603            }
1604    
1605          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1606              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1607                        
1608              ## XML5: Not a parse error.
1609            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1610          } else {          } else {
1611                        
# Line 1361  sub _get_next_token ($) { Line 1629  sub _get_next_token ($) {
1629          redo A;                  redo A;        
1630        }        }
1631      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1632          ## XML5: "Tag attribute value before state".
1633    
1634        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1635                    
1636          ## Stay in the state          ## Stay in the state
# Line 1429  sub _get_next_token ($) { Line 1699  sub _get_next_token ($) {
1699            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1700          }          }
1701          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1702            $self->{s_kwd} = '';
1703                    
1704      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1462  sub _get_next_token ($) { Line 1733  sub _get_next_token ($) {
1733            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1734          }          }
1735          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1736            $self->{s_kwd} = '';
1737          ## reconsume          ## reconsume
1738    
1739          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1470  sub _get_next_token ($) { Line 1742  sub _get_next_token ($) {
1742        } else {        } else {
1743          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1744                        
1745              ## XML5: Not a parse error.
1746            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1747            } elsif ($self->{is_xml}) {
1748              
1749              ## XML5: No parse error.
1750              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1751          } else {          } else {
1752                        
1753          }          }
# Line 1490  sub _get_next_token ($) { Line 1767  sub _get_next_token ($) {
1767          redo A;          redo A;
1768        }        }
1769      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1770          ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1771          ## ATTLIST attribute value double quoted state".
1772          
1773        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1774                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1775          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1776              ## XML5: "DOCTYPE ATTLIST name after state".
1777              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1778              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1779            } else {
1780              
1781              ## XML5: "Tag attribute name before state".
1782              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1783            }
1784                    
1785      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1786        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1507  sub _get_next_token ($) { Line 1795  sub _get_next_token ($) {
1795          redo A;          redo A;
1796        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1797                    
1798            ## XML5: Not defined yet.
1799    
1800          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1801          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1802          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1531  sub _get_next_token ($) { Line 1821  sub _get_next_token ($) {
1821          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1822                        
1823            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1824    
1825              $self->{state} = DATA_STATE;
1826              $self->{s_kwd} = '';
1827              ## reconsume
1828              return  ($self->{ct}); # start tag
1829              redo A;
1830          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1831            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1832            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1540  sub _get_next_token ($) { Line 1836  sub _get_next_token ($) {
1836              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1837                            
1838            }            }
1839    
1840              $self->{state} = DATA_STATE;
1841              $self->{s_kwd} = '';
1842              ## reconsume
1843              return  ($self->{ct}); # end tag
1844              redo A;
1845            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1846              ## XML5: No parse error above; not defined yet.
1847              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1848              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1849              ## Reconsume.
1850              return  ($self->{ct}); # ATTLIST
1851              redo A;
1852          } else {          } else {
1853            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1854          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
1855        } else {        } else {
1856                    ## XML5 [ATTLIST]: Not defined yet.
1857            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1858              
1859              ## XML5: Not a parse error.
1860              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1861            } else {
1862              
1863            }
1864          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1865          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1866                                q["&],                                q["&<],
1867                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1868    
1869          ## Stay in the state          ## Stay in the state
# Line 1571  sub _get_next_token ($) { Line 1881  sub _get_next_token ($) {
1881          redo A;          redo A;
1882        }        }
1883      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1884          ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1885          ## ATTLIST attribute value single quoted state".
1886    
1887        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1888                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1889          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1890              ## XML5: "DOCTYPE ATTLIST name after state".
1891              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1892              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1893            } else {
1894              
1895              ## XML5: "Before attribute name state" (sic).
1896              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1897            }
1898                    
1899      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1900        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1588  sub _get_next_token ($) { Line 1909  sub _get_next_token ($) {
1909          redo A;          redo A;
1910        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1911                    
1912            ## XML5: Not defined yet.
1913    
1914          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1915          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1916          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1612  sub _get_next_token ($) { Line 1935  sub _get_next_token ($) {
1935          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1936                        
1937            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1938    
1939              $self->{state} = DATA_STATE;
1940              $self->{s_kwd} = '';
1941              ## reconsume
1942              return  ($self->{ct}); # start tag
1943              redo A;
1944          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1945            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1946            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1621  sub _get_next_token ($) { Line 1950  sub _get_next_token ($) {
1950              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1951                            
1952            }            }
1953    
1954              $self->{state} = DATA_STATE;
1955              $self->{s_kwd} = '';
1956              ## reconsume
1957              return  ($self->{ct}); # end tag
1958              redo A;
1959            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1960              ## XML5: No parse error above; not defined yet.
1961              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1962              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1963              ## Reconsume.
1964              return  ($self->{ct}); # ATTLIST
1965              redo A;
1966          } else {          } else {
1967            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1968          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
1969        } else {        } else {
1970                    ## XML5 [ATTLIST]: Not defined yet.
1971            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1972              
1973              ## XML5: Not a parse error.
1974              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1975            } else {
1976              
1977            }
1978          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1979          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1980                                q['&],                                q['&<],
1981                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1982    
1983          ## Stay in the state          ## Stay in the state
# Line 1652  sub _get_next_token ($) { Line 1995  sub _get_next_token ($) {
1995          redo A;          redo A;
1996        }        }
1997      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1998          ## XML5: "Tag attribute value unquoted state".
1999    
2000        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2001                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2002          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;            
2003              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2004              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2005            } else {
2006              
2007              ## XML5: "Tag attribute name before state".
2008              $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2009            }
2010                    
2011      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2012        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1669  sub _get_next_token ($) { Line 2021  sub _get_next_token ($) {
2021          redo A;          redo A;
2022        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
2023                    
2024    
2025            ## XML5: Not defined yet.
2026    
2027          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
2028          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
2029          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1692  sub _get_next_token ($) { Line 2047  sub _get_next_token ($) {
2047          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2048                        
2049            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2050    
2051              $self->{state} = DATA_STATE;
2052              $self->{s_kwd} = '';
2053              
2054        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2055          $self->{line_prev} = $self->{line};
2056          $self->{column_prev} = $self->{column};
2057          $self->{column}++;
2058          $self->{nc}
2059              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2060        } else {
2061          $self->{set_nc}->($self);
2062        }
2063      
2064              return  ($self->{ct}); # start tag
2065              redo A;
2066          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2067            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2068            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1701  sub _get_next_token ($) { Line 2072  sub _get_next_token ($) {
2072              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2073                            
2074            }            }
2075          } else {  
2076            die "$0: $self->{ct}->{type}: Unknown token type";            $self->{state} = DATA_STATE;
2077          }            $self->{s_kwd} = '';
2078          $self->{state} = DATA_STATE;            
           
2079      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2080        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
2081        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 1716  sub _get_next_token ($) { Line 2086  sub _get_next_token ($) {
2086        $self->{set_nc}->($self);        $self->{set_nc}->($self);
2087      }      }
2088        
2089              return  ($self->{ct}); # end tag
2090          return  ($self->{ct}); # start tag or end tag            redo A;
2091            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2092          redo A;            push @{$self->{ct}->{attrdefs}}, $self->{ca};
2093              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2094              
2095        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2096          $self->{line_prev} = $self->{line};
2097          $self->{column_prev} = $self->{column};
2098          $self->{column}++;
2099          $self->{nc}
2100              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2101        } else {
2102          $self->{set_nc}->($self);
2103        }
2104      
2105              return  ($self->{ct}); # ATTLIST
2106              redo A;
2107            } else {
2108              die "$0: $self->{ct}->{type}: Unknown token type";
2109            }
2110        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');  
2111          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2112                        
2113              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2114            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2115    
2116              $self->{state} = DATA_STATE;
2117              $self->{s_kwd} = '';
2118              ## reconsume
2119              return  ($self->{ct}); # start tag
2120              redo A;
2121          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2122              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2123            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2124            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
2125                            
# Line 1734  sub _get_next_token ($) { Line 2128  sub _get_next_token ($) {
2128              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2129                            
2130            }            }
2131    
2132              $self->{state} = DATA_STATE;
2133              $self->{s_kwd} = '';
2134              ## reconsume
2135              return  ($self->{ct}); # end tag
2136              redo A;
2137            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2138              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2139              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2140              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2141              ## Reconsume.
2142              return  ($self->{ct}); # ATTLIST
2143              redo A;
2144          } else {          } else {
2145            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2146          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
2147        } else {        } else {
2148          if ({          if ({
2149               0x0022 => 1, # "               0x0022 => 1, # "
# Line 1750  sub _get_next_token ($) { Line 2151  sub _get_next_token ($) {
2151               0x003D => 1, # =               0x003D => 1, # =
2152              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2153                        
2154              ## XML5: Not a parse error.
2155            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2156          } else {          } else {
2157                        
# Line 1806  sub _get_next_token ($) { Line 2208  sub _get_next_token ($) {
2208            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2209          }          }
2210          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2211            $self->{s_kwd} = '';
2212                    
2213      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2214        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1853  sub _get_next_token ($) { Line 2256  sub _get_next_token ($) {
2256            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2257          }          }
2258          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2259            $self->{s_kwd} = '';
2260          ## Reconsume.          ## Reconsume.
2261          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
2262          redo A;          redo A;
# Line 1864  sub _get_next_token ($) { Line 2268  sub _get_next_token ($) {
2268          redo A;          redo A;
2269        }        }
2270      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2271          ## XML5: "Empty tag state".
2272    
2273        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2274          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
2275                        
# Line 1883  sub _get_next_token ($) { Line 2289  sub _get_next_token ($) {
2289          }          }
2290    
2291          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2292            $self->{s_kwd} = '';
2293                    
2294      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2295        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1914  sub _get_next_token ($) { Line 2321  sub _get_next_token ($) {
2321          } else {          } else {
2322            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2323          }          }
2324            ## XML5: "Tag attribute name before state".
2325          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2326            $self->{s_kwd} = '';
2327          ## Reconsume.          ## Reconsume.
2328          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
2329          redo A;          redo A;
# Line 1927  sub _get_next_token ($) { Line 2336  sub _get_next_token ($) {
2336          redo A;          redo A;
2337        }        }
2338      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2339        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2340    
2341        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
2342        ## consumes characters on a one-by-one basis.        ## consumes characters on a one-by-one basis.
2343                
2344        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2345                    if ($self->{in_subset}) {
2346          $self->{state} = DATA_STATE;            
2347              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2348            } else {
2349              
2350              $self->{state} = DATA_STATE;
2351              $self->{s_kwd} = '';
2352            }
2353                    
2354      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2355        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1950  sub _get_next_token ($) { Line 2365  sub _get_next_token ($) {
2365          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
2366          redo A;          redo A;
2367        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2368                    if ($self->{in_subset}) {
2369          $self->{state} = DATA_STATE;            
2370              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2371            } else {
2372              
2373              $self->{state} = DATA_STATE;
2374              $self->{s_kwd} = '';
2375            }
2376          ## reconsume          ## reconsume
2377    
2378          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 1978  sub _get_next_token ($) { Line 2399  sub _get_next_token ($) {
2399          redo A;          redo A;
2400        }        }
2401      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2402        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
2403                
2404        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2405                    
# Line 2000  sub _get_next_token ($) { Line 2421  sub _get_next_token ($) {
2421          ## ASCII case-insensitive.          ## ASCII case-insensitive.
2422                    
2423          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
2424          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2425                    
2426      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2427        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2019  sub _get_next_token ($) { Line 2440  sub _get_next_token ($) {
2440                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2441                                                    
2442          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
2443          $self->{s_kwd} = '[';          $self->{kwd} = '[';
2444                    
2445      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2446        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2053  sub _get_next_token ($) { Line 2474  sub _get_next_token ($) {
2474                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2475                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
2476                                   };                                   };
2477          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2478                    
2479      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2480        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2089  sub _get_next_token ($) { Line 2510  sub _get_next_token ($) {
2510              0x0054, # T              0x0054, # T
2511              0x0059, # Y              0x0059, # Y
2512              0x0050, # P              0x0050, # P
2513            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2514            $self->{nc} == [            $self->{nc} == [
2515              undef,              undef,
2516              0x006F, # o              0x006F, # o
# Line 2097  sub _get_next_token ($) { Line 2518  sub _get_next_token ($) {
2518              0x0074, # t              0x0074, # t
2519              0x0079, # y              0x0079, # y
2520              0x0070, # p              0x0070, # p
2521            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2522                    
2523          ## Stay in the state.          ## Stay in the state.
2524          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2525                    
2526      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2527        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2113  sub _get_next_token ($) { Line 2534  sub _get_next_token ($) {
2534      }      }
2535        
2536          redo A;          redo A;
2537        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
2538                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2539                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2540                    if ($self->{is_xml} and
2541                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2542              
2543              ## XML5: case-sensitive.
2544              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2545                              text => 'DOCTYPE',
2546                              line => $self->{line_prev},
2547                              column => $self->{column_prev} - 5);
2548            } else {
2549              
2550            }
2551          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
2552          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
2553                                    quirks => 1,                                    quirks => 1,
# Line 2139  sub _get_next_token ($) { Line 2570  sub _get_next_token ($) {
2570                                    
2571          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2572                          line => $self->{line_prev},                          line => $self->{line_prev},
2573                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2574          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2575          ## Reconsume.          ## Reconsume.
2576          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2577                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2578                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2579                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2580                                   };                                   };
2581          redo A;          redo A;
2582        }        }
# Line 2156  sub _get_next_token ($) { Line 2587  sub _get_next_token ($) {
2587              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
2588              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
2589              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
2590            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
2591                    
2592          ## Stay in the state.          ## Stay in the state.
2593          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2594                    
2595      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2596        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2172  sub _get_next_token ($) { Line 2603  sub _get_next_token ($) {
2603      }      }
2604        
2605          redo A;          redo A;
2606        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
2607                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2608                    if ($self->{is_xml} and
2609                not $self->{tainted} and
2610                @{$self->{open_elements} or []} == 0) {
2611              
2612              $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2613                              line => $self->{line_prev},
2614                              column => $self->{column_prev} - 7);
2615              $self->{tainted} = 1;
2616            } else {
2617              
2618            }
2619    
2620          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
2621                                    data => '',                                    data => '',
2622                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 2196  sub _get_next_token ($) { Line 2638  sub _get_next_token ($) {
2638                    
2639          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2640                          line => $self->{line_prev},                          line => $self->{line_prev},
2641                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2642          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2643          ## Reconsume.          ## Reconsume.
2644          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2645                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2646                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2647                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2648                                   };                                   };
2649          redo A;          redo A;
2650        }        }
# Line 2223  sub _get_next_token ($) { Line 2665  sub _get_next_token ($) {
2665        
2666          redo A;          redo A;
2667        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2668          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2669          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2670              
2671              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2672            } else {
2673              
2674              $self->{state} = DATA_STATE;
2675              $self->{s_kwd} = '';
2676            }
2677                    
2678      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2679        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2242  sub _get_next_token ($) { Line 2690  sub _get_next_token ($) {
2690    
2691          redo A;          redo A;
2692        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2693          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2694          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2695              
2696              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2697            } else {
2698              
2699              $self->{state} = DATA_STATE;
2700              $self->{s_kwd} = '';
2701            }
2702          ## reconsume          ## reconsume
2703    
2704          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2285  sub _get_next_token ($) { Line 2739  sub _get_next_token ($) {
2739        
2740          redo A;          redo A;
2741        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2742          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2743          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2744              
2745              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2746            } else {
2747              
2748              $self->{state} = DATA_STATE;
2749              $self->{s_kwd} = '';
2750            }
2751                    
2752      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2753        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2304  sub _get_next_token ($) { Line 2764  sub _get_next_token ($) {
2764    
2765          redo A;          redo A;
2766        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2767          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2768          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2769              
2770              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2771            } else {
2772              
2773              $self->{state} = DATA_STATE;
2774              $self->{s_kwd} = '';
2775            }
2776          ## reconsume          ## reconsume
2777    
2778          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2331  sub _get_next_token ($) { Line 2797  sub _get_next_token ($) {
2797          redo A;          redo A;
2798        }        }
2799      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
2800          ## XML5: "Comment state" and "DOCTYPE comment state".
2801    
2802        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2803                    
2804          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
# Line 2347  sub _get_next_token ($) { Line 2815  sub _get_next_token ($) {
2815        
2816          redo A;          redo A;
2817        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2818          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2819          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2820              
2821              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2822            } else {
2823              
2824              $self->{state} = DATA_STATE;
2825              $self->{s_kwd} = '';
2826            }
2827          ## reconsume          ## reconsume
2828    
2829          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2377  sub _get_next_token ($) { Line 2851  sub _get_next_token ($) {
2851          redo A;          redo A;
2852        }        }
2853      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2854          ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2855    
2856        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2857                    
2858          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 2393  sub _get_next_token ($) { Line 2869  sub _get_next_token ($) {
2869        
2870          redo A;          redo A;
2871        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2872          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2873          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2874              
2875              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2876            } else {
2877              
2878              $self->{state} = DATA_STATE;
2879              $self->{s_kwd} = '';
2880            }
2881          ## reconsume          ## reconsume
2882    
2883          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2419  sub _get_next_token ($) { Line 2901  sub _get_next_token ($) {
2901          redo A;          redo A;
2902        }        }
2903      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
2904          ## XML5: "Comment end state" and "DOCTYPE comment end state".
2905    
2906        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2907                    if ($self->{in_subset}) {
2908          $self->{state} = DATA_STATE;            
2909              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2910            } else {
2911              
2912              $self->{state} = DATA_STATE;
2913              $self->{s_kwd} = '';
2914            }
2915                    
2916      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2917        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2439  sub _get_next_token ($) { Line 2929  sub _get_next_token ($) {
2929          redo A;          redo A;
2930        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2931                    
2932            ## XML5: Not a parse error.
2933          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2934                          line => $self->{line_prev},                          line => $self->{line_prev},
2935                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2457  sub _get_next_token ($) { Line 2948  sub _get_next_token ($) {
2948        
2949          redo A;          redo A;
2950        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2951          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2952          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2953              
2954              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2955            } else {
2956              
2957              $self->{state} = DATA_STATE;
2958              $self->{s_kwd} = '';
2959            }
2960          ## reconsume          ## reconsume
2961    
2962          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2467  sub _get_next_token ($) { Line 2964  sub _get_next_token ($) {
2964          redo A;          redo A;
2965        } else {        } else {
2966                    
2967            ## XML5: Not a parse error.
2968          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2969                          line => $self->{line_prev},                          line => $self->{line_prev},
2970                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2503  sub _get_next_token ($) { Line 3001  sub _get_next_token ($) {
3001          redo A;          redo A;
3002        } else {        } else {
3003                    
3004            ## XML5: Unless EOF, switch to the bogus comment state.
3005          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3006          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3007          ## reconsume          ## reconsume
3008          redo A;          redo A;
3009        }        }
3010      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3011          ## XML5: "DOCTYPE root name before state".
3012    
3013        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3014                    
3015          ## Stay in the state          ## Stay in the state
# Line 2526  sub _get_next_token ($) { Line 3027  sub _get_next_token ($) {
3027          redo A;          redo A;
3028        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3029                    
3030            ## XML5: No parse error.
3031          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3032          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3033            $self->{s_kwd} = '';
3034                    
3035      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3036        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2547  sub _get_next_token ($) { Line 3050  sub _get_next_token ($) {
3050                    
3051          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3052          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3053            $self->{s_kwd} = '';
3054          ## reconsume          ## reconsume
3055    
3056          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3057    
3058          redo A;          redo A;
3059          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3060            
3061            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3062            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3063            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3064            $self->{in_subset} = 1;
3065            
3066        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3067          $self->{line_prev} = $self->{line};
3068          $self->{column_prev} = $self->{column};
3069          $self->{column}++;
3070          $self->{nc}
3071              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3072        } else {
3073          $self->{set_nc}->($self);
3074        }
3075      
3076            return  ($self->{ct}); # DOCTYPE
3077            redo A;
3078        } else {        } else {
3079                    
3080          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 2571  sub _get_next_token ($) { Line 3094  sub _get_next_token ($) {
3094          redo A;          redo A;
3095        }        }
3096      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3097  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
3098    
3099          ## ISSUE: Redundant "First," in the spec.
3100    
3101        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3102                    
3103          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 2590  sub _get_next_token ($) { Line 3116  sub _get_next_token ($) {
3116        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3117                    
3118          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3119            $self->{s_kwd} = '';
3120                    
3121      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3122        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2609  sub _get_next_token ($) { Line 3136  sub _get_next_token ($) {
3136                    
3137          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3138          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3139            $self->{s_kwd} = '';
3140          ## reconsume          ## reconsume
3141    
3142          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3143          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3144    
3145          redo A;          redo A;
3146          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3147            
3148            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3149            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3150            $self->{in_subset} = 1;
3151            
3152        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3153          $self->{line_prev} = $self->{line};
3154          $self->{column_prev} = $self->{column};
3155          $self->{column}++;
3156          $self->{nc}
3157              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3158        } else {
3159          $self->{set_nc}->($self);
3160        }
3161      
3162            return  ($self->{ct}); # DOCTYPE
3163            redo A;
3164        } else {        } else {
3165                    
3166          $self->{ct}->{name}          $self->{ct}->{name}
# Line 2634  sub _get_next_token ($) { Line 3180  sub _get_next_token ($) {
3180          redo A;          redo A;
3181        }        }
3182      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3183          ## XML5: Corresponding to XML5's "DOCTYPE root name after
3184          ## state", but implemented differently.
3185    
3186        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3187                    
3188          ## Stay in the state          ## Stay in the state
# Line 2650  sub _get_next_token ($) { Line 3199  sub _get_next_token ($) {
3199        
3200          redo A;          redo A;
3201        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3202            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3203              
3204              $self->{state} = DATA_STATE;
3205              $self->{s_kwd} = '';
3206            } else {
3207              
3208              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3209              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3210            }
3211                    
         $self->{state} = DATA_STATE;  
3212                    
3213      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3214        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2663  sub _get_next_token ($) { Line 3220  sub _get_next_token ($) {
3220        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3221      }      }
3222        
3223            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3224          redo A;          redo A;
3225        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3226            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3227              
3228              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3229              $self->{state} = DATA_STATE;
3230              $self->{s_kwd} = '';
3231              $self->{ct}->{quirks} = 1;
3232            } else {
3233              
3234              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3235              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3236            }
3237                    
3238          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          ## Reconsume.
3239          $self->{state} = DATA_STATE;          return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         ## reconsume  
   
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3240          redo A;          redo A;
3241        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
3242                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
3243            
3244          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
3245          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3246                    
3247      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3248        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2695  sub _get_next_token ($) { Line 3257  sub _get_next_token ($) {
3257          redo A;          redo A;
3258        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
3259                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
3260            
3261          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
3262          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3263                    
3264      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3265        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2709  sub _get_next_token ($) { Line 3272  sub _get_next_token ($) {
3272      }      }
3273        
3274          redo A;          redo A;
3275        } else {        } elsif ($self->{nc} == 0x0022 and # "
3276                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3277                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3278                    
3279          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');          $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3280          $self->{ct}->{quirks} = 1;          $self->{ct}->{value} = ''; # ENTITY
3281            
3282        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3283          $self->{line_prev} = $self->{line};
3284          $self->{column_prev} = $self->{column};
3285          $self->{column}++;
3286          $self->{nc}
3287              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3288        } else {
3289          $self->{set_nc}->($self);
3290        }
3291      
3292            redo A;
3293          } elsif ($self->{nc} == 0x0027 and # '
3294                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3295                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3296            
3297            $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3298            $self->{ct}->{value} = ''; # ENTITY
3299            
3300        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3301          $self->{line_prev} = $self->{line};
3302          $self->{column_prev} = $self->{column};
3303          $self->{column}++;
3304          $self->{nc}
3305              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3306        } else {
3307          $self->{set_nc}->($self);
3308        }
3309      
3310            redo A;
3311          } elsif ($self->{is_xml} and
3312                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3313                   $self->{nc} == 0x005B) { # [
3314            
3315            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3316            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3317            $self->{in_subset} = 1;
3318            
3319        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3320          $self->{line_prev} = $self->{line};
3321          $self->{column_prev} = $self->{column};
3322          $self->{column}++;
3323          $self->{nc}
3324              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3325        } else {
3326          $self->{set_nc}->($self);
3327        }
3328      
3329            return  ($self->{ct}); # DOCTYPE
3330            redo A;
3331          } else {
3332            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3333    
3334            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3335              
3336              $self->{ct}->{quirks} = 1;
3337              $self->{state} = BOGUS_DOCTYPE_STATE;
3338            } else {
3339              
3340              $self->{state} = BOGUS_MD_STATE;
3341            }
3342    
         $self->{state} = BOGUS_DOCTYPE_STATE;  
3343                    
3344      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3345        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2736  sub _get_next_token ($) { Line 3361  sub _get_next_token ($) {
3361              0x0042, # B              0x0042, # B
3362              0x004C, # L              0x004C, # L
3363              0x0049, # I              0x0049, # I
3364            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3365            $self->{nc} == [            $self->{nc} == [
3366              undef,              undef,
3367              0x0075, # u              0x0075, # u
3368              0x0062, # b              0x0062, # b
3369              0x006C, # l              0x006C, # l
3370              0x0069, # i              0x0069, # i
3371            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3372                    
3373          ## Stay in the state.          ## Stay in the state.
3374          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3375                    
3376      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3377        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2759  sub _get_next_token ($) { Line 3384  sub _get_next_token ($) {
3384      }      }
3385        
3386          redo A;          redo A;
3387        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3388                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
3389                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
3390                    if ($self->{is_xml} and
3391                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3392              
3393              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3394                              text => 'PUBLIC',
3395                              line => $self->{line_prev},
3396                              column => $self->{column_prev} - 4);
3397            } else {
3398              
3399            }
3400          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3401                    
3402      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2777  sub _get_next_token ($) { Line 3411  sub _get_next_token ($) {
3411        
3412          redo A;          redo A;
3413        } else {        } else {
3414                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3415                          line => $self->{line_prev},                          line => $self->{line_prev},
3416                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3417          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3418              
3419          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3420              $self->{state} = BOGUS_DOCTYPE_STATE;
3421            } else {
3422              
3423              $self->{state} = BOGUS_MD_STATE;
3424            }
3425          ## Reconsume.          ## Reconsume.
3426          redo A;          redo A;
3427        }        }
# Line 2795  sub _get_next_token ($) { Line 3433  sub _get_next_token ($) {
3433              0x0053, # S              0x0053, # S
3434              0x0054, # T              0x0054, # T
3435              0x0045, # E              0x0045, # E
3436            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3437            $self->{nc} == [            $self->{nc} == [
3438              undef,              undef,
3439              0x0079, # y              0x0079, # y
3440              0x0073, # s              0x0073, # s
3441              0x0074, # t              0x0074, # t
3442              0x0065, # e              0x0065, # e
3443            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3444                    
3445          ## Stay in the state.          ## Stay in the state.
3446          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3447                    
3448      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3449        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2818  sub _get_next_token ($) { Line 3456  sub _get_next_token ($) {
3456      }      }
3457        
3458          redo A;          redo A;
3459        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3460                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
3461                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
3462                    if ($self->{is_xml} and
3463                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3464              
3465              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3466                              text => 'SYSTEM',
3467                              line => $self->{line_prev},
3468                              column => $self->{column_prev} - 4);
3469            } else {
3470              
3471            }
3472          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3473                    
3474      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2836  sub _get_next_token ($) { Line 3483  sub _get_next_token ($) {
3483        
3484          redo A;          redo A;
3485        } else {        } else {
3486                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3487                          line => $self->{line_prev},                          line => $self->{line_prev},
3488                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3489          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3490              
3491          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3492              $self->{state} = BOGUS_DOCTYPE_STATE;
3493            } else {
3494              
3495              $self->{state} = BOGUS_MD_STATE;
3496            }
3497          ## Reconsume.          ## Reconsume.
3498          redo A;          redo A;
3499        }        }
# Line 2895  sub _get_next_token ($) { Line 3546  sub _get_next_token ($) {
3546        
3547          redo A;          redo A;
3548        } elsif ($self->{nc} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
           
3549          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3550            
3551          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3552              
3553              $self->{state} = DATA_STATE;
3554              $self->{s_kwd} = '';
3555              $self->{ct}->{quirks} = 1;
3556            } else {
3557              
3558              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3559            }
3560            
3561                    
3562      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3563        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2910  sub _get_next_token ($) { Line 3569  sub _get_next_token ($) {
3569        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3570      }      }
3571        
3572            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3573          redo A;          redo A;
3574        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3575            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3576              
3577              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3578              $self->{state} = DATA_STATE;
3579              $self->{s_kwd} = '';
3580              $self->{ct}->{quirks} = 1;
3581            } else {
3582              
3583              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3584              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3585            }
3586                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
3587          ## reconsume          ## reconsume
   
         $self->{ct}->{quirks} = 1;  
3588          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3589          redo A;          redo A;
3590        } else {        } elsif ($self->{is_xml} and
3591                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3592                   $self->{nc} == 0x005B) { # [
3593            
3594            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3595            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3596            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3597            $self->{in_subset} = 1;
3598                    
3599        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3600          $self->{line_prev} = $self->{line};
3601          $self->{column_prev} = $self->{column};
3602          $self->{column}++;
3603          $self->{nc}
3604              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3605        } else {
3606          $self->{set_nc}->($self);
3607        }
3608      
3609            return  ($self->{ct}); # DOCTYPE
3610            redo A;
3611          } else {
3612          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
         $self->{ct}->{quirks} = 1;  
3613    
3614          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3615              
3616              $self->{ct}->{quirks} = 1;
3617              $self->{state} = BOGUS_DOCTYPE_STATE;
3618            } else {
3619              
3620              $self->{state} = BOGUS_MD_STATE;
3621            }
3622    
3623                    
3624      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3625        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2962  sub _get_next_token ($) { Line 3650  sub _get_next_token ($) {
3650        
3651          redo A;          redo A;
3652        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3653          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3654    
3655          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3656              
3657              $self->{state} = DATA_STATE;
3658              $self->{s_kwd} = '';
3659              $self->{ct}->{quirks} = 1;
3660            } else {
3661              
3662              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3663            }
3664    
3665                    
3666      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3667        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2977  sub _get_next_token ($) { Line 3673  sub _get_next_token ($) {
3673        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3674      }      }
3675        
3676            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3677          redo A;          redo A;
3678        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3679          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3680    
3681          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3682          ## reconsume            
3683              $self->{state} = DATA_STATE;
3684          $self->{ct}->{quirks} = 1;            $self->{s_kwd} = '';
3685              $self->{ct}->{quirks} = 1;
3686            } else {
3687              
3688              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3689            }
3690            
3691            ## Reconsume.
3692          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3693          redo A;          redo A;
3694        } else {        } else {
3695                    
3696          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3697          $self->{read_until}->($self->{ct}->{pubid}, q[">],          $self->{read_until}->($self->{ct}->{pubid}, q[">],
3698                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3699    
# Line 3031  sub _get_next_token ($) { Line 3728  sub _get_next_token ($) {
3728        
3729          redo A;          redo A;
3730        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3731          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3732    
3733          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3734              
3735              $self->{state} = DATA_STATE;
3736              $self->{s_kwd} = '';
3737              $self->{ct}->{quirks} = 1;
3738            } else {
3739              
3740              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3741            }
3742    
3743                    
3744      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3745        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3046  sub _get_next_token ($) { Line 3751  sub _get_next_token ($) {
3751        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3752      }      }
3753        
3754            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3755          redo A;          redo A;
3756        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3757          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3758    
3759          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3760              
3761              $self->{state} = DATA_STATE;
3762              $self->{s_kwd} = '';
3763              $self->{ct}->{quirks} = 1;
3764            } else {
3765              
3766              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3767            }
3768          
3769          ## reconsume          ## reconsume
3770            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3771          redo A;          redo A;
3772        } else {        } else {
3773                    
3774          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3775          $self->{read_until}->($self->{ct}->{pubid}, q['>],          $self->{read_until}->($self->{ct}->{pubid}, q['>],
3776                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3777    
# Line 3101  sub _get_next_token ($) { Line 3807  sub _get_next_token ($) {
3807          redo A;          redo A;
3808        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
3809                    
3810          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3811          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3812                    
3813      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3117  sub _get_next_token ($) { Line 3823  sub _get_next_token ($) {
3823          redo A;          redo A;
3824        } elsif ($self->{nc} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
3825                    
3826          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3827          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3828                    
3829      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3132  sub _get_next_token ($) { Line 3838  sub _get_next_token ($) {
3838        
3839          redo A;          redo A;
3840        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3841            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3842              if ($self->{is_xml}) {
3843                
3844                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3845              } else {
3846                
3847              }
3848              $self->{state} = DATA_STATE;
3849              $self->{s_kwd} = '';
3850            } else {
3851              if ($self->{ct}->{type} == NOTATION_TOKEN) {
3852                
3853              } else {
3854                
3855                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');            
3856              }
3857              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3858            }
3859                    
         $self->{state} = DATA_STATE;  
3860                    
3861      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3862        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3145  sub _get_next_token ($) { Line 3868  sub _get_next_token ($) {
3868        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3869      }      }
3870        
3871            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3872          redo A;          redo A;
3873        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3874            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3875              
3876              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3877              
3878              $self->{state} = DATA_STATE;
3879              $self->{s_kwd} = '';
3880              $self->{ct}->{quirks} = 1;
3881            } else {
3882              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3883              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3884            }
3885                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
3886          ## reconsume          ## reconsume
3887            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3888          $self->{ct}->{quirks} = 1;          redo A;
3889          } elsif ($self->{is_xml} and
3890                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3891                   $self->{nc} == 0x005B) { # [
3892            
3893            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3894            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3895            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3896            $self->{in_subset} = 1;
3897            
3898        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3899          $self->{line_prev} = $self->{line};
3900          $self->{column_prev} = $self->{column};
3901          $self->{column}++;
3902          $self->{nc}
3903              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3904        } else {
3905          $self->{set_nc}->($self);
3906        }
3907      
3908          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3909          redo A;          redo A;
3910        } else {        } else {
           
3911          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
         $self->{ct}->{quirks} = 1;  
3912    
3913          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3914              
3915              $self->{ct}->{quirks} = 1;
3916              $self->{state} = BOGUS_DOCTYPE_STATE;
3917            } else {
3918              
3919              $self->{state} = BOGUS_MD_STATE;
3920            }
3921    
3922                    
3923      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3924        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3228  sub _get_next_token ($) { Line 3981  sub _get_next_token ($) {
3981        
3982          redo A;          redo A;
3983        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3984          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
         $self->{state} = DATA_STATE;  
3985                    
3986      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3987        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3243  sub _get_next_token ($) { Line 3994  sub _get_next_token ($) {
3994      }      }
3995        
3996    
3997          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3998          return  ($self->{ct}); # DOCTYPE            
3999              $self->{state} = DATA_STATE;
4000              $self->{s_kwd} = '';
4001              $self->{ct}->{quirks} = 1;
4002            } else {
4003              
4004              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4005            }
4006    
4007            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4008          redo A;          redo A;
4009        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4010            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4011              
4012              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4013              $self->{state} = DATA_STATE;
4014              $self->{s_kwd} = '';
4015              $self->{ct}->{quirks} = 1;
4016            } else {
4017              
4018              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4019              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4020            }
4021                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
4022          ## reconsume          ## reconsume
4023            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4024            redo A;
4025          } elsif ($self->{is_xml} and
4026                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4027                   $self->{nc} == 0x005B) { # [
4028            
4029            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4030    
4031          $self->{ct}->{quirks} = 1;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4032            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4033            $self->{in_subset} = 1;
4034            
4035        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4036          $self->{line_prev} = $self->{line};
4037          $self->{column_prev} = $self->{column};
4038          $self->{column}++;
4039          $self->{nc}
4040              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4041        } else {
4042          $self->{set_nc}->($self);
4043        }
4044      
4045          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
4046          redo A;          redo A;
4047        } else {        } else {
           
4048          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
         $self->{ct}->{quirks} = 1;  
4049    
4050          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4051                        
4052              $self->{ct}->{quirks} = 1;
4053              $self->{state} = BOGUS_DOCTYPE_STATE;
4054            } else {
4055              
4056              $self->{state} = BOGUS_MD_STATE;
4057            }
4058    
4059                    
4060      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4061        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3293  sub _get_next_token ($) { Line 4085  sub _get_next_token ($) {
4085      }      }
4086        
4087          redo A;          redo A;
4088        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
           
4089          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4090    
4091          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4092              
4093              $self->{state} = DATA_STATE;
4094              $self->{s_kwd} = '';
4095              $self->{ct}->{quirks} = 1;
4096            } else {
4097              
4098              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4099            }
4100            
4101                    
4102      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4103        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3309  sub _get_next_token ($) { Line 4109  sub _get_next_token ($) {
4109        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4110      }      }
4111        
4112            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4113          redo A;          redo A;
4114        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4115          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4116    
4117          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4118              
4119              $self->{state} = DATA_STATE;
4120              $self->{s_kwd} = '';
4121              $self->{ct}->{quirks} = 1;
4122            } else {
4123              
4124              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4125            }
4126            
4127          ## reconsume          ## reconsume
4128            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4129          redo A;          redo A;
4130        } else {        } else {
4131                    
4132          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4133          $self->{read_until}->($self->{ct}->{sysid}, q[">],          $self->{read_until}->($self->{ct}->{sysid}, q[">],
4134                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4135    
# Line 3362  sub _get_next_token ($) { Line 4163  sub _get_next_token ($) {
4163      }      }
4164        
4165          redo A;          redo A;
4166        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4167                    
4168          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4169    
4170          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4171            $self->{s_kwd} = '';
4172                    
4173      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4174        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3384  sub _get_next_token ($) { Line 4186  sub _get_next_token ($) {
4186    
4187          redo A;          redo A;
4188        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4189          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4190    
4191          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4192          ## reconsume            
4193              $self->{state} = DATA_STATE;
4194          $self->{ct}->{quirks} = 1;            $self->{s_kwd} = '';
4195          return  ($self->{ct}); # DOCTYPE            $self->{ct}->{quirks} = 1;
4196            } else {
4197              
4198              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4199            }
4200    
4201            ## reconsume
4202            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4203          redo A;          redo A;
4204        } else {        } else {
4205                    
4206          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4207          $self->{read_until}->($self->{ct}->{sysid}, q['>],          $self->{read_until}->($self->{ct}->{sysid}, q['>],
4208                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4209    
# Line 3417  sub _get_next_token ($) { Line 4223  sub _get_next_token ($) {
4223        }        }
4224      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4225        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4226                    if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4227          ## Stay in the state            
4228              $self->{state} = BEFORE_NDATA_STATE;
4229            } else {
4230              
4231              ## Stay in the state
4232            }
4233                    
4234      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4235        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3432  sub _get_next_token ($) { Line 4243  sub _get_next_token ($) {
4243        
4244          redo A;          redo A;
4245        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4246            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4247              
4248              $self->{state} = DATA_STATE;
4249              $self->{s_kwd} = '';
4250            } else {
4251              
4252              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4253            }
4254    
4255                    
4256          $self->{state} = DATA_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4257          $self->{line_prev} = $self->{line};
4258          $self->{column_prev} = $self->{column};
4259          $self->{column}++;
4260          $self->{nc}
4261              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4262        } else {
4263          $self->{set_nc}->($self);
4264        }
4265      
4266            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4267            redo A;
4268          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4269                   ($self->{nc} == 0x004E or # N
4270                    $self->{nc} == 0x006E)) { # n
4271            
4272            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4273            $self->{state} = NDATA_STATE;
4274            $self->{kwd} = chr $self->{nc};
4275                    
4276      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4277        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3445  sub _get_next_token ($) { Line 4283  sub _get_next_token ($) {
4283        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4284      }      }
4285        
4286            redo A;
4287          } elsif ($self->{nc} == -1) {
4288            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4289              
4290              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4291              $self->{state} = DATA_STATE;
4292              $self->{s_kwd} = '';
4293              $self->{ct}->{quirks} = 1;
4294            } else {
4295              
4296              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4297              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4298            }
4299    
4300            ## reconsume
4301            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4302            redo A;
4303          } elsif ($self->{is_xml} and
4304                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4305                   $self->{nc} == 0x005B) { # [
4306            
4307            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4308            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4309            $self->{in_subset} = 1;
4310            
4311        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4312          $self->{line_prev} = $self->{line};
4313          $self->{column_prev} = $self->{column};
4314          $self->{column}++;
4315          $self->{nc}
4316              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4317        } else {
4318          $self->{set_nc}->($self);
4319        }
4320      
4321          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4322            redo A;
4323          } else {
4324            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4325    
4326            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4327              
4328              #$self->{ct}->{quirks} = 1;
4329              $self->{state} = BOGUS_DOCTYPE_STATE;
4330            } else {
4331              
4332              $self->{state} = BOGUS_MD_STATE;
4333            }
4334    
4335            
4336        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4337          $self->{line_prev} = $self->{line};
4338          $self->{column_prev} = $self->{column};
4339          $self->{column}++;
4340          $self->{nc}
4341              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4342        } else {
4343          $self->{set_nc}->($self);
4344        }
4345      
4346            redo A;
4347          }
4348        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4349          if ($is_space->{$self->{nc}}) {
4350            
4351            ## Stay in the state.
4352            
4353        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4354          $self->{line_prev} = $self->{line};
4355          $self->{column_prev} = $self->{column};
4356          $self->{column}++;
4357          $self->{nc}
4358              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4359        } else {
4360          $self->{set_nc}->($self);
4361        }
4362      
4363            redo A;
4364          } elsif ($self->{nc} == 0x003E) { # >
4365            
4366            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4367            
4368        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4369          $self->{line_prev} = $self->{line};
4370          $self->{column_prev} = $self->{column};
4371          $self->{column}++;
4372          $self->{nc}
4373              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4374        } else {
4375          $self->{set_nc}->($self);
4376        }
4377      
4378            return  ($self->{ct}); # ENTITY
4379            redo A;
4380          } elsif ($self->{nc} == 0x004E or # N
4381                   $self->{nc} == 0x006E) { # n
4382            
4383            $self->{state} = NDATA_STATE;
4384            $self->{kwd} = chr $self->{nc};
4385            
4386        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4387          $self->{line_prev} = $self->{line};
4388          $self->{column_prev} = $self->{column};
4389          $self->{column}++;
4390          $self->{nc}
4391              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4392        } else {
4393          $self->{set_nc}->($self);
4394        }
4395      
4396          redo A;          redo A;
4397        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4398                    
4399          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4400          $self->{state} = DATA_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4401          ## reconsume          ## reconsume
4402            return  ($self->{ct}); # ENTITY
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4403          redo A;          redo A;
4404        } else {        } else {
4405                    
4406          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4407          #$self->{ct}->{quirks} = 1;          $self->{state} = BOGUS_MD_STATE;
   
         $self->{state} = BOGUS_DOCTYPE_STATE;  
4408                    
4409      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4410        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3482  sub _get_next_token ($) { Line 4422  sub _get_next_token ($) {
4422        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4423                    
4424          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4425            $self->{s_kwd} = '';
4426                    
4427      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4428        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3497  sub _get_next_token ($) { Line 4438  sub _get_next_token ($) {
4438          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4439    
4440          redo A;          redo A;
4441          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4442            
4443            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4444            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4445            $self->{in_subset} = 1;
4446            
4447        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4448          $self->{line_prev} = $self->{line};
4449          $self->{column_prev} = $self->{column};
4450          $self->{column}++;
4451          $self->{nc}
4452              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4453        } else {
4454          $self->{set_nc}->($self);
4455        }
4456      
4457            return  ($self->{ct}); # DOCTYPE
4458            redo A;
4459        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4460                    
4461          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4462            $self->{s_kwd} = '';
4463          ## reconsume          ## reconsume
4464    
4465          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
# Line 3508  sub _get_next_token ($) { Line 4468  sub _get_next_token ($) {
4468        } else {        } else {
4469                    
4470          my $s = '';          my $s = '';
4471          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
4472    
4473          ## Stay in the state          ## Stay in the state
4474                    
# Line 3528  sub _get_next_token ($) { Line 4488  sub _get_next_token ($) {
4488        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
4489        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4490        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
4491    
4492          ## XML5: "CDATA state".
4493                
4494        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4495                    
# Line 3545  sub _get_next_token ($) { Line 4507  sub _get_next_token ($) {
4507        
4508          redo A;          redo A;
4509        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4510            if ($self->{is_xml}) {
4511              
4512              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4513            } else {
4514              
4515            }
4516    
4517          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4518                    $self->{s_kwd} = '';
4519      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {          ## Reconsume.
       $self->{line_prev} = $self->{line};  
       $self->{column_prev} = $self->{column};  
       $self->{column}++;  
       $self->{nc}  
           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);  
     } else {  
       $self->{set_nc}->($self);  
     }  
     
4520          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
4521                        
4522            return  ($self->{ct}); # character            return  ($self->{ct}); # character
# Line 3589  sub _get_next_token ($) { Line 4549  sub _get_next_token ($) {
4549    
4550        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
4551      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4552          ## XML5: "CDATA bracket state".
4553    
4554        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4555                    
4556          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 3606  sub _get_next_token ($) { Line 4568  sub _get_next_token ($) {
4568          redo A;          redo A;
4569        } else {        } else {
4570                    
4571            ## XML5: If EOF, "]" is not appended and changed to the data state.
4572          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
4573          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4574          ## Reconsume.          ## Reconsume.
4575          redo A;          redo A;
4576        }        }
4577      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4578          ## XML5: "CDATA end state".
4579    
4580        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4581          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4582            $self->{s_kwd} = '';
4583                    
4584      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4585        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3653  sub _get_next_token ($) { Line 4619  sub _get_next_token ($) {
4619                    
4620          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
4621          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
4622          ## Reconsume.          ## Reconsume. ## XML5: Emit.
4623          redo A;          redo A;
4624        }        }
4625      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 3670  sub _get_next_token ($) { Line 4636  sub _get_next_token ($) {
4636        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
4637                    
4638          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
4639          $self->{s_kwd} = '#';          $self->{kwd} = '#';
4640                    
4641      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4642        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3690  sub _get_next_token ($) { Line 4656  sub _get_next_token ($) {
4656                    
4657          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
4658          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
4659          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
4660          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
4661          $self->{entity__match} = 0;          $self->{entity__match} = 0;
4662                    
4663      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3721  sub _get_next_token ($) { Line 4687  sub _get_next_token ($) {
4687        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4688                    
4689          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4690            $self->{s_kwd} = '';
4691          ## Reconsume.          ## Reconsume.
4692          return  ({type => CHARACTER_TOKEN, data => '&',          return  ({type => CHARACTER_TOKEN, data => '&',
4693                    line => $self->{line_prev},                    line => $self->{line_prev},
# Line 3731  sub _get_next_token ($) { Line 4698  sub _get_next_token ($) {
4698                    
4699          $self->{ca}->{value} .= '&';          $self->{ca}->{value} .= '&';
4700          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4701            $self->{s_kwd} = '';
4702          ## Reconsume.          ## Reconsume.
4703          redo A;          redo A;
4704        }        }
# Line 3739  sub _get_next_token ($) { Line 4707  sub _get_next_token ($) {
4707            $self->{nc} == 0x0058) { # X            $self->{nc} == 0x0058) { # X
4708                    
4709          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
4710          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4711                    
4712      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4713        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3756  sub _get_next_token ($) { Line 4724  sub _get_next_token ($) {
4724                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
4725                    
4726          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
4727          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
4728                    
4729      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4730        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3781  sub _get_next_token ($) { Line 4749  sub _get_next_token ($) {
4749          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
4750                        
4751            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4752              $self->{s_kwd} = '';
4753            ## Reconsume.            ## Reconsume.
4754            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4755                      data => '&#',                      data => '&#',
# Line 3792  sub _get_next_token ($) { Line 4761  sub _get_next_token ($) {
4761                        
4762            $self->{ca}->{value} .= '&#';            $self->{ca}->{value} .= '&#';
4763            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4764              $self->{s_kwd} = '';
4765            ## Reconsume.            ## Reconsume.
4766            redo A;            redo A;
4767          }          }
# Line 3800  sub _get_next_token ($) { Line 4770  sub _get_next_token ($) {
4770        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
4771            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
4772                    
4773          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
4774          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4775                    
4776          ## Stay in the state.          ## Stay in the state.
4777                    
# Line 3837  sub _get_next_token ($) { Line 4807  sub _get_next_token ($) {
4807          #          #
4808        }        }
4809    
4810        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4811        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4812        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4813        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 3857  sub _get_next_token ($) { Line 4827  sub _get_next_token ($) {
4827        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4828                    
4829          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4830            $self->{s_kwd} = '';
4831          ## Reconsume.          ## Reconsume.
4832          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4833                      has_reference => 1,
4834                    line => $l, column => $c,                    line => $l, column => $c,
4835                   });                   });
4836          redo A;          redo A;
# Line 3867  sub _get_next_token ($) { Line 4839  sub _get_next_token ($) {
4839          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
4840          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
4841          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4842            $self->{s_kwd} = '';
4843          ## Reconsume.          ## Reconsume.
4844          redo A;          redo A;
4845        }        }
# Line 3877  sub _get_next_token ($) { Line 4850  sub _get_next_token ($) {
4850          # 0..9, A..F, a..f          # 0..9, A..F, a..f
4851                    
4852          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
4853          $self->{s_kwd} = 0;          $self->{kwd} = 0;
4854          ## Reconsume.          ## Reconsume.
4855          redo A;          redo A;
4856        } else {        } else {
# Line 3892  sub _get_next_token ($) { Line 4865  sub _get_next_token ($) {
4865          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
4866                        
4867            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4868              $self->{s_kwd} = '';
4869            ## Reconsume.            ## Reconsume.
4870            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4871                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
4872                      line => $self->{line_prev},                      line => $self->{line_prev},
4873                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
4874                     });                     });
4875            redo A;            redo A;
4876          } else {          } else {
4877                        
4878            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
4879            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4880              $self->{s_kwd} = '';
4881            ## Reconsume.            ## Reconsume.
4882            redo A;            redo A;
4883          }          }
# Line 3911  sub _get_next_token ($) { Line 4886  sub _get_next_token ($) {
4886        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4887          # 0..9          # 0..9
4888                    
4889          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4890          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4891          ## Stay in the state.          ## Stay in the state.
4892                    
4893      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3929  sub _get_next_token ($) { Line 4904  sub _get_next_token ($) {
4904        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
4905                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
4906                    
4907          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4908          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
4909          ## Stay in the state.          ## Stay in the state.
4910                    
4911      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3947  sub _get_next_token ($) { Line 4922  sub _get_next_token ($) {
4922        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
4923                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
4924                    
4925          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4926          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
4927          ## Stay in the state.          ## Stay in the state.
4928                    
4929      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3985  sub _get_next_token ($) { Line 4960  sub _get_next_token ($) {
4960          #          #
4961        }        }
4962    
4963        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4964        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4965        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4966        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 4005  sub _get_next_token ($) { Line 4980  sub _get_next_token ($) {
4980        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4981                    
4982          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4983            $self->{s_kwd} = '';
4984          ## Reconsume.          ## Reconsume.
4985          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4986                      has_reference => 1,
4987                    line => $l, column => $c,                    line => $l, column => $c,
4988                   });                   });
4989          redo A;          redo A;
# Line 4015  sub _get_next_token ($) { Line 4992  sub _get_next_token ($) {
4992          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
4993          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
4994          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4995            $self->{s_kwd} = '';
4996          ## Reconsume.          ## Reconsume.
4997          redo A;          redo A;
4998        }        }
4999      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
5000        if (length $self->{s_kwd} < 30 and        if (length $self->{kwd} < 30 and
5001            ## NOTE: Some number greater than the maximum length of entity name            ## NOTE: Some number greater than the maximum length of entity name
5002            ((0x0041 <= $self->{nc} and # A            ((0x0041 <= $self->{nc} and # A
5003              $self->{nc} <= 0x005A) or # Z              $self->{nc} <= 0x005A) or # Z
# Line 4029  sub _get_next_token ($) { Line 5007  sub _get_next_token ($) {
5007              $self->{nc} <= 0x0039) or # 9              $self->{nc} <= 0x0039) or # 9
5008             $self->{nc} == 0x003B)) { # ;             $self->{nc} == 0x003B)) { # ;
5009          our $EntityChar;          our $EntityChar;
5010          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
5011          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}}) {
5012            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
5013                            
5014              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
5015              $self->{entity__match} = 1;              $self->{entity__match} = 1;
5016                            
5017      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4049  sub _get_next_token ($) { Line 5027  sub _get_next_token ($) {
5027              #              #
5028            } else {            } else {
5029                            
5030              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
5031              $self->{entity__match} = -1;              $self->{entity__match} = -1;
5032              ## Stay in the state.              ## Stay in the state.
5033                            
# Line 4097  sub _get_next_token ($) { Line 5075  sub _get_next_token ($) {
5075          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
5076              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
5077                        
5078            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
5079            #            #
5080          } else {          } else {
5081                        
# Line 4109  sub _get_next_token ($) { Line 5087  sub _get_next_token ($) {
5087                    
5088          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5089                          line => $self->{line_prev},                          line => $self->{line_prev},
5090                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
5091          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
5092          #          #
5093        }        }
5094        
# Line 4127  sub _get_next_token ($) { Line 5105  sub _get_next_token ($) {
5105        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
5106                    
5107          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
5108            $self->{s_kwd} = '';
5109          ## Reconsume.          ## Reconsume.
5110          return  ({type => CHARACTER_TOKEN,          return  ({type => CHARACTER_TOKEN,
5111                    data => $data,                    data => $data,
5112                      has_reference => $has_ref,
5113                    line => $self->{line_prev},                    line => $self->{line_prev},
5114                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
5115                   });                   });
5116          redo A;          redo A;
5117        } else {        } else {
# Line 4139  sub _get_next_token ($) { Line 5119  sub _get_next_token ($) {
5119          $self->{ca}->{value} .= $data;          $self->{ca}->{value} .= $data;
5120          $self->{ca}->{has_reference} = 1 if $has_ref;          $self->{ca}->{has_reference} = 1 if $has_ref;
5121          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
5122            $self->{s_kwd} = '';
5123            ## Reconsume.
5124            redo A;
5125          }
5126    
5127        ## XML-only states
5128    
5129        } elsif ($self->{state} == PI_STATE) {
5130          ## XML5: "Pi state" and "DOCTYPE pi state".
5131    
5132          if ($is_space->{$self->{nc}} or
5133              $self->{nc} == 0x003F or # ?
5134              $self->{nc} == -1) {
5135            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5136            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
5137            ## "DOCTYPE pi state": Parse error, switch to the "data
5138            ## state".
5139            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5140                            line => $self->{line_prev},
5141                            column => $self->{column_prev}
5142                                - 1 * ($self->{nc} != -1));
5143            $self->{state} = BOGUS_COMMENT_STATE;
5144            ## Reconsume.
5145            $self->{ct} = {type => COMMENT_TOKEN,
5146                           data => '?',
5147                           line => $self->{line_prev},
5148                           column => $self->{column_prev}
5149                               - 1 * ($self->{nc} != -1),
5150                          };
5151            redo A;
5152          } else {
5153            ## XML5: "DOCTYPE pi state": Stay in the state.
5154            $self->{ct} = {type => PI_TOKEN,
5155                           target => chr $self->{nc},
5156                           data => '',
5157                           line => $self->{line_prev},
5158                           column => $self->{column_prev} - 1,
5159                          };
5160            $self->{state} = PI_TARGET_STATE;
5161            
5162        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5163          $self->{line_prev} = $self->{line};
5164          $self->{column_prev} = $self->{column};
5165          $self->{column}++;
5166          $self->{nc}
5167              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5168        } else {
5169          $self->{set_nc}->($self);
5170        }
5171      
5172            redo A;
5173          }
5174        } elsif ($self->{state} == PI_TARGET_STATE) {
5175          if ($is_space->{$self->{nc}}) {
5176            $self->{state} = PI_TARGET_AFTER_STATE;
5177            
5178        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5179          $self->{line_prev} = $self->{line};
5180          $self->{column_prev} = $self->{column};
5181          $self->{column}++;
5182          $self->{nc}
5183              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5184        } else {
5185          $self->{set_nc}->($self);
5186        }
5187      
5188            redo A;
5189          } elsif ($self->{nc} == -1) {
5190            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5191            if ($self->{in_subset}) {
5192              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5193            } else {
5194              $self->{state} = DATA_STATE;
5195              $self->{s_kwd} = '';
5196            }
5197            ## Reconsume.
5198            return  ($self->{ct}); # pi
5199            redo A;
5200          } elsif ($self->{nc} == 0x003F) { # ?
5201            $self->{state} = PI_AFTER_STATE;
5202            
5203        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5204          $self->{line_prev} = $self->{line};
5205          $self->{column_prev} = $self->{column};
5206          $self->{column}++;
5207          $self->{nc}
5208              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5209        } else {
5210          $self->{set_nc}->($self);
5211        }
5212      
5213            redo A;
5214          } else {
5215            ## XML5: typo ("tag name" -> "target")
5216            $self->{ct}->{target} .= chr $self->{nc}; # pi
5217            
5218        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5219          $self->{line_prev} = $self->{line};
5220          $self->{column_prev} = $self->{column};
5221          $self->{column}++;
5222          $self->{nc}
5223              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5224        } else {
5225          $self->{set_nc}->($self);
5226        }
5227      
5228            redo A;
5229          }
5230        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5231          if ($is_space->{$self->{nc}}) {
5232            ## Stay in the state.
5233            
5234        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5235          $self->{line_prev} = $self->{line};
5236          $self->{column_prev} = $self->{column};
5237          $self->{column}++;
5238          $self->{nc}
5239              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5240        } else {
5241          $self->{set_nc}->($self);
5242        }
5243      
5244            redo A;
5245          } else {
5246            $self->{state} = PI_DATA_STATE;
5247            ## Reprocess.
5248            redo A;
5249          }
5250        } elsif ($self->{state} == PI_DATA_STATE) {
5251          if ($self->{nc} == 0x003F) { # ?
5252            $self->{state} = PI_DATA_AFTER_STATE;
5253            
5254        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5255          $self->{line_prev} = $self->{line};
5256          $self->{column_prev} = $self->{column};
5257          $self->{column}++;
5258          $self->{nc}
5259              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5260        } else {
5261          $self->{set_nc}->($self);
5262        }
5263      
5264            redo A;
5265          } elsif ($self->{nc} == -1) {
5266            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5267            if ($self->{in_subset}) {
5268              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5269            } else {
5270              $self->{state} = DATA_STATE;
5271              $self->{s_kwd} = '';
5272            }
5273            ## Reprocess.
5274            return  ($self->{ct}); # pi
5275            redo A;
5276          } else {
5277            $self->{ct}->{data} .= chr $self->{nc}; # pi
5278            $self->{read_until}->($self->{ct}->{data}, q[?],
5279                                  length $self->{ct}->{data});
5280            ## Stay in the state.
5281            
5282        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5283          $self->{line_prev} = $self->{line};
5284          $self->{column_prev} = $self->{column};
5285          $self->{column}++;
5286          $self->{nc}
5287              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5288        } else {
5289          $self->{set_nc}->($self);
5290        }
5291      
5292            ## NOTE: The next character has already been consumed above.
5293            redo A;
5294          }
5295        } elsif ($self->{state} == PI_AFTER_STATE) {
5296          ## XML5: Part of "Pi after state".
5297    
5298          if ($self->{nc} == 0x003E) { # >
5299            if ($self->{in_subset}) {
5300              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5301            } else {
5302              $self->{state} = DATA_STATE;
5303              $self->{s_kwd} = '';
5304            }
5305            
5306        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5307          $self->{line_prev} = $self->{line};
5308          $self->{column_prev} = $self->{column};
5309          $self->{column}++;
5310          $self->{nc}
5311              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5312        } else {
5313          $self->{set_nc}->($self);
5314        }
5315      
5316            return  ($self->{ct}); # pi
5317            redo A;
5318          } elsif ($self->{nc} == 0x003F) { # ?
5319            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5320                            line => $self->{line_prev},
5321                            column => $self->{column_prev}); ## XML5: no error
5322            $self->{ct}->{data} .= '?';
5323            $self->{state} = PI_DATA_AFTER_STATE;
5324            
5325        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5326          $self->{line_prev} = $self->{line};
5327          $self->{column_prev} = $self->{column};
5328          $self->{column}++;
5329          $self->{nc}
5330              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5331        } else {
5332          $self->{set_nc}->($self);
5333        }
5334      
5335            redo A;
5336          } else {
5337            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5338                            line => $self->{line_prev},
5339                            column => $self->{column_prev}
5340                                + 1 * ($self->{nc} == -1)); ## XML5: no error
5341            $self->{ct}->{data} .= '?'; ## XML5: not appended
5342            $self->{state} = PI_DATA_STATE;
5343            ## Reprocess.
5344            redo A;
5345          }
5346        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5347          ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5348    
5349          if ($self->{nc} == 0x003E) { # >
5350            if ($self->{in_subset}) {
5351              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5352            } else {
5353              $self->{state} = DATA_STATE;
5354              $self->{s_kwd} = '';
5355            }
5356            
5357        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5358          $self->{line_prev} = $self->{line};
5359          $self->{column_prev} = $self->{column};
5360          $self->{column}++;
5361          $self->{nc}
5362              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5363        } else {
5364          $self->{set_nc}->($self);
5365        }
5366      
5367            return  ($self->{ct}); # pi
5368            redo A;
5369          } elsif ($self->{nc} == 0x003F) { # ?
5370            $self->{ct}->{data} .= '?';
5371            ## Stay in the state.
5372            
5373        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5374          $self->{line_prev} = $self->{line};
5375          $self->{column_prev} = $self->{column};
5376          $self->{column}++;
5377          $self->{nc}
5378              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5379        } else {
5380          $self->{set_nc}->($self);
5381        }
5382      
5383            redo A;
5384          } else {
5385            $self->{ct}->{data} .= '?'; ## XML5: not appended
5386            $self->{state} = PI_DATA_STATE;
5387            ## Reprocess.
5388            redo A;
5389          }
5390    
5391        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5392          if ($self->{nc} == 0x003C) { # <
5393            $self->{state} = DOCTYPE_TAG_STATE;
5394            
5395        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5396          $self->{line_prev} = $self->{line};
5397          $self->{column_prev} = $self->{column};
5398          $self->{column}++;
5399          $self->{nc}
5400              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5401        } else {
5402          $self->{set_nc}->($self);
5403        }
5404      
5405            redo A;
5406          } elsif ($self->{nc} == 0x0025) { # %
5407            ## XML5: Not defined yet.
5408    
5409            ## TODO:
5410            
5411        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5412          $self->{line_prev} = $self->{line};
5413          $self->{column_prev} = $self->{column};
5414          $self->{column}++;
5415          $self->{nc}
5416              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5417        } else {
5418          $self->{set_nc}->($self);
5419        }
5420      
5421            redo A;
5422          } elsif ($self->{nc} == 0x005D) { # ]
5423            delete $self->{in_subset};
5424            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5425            
5426        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5427          $self->{line_prev} = $self->{line};
5428          $self->{column_prev} = $self->{column};
5429          $self->{column}++;
5430          $self->{nc}
5431              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5432        } else {
5433          $self->{set_nc}->($self);
5434        }
5435      
5436            redo A;
5437          } elsif ($is_space->{$self->{nc}}) {
5438            ## Stay in the state.
5439            
5440        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5441          $self->{line_prev} = $self->{line};
5442          $self->{column_prev} = $self->{column};
5443          $self->{column}++;
5444          $self->{nc}
5445              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5446        } else {
5447          $self->{set_nc}->($self);
5448        }
5449      
5450            redo A;
5451          } elsif ($self->{nc} == -1) {
5452            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5453            delete $self->{in_subset};
5454            $self->{state} = DATA_STATE;
5455            $self->{s_kwd} = '';
5456            ## Reconsume.
5457            return  ({type => END_OF_DOCTYPE_TOKEN});
5458            redo A;
5459          } else {
5460            unless ($self->{internal_subset_tainted}) {
5461              ## XML5: No parse error.
5462              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5463              $self->{internal_subset_tainted} = 1;
5464            }
5465            ## Stay in the state.
5466            
5467        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5468          $self->{line_prev} = $self->{line};
5469          $self->{column_prev} = $self->{column};
5470          $self->{column}++;
5471          $self->{nc}
5472              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5473        } else {
5474          $self->{set_nc}->($self);
5475        }
5476      
5477            redo A;
5478          }
5479        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5480          if ($self->{nc} == 0x003E) { # >
5481            $self->{state} = DATA_STATE;
5482            $self->{s_kwd} = '';
5483            
5484        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5485          $self->{line_prev} = $self->{line};
5486          $self->{column_prev} = $self->{column};
5487          $self->{column}++;
5488          $self->{nc}
5489              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5490        } else {
5491          $self->{set_nc}->($self);
5492        }
5493      
5494            return  ({type => END_OF_DOCTYPE_TOKEN});
5495            redo A;
5496          } elsif ($self->{nc} == -1) {
5497            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5498            $self->{state} = DATA_STATE;
5499            $self->{s_kwd} = '';
5500            ## Reconsume.
5501            return  ({type => END_OF_DOCTYPE_TOKEN});
5502            redo A;
5503          } else {
5504            ## XML5: No parse error and stay in the state.
5505            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5506    
5507            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5508            
5509        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5510          $self->{line_prev} = $self->{line};
5511          $self->{column_prev} = $self->{column};
5512          $self->{column}++;
5513          $self->{nc}
5514              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5515        } else {
5516          $self->{set_nc}->($self);
5517        }
5518      
5519            redo A;
5520          }
5521        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5522          if ($self->{nc} == 0x003E) { # >
5523            $self->{state} = DATA_STATE;
5524            $self->{s_kwd} = '';
5525            
5526        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5527          $self->{line_prev} = $self->{line};
5528          $self->{column_prev} = $self->{column};
5529          $self->{column}++;
5530          $self->{nc}
5531              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5532        } else {
5533          $self->{set_nc}->($self);
5534        }
5535      
5536            return  ({type => END_OF_DOCTYPE_TOKEN});
5537            redo A;
5538          } elsif ($self->{nc} == -1) {
5539            $self->{state} = DATA_STATE;
5540            $self->{s_kwd} = '';
5541            ## Reconsume.
5542            return  ({type => END_OF_DOCTYPE_TOKEN});
5543            redo A;
5544          } else {
5545            ## Stay in the state.
5546            
5547        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5548          $self->{line_prev} = $self->{line};
5549          $self->{column_prev} = $self->{column};
5550          $self->{column}++;
5551          $self->{nc}
5552              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5553        } else {
5554          $self->{set_nc}->($self);
5555        }
5556      
5557            redo A;
5558          }
5559        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5560          if ($self->{nc} == 0x0021) { # !
5561            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5562            
5563        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5564          $self->{line_prev} = $self->{line};
5565          $self->{column_prev} = $self->{column};
5566          $self->{column}++;
5567          $self->{nc}
5568              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5569        } else {
5570          $self->{set_nc}->($self);
5571        }
5572      
5573            redo A;
5574          } elsif ($self->{nc} == 0x003F) { # ?
5575            $self->{state} = PI_STATE;
5576            
5577        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5578          $self->{line_prev} = $self->{line};
5579          $self->{column_prev} = $self->{column};
5580          $self->{column}++;
5581          $self->{nc}
5582              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5583        } else {
5584          $self->{set_nc}->($self);
5585        }
5586      
5587            redo A;
5588          } elsif ($self->{nc} == -1) {
5589            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5590            $self->{state} = DATA_STATE;
5591            $self->{s_kwd} = '';
5592            ## Reconsume.
5593            redo A;
5594          } else {
5595            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5596                            line => $self->{line_prev},
5597                            column => $self->{column_prev});
5598            $self->{state} = BOGUS_COMMENT_STATE;
5599            $self->{ct} = {type => COMMENT_TOKEN,
5600                           data => '',
5601                          }; ## NOTE: Will be discarded.
5602            
5603        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5604          $self->{line_prev} = $self->{line};
5605          $self->{column_prev} = $self->{column};
5606          $self->{column}++;
5607          $self->{nc}
5608              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5609        } else {
5610          $self->{set_nc}->($self);
5611        }
5612      
5613            redo A;
5614          }
5615        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5616          ## XML5: "DOCTYPE markup declaration state".
5617          
5618          if ($self->{nc} == 0x002D) { # -
5619            $self->{state} = MD_HYPHEN_STATE;
5620            
5621        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5622          $self->{line_prev} = $self->{line};
5623          $self->{column_prev} = $self->{column};
5624          $self->{column}++;
5625          $self->{nc}
5626              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5627        } else {
5628          $self->{set_nc}->($self);
5629        }
5630      
5631            redo A;
5632          } elsif ($self->{nc} == 0x0045 or # E
5633                   $self->{nc} == 0x0065) { # e
5634            $self->{state} = MD_E_STATE;
5635            $self->{kwd} = chr $self->{nc};
5636            
5637        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5638          $self->{line_prev} = $self->{line};
5639          $self->{column_prev} = $self->{column};
5640          $self->{column}++;
5641          $self->{nc}
5642              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5643        } else {
5644          $self->{set_nc}->($self);
5645        }
5646      
5647            redo A;
5648          } elsif ($self->{nc} == 0x0041 or # A
5649                   $self->{nc} == 0x0061) { # a
5650            $self->{state} = MD_ATTLIST_STATE;
5651            $self->{kwd} = chr $self->{nc};
5652            
5653        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5654          $self->{line_prev} = $self->{line};
5655          $self->{column_prev} = $self->{column};
5656          $self->{column}++;
5657          $self->{nc}
5658              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5659        } else {
5660          $self->{set_nc}->($self);
5661        }
5662      
5663            redo A;
5664          } elsif ($self->{nc} == 0x004E or # N
5665                   $self->{nc} == 0x006E) { # n
5666            $self->{state} = MD_NOTATION_STATE;
5667            $self->{kwd} = chr $self->{nc};
5668            
5669        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5670          $self->{line_prev} = $self->{line};
5671          $self->{column_prev} = $self->{column};
5672          $self->{column}++;
5673          $self->{nc}
5674              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5675        } else {
5676          $self->{set_nc}->($self);
5677        }
5678      
5679            redo A;
5680          } else {
5681            #
5682          }
5683          
5684          ## XML5: No parse error.
5685          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5686                          line => $self->{line_prev},
5687                          column => $self->{column_prev} - 1);
5688          ## Reconsume.
5689          $self->{state} = BOGUS_COMMENT_STATE;
5690          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5691          redo A;
5692        } elsif ($self->{state} == MD_E_STATE) {
5693          if ($self->{nc} == 0x004E or # N
5694              $self->{nc} == 0x006E) { # n
5695            $self->{state} = MD_ENTITY_STATE;
5696            $self->{kwd} .= chr $self->{nc};
5697            
5698        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5699          $self->{line_prev} = $self->{line};
5700          $self->{column_prev} = $self->{column};
5701          $self->{column}++;
5702          $self->{nc}
5703              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5704        } else {
5705          $self->{set_nc}->($self);
5706        }
5707      
5708            redo A;
5709          } elsif ($self->{nc} == 0x004C or # L
5710                   $self->{nc} == 0x006C) { # l
5711            ## XML5: <!ELEMENT> not supported.
5712            $self->{state} = MD_ELEMENT_STATE;
5713            $self->{kwd} .= chr $self->{nc};
5714            
5715        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5716          $self->{line_prev} = $self->{line};
5717          $self->{column_prev} = $self->{column};
5718          $self->{column}++;
5719          $self->{nc}
5720              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5721        } else {
5722          $self->{set_nc}->($self);
5723        }
5724      
5725            redo A;
5726          } else {
5727            ## XML5: No parse error.
5728            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5729                            line => $self->{line_prev},
5730                            column => $self->{column_prev} - 2
5731                                + 1 * ($self->{nc} == -1));
5732            ## Reconsume.
5733            $self->{state} = BOGUS_COMMENT_STATE;
5734            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5735            redo A;
5736          }
5737        } elsif ($self->{state} == MD_ENTITY_STATE) {
              ## Matches the remaining letters of the "ENTITY" keyword, ASCII
              ## case-insensitively.  |$self->{kwd}| holds the letters accepted
              ## so far; its length selects the expected code point from the
              ## upper-case and the lower-case table below.
              ## NOTE(review): |undef| (and out-of-range) table slots are
              ## compared with |==| and therefore behave like 0 -- presumably
              ## U+0000 never reaches this state; confirm against the input
              ## preprocessing.
5738          if ($self->{nc} == [
5739                undef,
5740                undef,
5741                0x0054, # T
5742                0x0049, # I
5743                0x0054, # T
5744              ]->[length $self->{kwd}] or
5745              $self->{nc} == [
5746                undef,
5747                undef,
5748                0x0074, # t
5749                0x0069, # i
5750                0x0074, # t
5751              ]->[length $self->{kwd}]) {
5752            ## Stay in the state.
5753            $self->{kwd} .= chr $self->{nc};
5754            
                ## Consume the next input character: take it from the character
                ## buffer if any is left, otherwise delegate to |set_nc|.
5755        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5756          $self->{line_prev} = $self->{line};
5757          $self->{column_prev} = $self->{column};
5758          $self->{column}++;
5759          $self->{nc}
5760              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5761        } else {
5762          $self->{set_nc}->($self);
5763        }
5764      
5765            redo A;
5766          } elsif ((length $self->{kwd}) == 5 and
5767                   ($self->{nc} == 0x0059 or # Y
5768                    $self->{nc} == 0x0079)) { # y
                ## Final letter of the keyword.  Anything other than the exact
                ## upper-case spelling is reported, but still accepted.
5769            if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5770              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5771                              text => 'ENTITY',
5772                              line => $self->{line_prev},
5773                              column => $self->{column_prev} - 4);
5774            }
                ## Start a general entity token; a later state retypes it as a
                ## parameter entity token when "%" follows the keyword.
                ## NOTE(review): |column_prev - 6| is presumably the column of
                ## the "<" that opened the declaration.
5775            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5776                           line => $self->{line_prev},
5777                           column => $self->{column_prev} - 6};
5778            $self->{state} = DOCTYPE_MD_STATE;
5779            
5780        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5781          $self->{line_prev} = $self->{line};
5782          $self->{column_prev} = $self->{column};
5783          $self->{column}++;
5784          $self->{nc}
5785              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5786        } else {
5787          $self->{set_nc}->($self);
5788        }
5789      
5790            redo A;
5791          } else {
                ## Not the "ENTITY" keyword: report the error at a position
                ## computed back from the current one (one column further
                ## right at end of input) and reprocess the current character
                ## in the bogus comment state.
5792            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5793                            line => $self->{line_prev},
5794                            column => $self->{column_prev} - 1
5795                                - (length $self->{kwd})
5796                                + 1 * ($self->{nc} == -1));
5797            $self->{state} = BOGUS_COMMENT_STATE;
5798            ## Reconsume.
5799            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5800            redo A;
5801          }
5802        } elsif ($self->{state} == MD_ELEMENT_STATE) {
              ## Matches the remaining letters of the "ELEMENT" keyword, ASCII
              ## case-insensitively.  |$self->{kwd}| holds the letters accepted
              ## so far; its length selects the expected code point from the
              ## upper-case and the lower-case table below.
5803          if ($self->{nc} == [
5804               undef,
5805               undef,
5806               0x0045, # E
5807               0x004D, # M
5808               0x0045, # E
5809               0x004E, # N
5810              ]->[length $self->{kwd}] or
5811              $self->{nc} == [
5812               undef,
5813               undef,
5814               0x0065, # e
5815               0x006D, # m
5816               0x0065, # e
5817               0x006E, # n
5818              ]->[length $self->{kwd}]) {
5819            ## Stay in the state.
5820            $self->{kwd} .= chr $self->{nc};
5821            
                ## Consume the next input character: take it from the character
                ## buffer if any is left, otherwise delegate to |set_nc|.
5822        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5823          $self->{line_prev} = $self->{line};
5824          $self->{column_prev} = $self->{column};
5825          $self->{column}++;
5826          $self->{nc}
5827              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5828        } else {
5829          $self->{set_nc}->($self);
5830        }
5831      
5832            redo A;
5833          } elsif ((length $self->{kwd}) == 6 and
5834                   ($self->{nc} == 0x0054 or # T
5835                    $self->{nc} == 0x0074)) { # t
                ## Final letter of the keyword.  Anything other than the exact
                ## upper-case spelling is reported, but still accepted.
5836            if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5837              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5838                              text => 'ELEMENT',
5839                              line => $self->{line_prev},
5840                              column => $self->{column_prev} - 5);
5841            }
                ## The token position is the "<" that opens "<!ELEMENT".
                ## |column_prev| is the column of the letter before the final
                ## "T", i.e. seven columns after that "<".  (The offset used to
                ## be 6, copied from the six-letter "ENTITY" keyword, which
                ## made the token point at the "!".)
5842            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5843                           line => $self->{line_prev},
5844                           column => $self->{column_prev} - 7};
5845            $self->{state} = DOCTYPE_MD_STATE;
5846            
5847        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5848          $self->{line_prev} = $self->{line};
5849          $self->{column_prev} = $self->{column};
5850          $self->{column}++;
5851          $self->{nc}
5852              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5853        } else {
5854          $self->{set_nc}->($self);
5855        }
5856      
5857            redo A;
5858          } else {
                ## Not the "ELEMENT" keyword: report the error at a position
                ## computed back from the current one (one column further
                ## right at end of input) and reprocess the current character
                ## in the bogus comment state.
5859            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5860                            line => $self->{line_prev},
5861                            column => $self->{column_prev} - 1
5862                                - (length $self->{kwd})
5863                                + 1 * ($self->{nc} == -1));
5864            $self->{state} = BOGUS_COMMENT_STATE;
5865            ## Reconsume.
5866            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5867            redo A;
5868          }
5869        } elsif ($self->{state} == MD_ATTLIST_STATE) {
              ## Matches the remaining letters of the "ATTLIST" keyword, ASCII
              ## case-insensitively.  |$self->{kwd}| holds the letters accepted
              ## so far; its length selects the expected code point from the
              ## upper-case and the lower-case table below.
5870          if ($self->{nc} == [
5871               undef,
5872               0x0054, # T
5873               0x0054, # T
5874               0x004C, # L
5875               0x0049, # I
5876               0x0053, # S
5877              ]->[length $self->{kwd}] or
5878              $self->{nc} == [
5879               undef,
5880               0x0074, # t
5881               0x0074, # t
5882               0x006C, # l
5883               0x0069, # i
5884               0x0073, # s
5885              ]->[length $self->{kwd}]) {
5886            ## Stay in the state.
5887            $self->{kwd} .= chr $self->{nc};
5888            
                ## Consume the next input character: take it from the character
                ## buffer if any is left, otherwise delegate to |set_nc|.
5889        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5890          $self->{line_prev} = $self->{line};
5891          $self->{column_prev} = $self->{column};
5892          $self->{column}++;
5893          $self->{nc}
5894              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5895        } else {
5896          $self->{set_nc}->($self);
5897        }
5898      
5899            redo A;
5900          } elsif ((length $self->{kwd}) == 6 and
5901                   ($self->{nc} == 0x0054 or # T
5902                    $self->{nc} == 0x0074)) { # t
                ## Final letter of the keyword.  Anything other than the exact
                ## upper-case spelling is reported, but still accepted.
5903            if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
5904              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5905                              text => 'ATTLIST',
5906                              line => $self->{line_prev},
5907                              column => $self->{column_prev} - 5);
5908            }
                ## The token position is the "<" that opens "<!ATTLIST".
                ## |column_prev| is the column of the letter before the final
                ## "T", i.e. seven columns after that "<".  (The offset used to
                ## be 6, copied from the six-letter "ENTITY" keyword, which
                ## made the token point at the "!".)
                ## |attrdefs| starts out empty.
5909            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
5910                           attrdefs => [],
5911                           line => $self->{line_prev},
5912                           column => $self->{column_prev} - 7};
5913            $self->{state} = DOCTYPE_MD_STATE;
5914            
5915        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5916          $self->{line_prev} = $self->{line};
5917          $self->{column_prev} = $self->{column};
5918          $self->{column}++;
5919          $self->{nc}
5920              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5921        } else {
5922          $self->{set_nc}->($self);
5923        }
5924      
5925            redo A;
5926          } else {
                ## Not the "ATTLIST" keyword: report the error at a position
                ## computed back from the current one (one column further
                ## right at end of input) and reprocess the current character
                ## in the bogus comment state.
5927            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5928                            line => $self->{line_prev},
5929                            column => $self->{column_prev} - 1
5930                                 - (length $self->{kwd})
5931                                 + 1 * ($self->{nc} == -1));
5932            $self->{state} = BOGUS_COMMENT_STATE;
5933            ## Reconsume.
5934            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5935            redo A;
5936          }
5937        } elsif ($self->{state} == MD_NOTATION_STATE) {
              ## Matches the remaining letters of the "NOTATION" keyword, ASCII
              ## case-insensitively.  |$self->{kwd}| holds the letters accepted
              ## so far; its length selects the expected code point from the
              ## upper-case and the lower-case table below.
5938          if ($self->{nc} == [
5939               undef,
5940               0x004F, # O
5941               0x0054, # T
5942               0x0041, # A
5943               0x0054, # T
5944               0x0049, # I
5945               0x004F, # O
5946              ]->[length $self->{kwd}] or
5947              $self->{nc} == [
5948               undef,
5949               0x006F, # o
5950               0x0074, # t
5951               0x0061, # a
5952               0x0074, # t
5953               0x0069, # i
5954               0x006F, # o
5955              ]->[length $self->{kwd}]) {
5956            ## Stay in the state.
5957            $self->{kwd} .= chr $self->{nc};
5958            
                ## Consume the next input character: take it from the character
                ## buffer if any is left, otherwise delegate to |set_nc|.
5959        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5960          $self->{line_prev} = $self->{line};
5961          $self->{column_prev} = $self->{column};
5962          $self->{column}++;
5963          $self->{nc}
5964              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5965        } else {
5966          $self->{set_nc}->($self);
5967        }
5968      
5969            redo A;
5970          } elsif ((length $self->{kwd}) == 7 and
5971                   ($self->{nc} == 0x004E or # N
5972                    $self->{nc} == 0x006E)) { # n
                ## Final letter of the keyword.  Anything other than the exact
                ## upper-case spelling is reported, but still accepted.
5973            if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
5974              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5975                              text => 'NOTATION',
5976                              line => $self->{line_prev},
5977                              column => $self->{column_prev} - 6);
5978            }
                ## The token position is the "<" that opens "<!NOTATION".
                ## |column_prev| is the column of the letter before the final
                ## "N", i.e. eight columns after that "<".  (The offset used to
                ## be 6, copied from the six-letter "ENTITY" keyword, which
                ## made the token point at the first keyword letter.)
5979            $self->{ct} = {type => NOTATION_TOKEN, name => '',
5980                           line => $self->{line_prev},
5981                           column => $self->{column_prev} - 8};
5982            $self->{state} = DOCTYPE_MD_STATE;
5983            
5984        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5985          $self->{line_prev} = $self->{line};
5986          $self->{column_prev} = $self->{column};
5987          $self->{column}++;
5988          $self->{nc}
5989              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5990        } else {
5991          $self->{set_nc}->($self);
5992        }
5993      
5994            redo A;
5995          } else {
                ## Not the "NOTATION" keyword: report the error at a position
                ## computed back from the current one (one column further
                ## right at end of input) and reprocess the current character
                ## in the bogus comment state.
5996            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5997                            line => $self->{line_prev},
5998                            column => $self->{column_prev} - 1
5999                                - (length $self->{kwd})
6000                                + 1 * ($self->{nc} == -1));
6001            $self->{state} = BOGUS_COMMENT_STATE;
6002            ## Reconsume.
6003            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6004            redo A;
6005          }
6006        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
              ## Handles the character that immediately follows a markup
              ## declaration keyword; |$self->{ct}| is the declaration token
              ## created by the keyword state.
6007          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6008          ## "DOCTYPE NOTATION state".
6009    
6010          if ($is_space->{$self->{nc}}) {
6011            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6012            $self->{state} = BEFORE_MD_NAME_STATE;
6013            
                ## Consume the next input character: take it from the character
                ## buffer if any is left, otherwise delegate to |set_nc|.
6014        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6015          $self->{line_prev} = $self->{line};
6016          $self->{column_prev} = $self->{column};
6017          $self->{column}++;
6018          $self->{nc}
6019              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6020        } else {
6021          $self->{set_nc}->($self);
6022        }
6023      
6024            redo A;
6025          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6026                   $self->{nc} == 0x0025) { # %
                ## "%" directly after the ENTITY keyword: reported, then
                ## handled like a "%" that follows white space.
6027            ## XML5: Switch to the "DOCTYPE bogus comment state".
6028            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6029            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6030            
6031        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6032          $self->{line_prev} = $self->{line};
6033          $self->{column_prev} = $self->{column};
6034          $self->{column}++;
6035          $self->{nc}
6036              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6037        } else {
6038          $self->{set_nc}->($self);
6039        }
6040      
6041            redo A;
6042          } elsif ($self->{nc} == -1) {
                ## End of input: the declaration token is not returned from
                ## this branch.
6043            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6044            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6045            ## Reconsume.
6046            redo A;
6047          } elsif ($self->{nc} == 0x003E) { # >
                ## Declaration closed before any name was seen; the token is
                ## not returned from this branch either.
6048            ## XML5: Switch to the "DOCTYPE bogus comment state".
6049            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6050            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6051            
6052        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6053          $self->{line_prev} = $self->{line};
6054          $self->{column_prev} = $self->{column};
6055          $self->{column}++;
6056          $self->{nc}
6057              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6058        } else {
6059          $self->{set_nc}->($self);
6060        }
6061      
6062            redo A;
6063          } else {
                ## Any other character: report the missing white space and
                ## reprocess the same character in the before-name state.
6064            ## XML5: Switch to the "DOCTYPE bogus comment state".
6065            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6066            $self->{state} = BEFORE_MD_NAME_STATE;
6067            redo A;
6068          }
6069        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
              ## Skips white space in front of the declaration name and starts
              ## collecting the name in |$self->{ct}->{name}|.
6070          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6071          ## before state", "DOCTYPE ATTLIST name before state".
6072    
6073          if ($is_space->{$self->{nc}}) {
6074            ## Stay in the state.
6075            
                ## Consume the next input character: take it from the character
                ## buffer if any is left, otherwise delegate to |set_nc|.
6076        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6077          $self->{line_prev} = $self->{line};
6078          $self->{column_prev} = $self->{column};
6079          $self->{column}++;
6080          $self->{nc}
6081              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6082        } else {
6083          $self->{set_nc}->($self);
6084        }
6085      
6086            redo A;
6087          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6088                   $self->{nc} == 0x0025) { # %
                ## "%" in an ENTITY declaration.  Once the token has been
                ## retyped as a parameter entity token this test no longer
                ## matches, so a second "%" falls through to the name branch.
6089            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6090            
6091        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6092          $self->{line_prev} = $self->{line};
6093          $self->{column_prev} = $self->{column};
6094          $self->{column}++;
6095          $self->{nc}
6096              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6097        } else {
6098          $self->{set_nc}->($self);
6099        }
6100      
6101            redo A;
6102          } elsif ($self->{nc} == 0x003E) { # >
                ## Declaration closed without a name; the token is not
                ## returned from this branch.
6103            ## XML5: Same as "Anything else".
6104            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6105            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6106            
6107        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6108          $self->{line_prev} = $self->{line};
6109          $self->{column_prev} = $self->{column};
6110          $self->{column}++;
6111          $self->{nc}
6112              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6113        } else {
6114          $self->{set_nc}->($self);
6115        }
6116      
6117            redo A;
6118          } elsif ($self->{nc} == -1) {
                ## End of input; the token is not returned from this branch.
6119            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6120            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6121            ## Reconsume.
6122            redo A;
6123          } else {
                ## First character of the name.  No name-character check is
                ## made here.
6124            ## XML5: [ATTLIST] Not defined yet.
6125            $self->{ct}->{name} .= chr $self->{nc};
6126            $self->{state} = MD_NAME_STATE;
6127            
6128        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6129          $self->{line_prev} = $self->{line};
6130          $self->{column_prev} = $self->{column};
6131          $self->{column}++;
6132          $self->{nc}
6133              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6134        } else {
6135          $self->{set_nc}->($self);
6136        }
6137      
6138            redo A;
6139          }
6140        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
              ## Handles the character after the "%" of an ENTITY declaration.
              ## Only white space turns the token into a parameter entity
              ## token.
6141          if ($is_space->{$self->{nc}}) {
6142            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6143            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6144            $self->{state} = BEFORE_MD_NAME_STATE;
6145            
                ## Consume the next input character: take it from the character
                ## buffer if any is left, otherwise delegate to |set_nc|.
6146        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6147          $self->{line_prev} = $self->{line};
6148          $self->{column_prev} = $self->{column};
6149          $self->{column}++;
6150          $self->{nc}
6151              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6152        } else {
6153          $self->{set_nc}->($self);
6154        }
6155      
6156            redo A;
6157          } elsif ($self->{nc} == 0x003E) { # >
                ## Declaration closed without a name; the token is not
                ## returned from this branch.
6158            ## XML5: Same as "Anything else".
6159            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6160            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6161            
6162        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6163          $self->{line_prev} = $self->{line};
6164          $self->{column_prev} = $self->{column};
6165          $self->{column}++;
6166          $self->{nc}
6167              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6168        } else {
6169          $self->{set_nc}->($self);
6170        }
6171      
6172            redo A;
6173          } elsif ($self->{nc} == -1) {
                ## End of input; the token is not returned from this branch.
6174            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6175            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6176            ## Reconsume.
6177            redo A;
6178          } else {
                ## "%" not followed by white space: the declaration token is
                ## replaced by a comment token and the current character is
                ## reprocessed in the bogus comment state.
6179            ## XML5: No parse error.
6180            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6181            $self->{state} = BOGUS_COMMENT_STATE;
6182            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6183            ## Reconsume.
6184            redo A;
6185          }
6186        } elsif ($self->{state} == MD_NAME_STATE) {
              ## Collects the declaration name into |$self->{ct}->{name}| until
              ## white space, ">" or end of input.
6187          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6188          
6189          if ($is_space->{$self->{nc}}) {
                ## The name is complete; the follow-up state depends on the
                ## kind of declaration.
6190            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6191              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6192            } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6193              $self->{state} = AFTER_ELEMENT_NAME_STATE;
6194            } else { # ENTITY/NOTATION
6195              $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6196            }
6197            
                ## Consume the next input character: take it from the character
                ## buffer if any is left, otherwise delegate to |set_nc|.
6198        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6199          $self->{line_prev} = $self->{line};
6200          $self->{column_prev} = $self->{column};
6201          $self->{column}++;
6202          $self->{nc}
6203              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6204        } else {
6205          $self->{set_nc}->($self);
6206        }
6207      
6208            redo A;
6209          } elsif ($self->{nc} == 0x003E) { # >
                ## An ATTLIST declaration may end right after the name; for
                ## the other declarations a missing definition is an error.
                ## The token is returned in both cases.
6210            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6211              #
6212            } else {
6213              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6214            }
6215            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6216            
6217        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6218          $self->{line_prev} = $self->{line};
6219          $self->{column_prev} = $self->{column};
6220          $self->{column}++;
6221          $self->{nc}
6222              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6223        } else {
6224          $self->{set_nc}->($self);
6225        }
6226      
                ## NOTE: The |redo A| after |return| is never reached.
6227            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6228            redo A;
6229          } elsif ($self->{nc} == -1) {
                ## End of input: the token collected so far is still returned;
                ## the end of input itself is left for the next state.
6230            ## XML5: [ATTLIST] No parse error.
6231            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6232            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6233            ## Reconsume.
6234            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6235            redo A;
6236          } else {
                ## Any other character is appended to the name unchecked.
6237            ## XML5: [ATTLIST] Not defined yet.
6238            $self->{ct}->{name} .= chr $self->{nc};
6239            ## Stay in the state.
6240            
6241        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6242          $self->{line_prev} = $self->{line};
6243          $self->{column_prev} = $self->{column};
6244          $self->{column}++;
6245          $self->{nc}
6246              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6247        } else {
6248          $self->{set_nc}->($self);
6249        }
6250      
6251            redo A;
6252          }
6253        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
              ## After the element name of an ATTLIST declaration: skips white
              ## space, then either ends the declaration or starts a new
              ## attribute definition in |$self->{ca}|.
6254          if ($is_space->{$self->{nc}}) {
6255            ## Stay in the state.
6256            
                ## Consume the next input character: take it from the character
                ## buffer if any is left, otherwise delegate to |set_nc|.
6257        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6258          $self->{line_prev} = $self->{line};
6259          $self->{column_prev} = $self->{column};
6260          $self->{column}++;
6261          $self->{nc}
6262              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6263        } else {
6264          $self->{set_nc}->($self);
6265        }
6266      
6267            redo A;
6268          } elsif ($self->{nc} == 0x003E) { # >
                ## End of the declaration: return the ATTLIST token.
                ## NOTE: The |redo A| after |return| is never reached.
6269            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6270            
6271        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6272          $self->{line_prev} = $self->{line};
6273          $self->{column_prev} = $self->{column};
6274          $self->{column}++;
6275          $self->{nc}
6276              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6277        } else {
6278          $self->{set_nc}->($self);
6279        }
6280      
6281            return  ($self->{ct}); # ATTLIST
6282            redo A;
6283          } elsif ($self->{nc} == -1) {
                ## End of input: the token is returned without consuming
                ## anything further.
6284            ## XML5: No parse error.
6285            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6286            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6287            return  ($self->{ct});
6288            redo A;
6289          } else {
                ## First character of an attribute name: start a new attribute
                ## definition positioned at the current line and column.
6290            ## XML5: Not defined yet.
6291            $self->{ca} = {name => chr ($self->{nc}), # attrdef
6292                           tokens => [],
6293                           line => $self->{line}, column => $self->{column}};
6294            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6295            
6296        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6297          $self->{line_prev} = $self->{line};
6298          $self->{column_prev} = $self->{column};
6299          $self->{column}++;
6300          $self->{nc}
6301              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6302        } else {
6303          $self->{set_nc}->($self);
6304        }
6305      
6306            redo A;
6307          }
6308        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
              ## Collects the attribute name of the current attribute
              ## definition (|$self->{ca}->{name}|) until white space, ">",
              ## "(" or end of input.
6309          if ($is_space->{$self->{nc}}) {
6310            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6311            
                ## Consume the next input character: take it from the character
                ## buffer if any is left, otherwise delegate to |set_nc|.
6312        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6313          $self->{line_prev} = $self->{line};
6314          $self->{column_prev} = $self->{column};
6315          $self->{column}++;
6316          $self->{nc}
6317              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6318        } else {
6319          $self->{set_nc}->($self);
6320        }
6321      
6322            redo A;
6323          } elsif ($self->{nc} == 0x003E) { # >
                ## Declaration closed before an attribute type was given: the
                ## ATTLIST token is returned as collected so far.
                ## NOTE: The |redo A| after |return| is never reached.
6324            ## XML5: Same as "anything else".
6325            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6326            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6327            
6328        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6329          $self->{line_prev} = $self->{line};
6330          $self->{column_prev} = $self->{column};
6331          $self->{column}++;
6332          $self->{nc}
6333              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6334        } else {
6335          $self->{set_nc}->($self);
6336        }
6337      
6338            return  ($self->{ct}); # ATTLIST
6339            redo A;
6340          } elsif ($self->{nc} == 0x0028) { # (
                ## "(" directly after the attribute name: reported, then
                ## handled as the start of an allowed-token list.
6341            ## XML5: Same as "anything else".
6342            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6343            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6344            
6345        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6346          $self->{line_prev} = $self->{line};
6347          $self->{column_prev} = $self->{column};
6348          $self->{column}++;
6349          $self->{nc}
6350              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6351        } else {
6352          $self->{set_nc}->($self);
6353        }
6354      
6355            redo A;
6356          } elsif ($self->{nc} == -1) {
6357            ## XML5: No parse error.
6358            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6359            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
                ## Reconsume.  Nothing can be read past the end of input, so
                ## the end-of-input marker is left in place for the next
                ## state, as in the other end-of-input branches of the markup
                ## declaration states.
6371            return  ($self->{ct}); # ATTLIST
6372            redo A;
6373          } else {
                ## Any other character is appended to the attribute name
                ## unchecked.
6374            ## XML5: Not defined yet.
6375            $self->{ca}->{name} .= chr $self->{nc};
6376            ## Stay in the state.
6377            
6378        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6379          $self->{line_prev} = $self->{line};
6380          $self->{column_prev} = $self->{column};
6381          $self->{column}++;
6382          $self->{nc}
6383              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6384        } else {
6385          $self->{set_nc}->($self);
6386        }
6387      
6388            redo A;
6389          }
6390        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
              ## After an attribute name: skips white space, then expects the
              ## attribute type, either a "(" token list or a type keyword
              ## collected in |$self->{ca}->{type}|.
6391          if ($is_space->{$self->{nc}}) {
6392            ## Stay in the state.
6393            
                ## Consume the next input character: take it from the character
                ## buffer if any is left, otherwise delegate to |set_nc|.
6394        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6395          $self->{line_prev} = $self->{line};
6396          $self->{column_prev} = $self->{column};
6397          $self->{column}++;
6398          $self->{nc}
6399              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6400        } else {
6401          $self->{set_nc}->($self);
6402        }
6403      
6404            redo A;
6405          } elsif ($self->{nc} == 0x003E) { # >
                ## Declaration closed before an attribute type was given: the
                ## ATTLIST token is returned as collected so far.
                ## NOTE: The |redo A| after |return| is never reached.
6406            ## XML5: Same as "anything else".
6407            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6408            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6409            
6410        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6411          $self->{line_prev} = $self->{line};
6412          $self->{column_prev} = $self->{column};
6413          $self->{column}++;
6414          $self->{nc}
6415              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6416        } else {
6417          $self->{set_nc}->($self);
6418        }
6419      
6420            return  ($self->{ct}); # ATTLIST
6421            redo A;
6422          } elsif ($self->{nc} == 0x0028) { # (
                ## Start of an allowed-token list.
6423            ## XML5: Same as "anything else".
6424            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6425            
6426        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6427          $self->{line_prev} = $self->{line};
6428          $self->{column_prev} = $self->{column};
6429          $self->{column}++;
6430          $self->{nc}
6431              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6432        } else {
6433          $self->{set_nc}->($self);
6434        }
6435      
6436            redo A;
6437          } elsif ($self->{nc} == -1) {
6438            ## XML5: No parse error.
6439            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6440            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
                ## Reconsume.  Nothing can be read past the end of input, so
                ## the end-of-input marker is left in place for the next
                ## state, as in the other end-of-input branches of the markup
                ## declaration states.
6452            return  ($self->{ct}); # ATTLIST
6453            redo A;
6454          } else {
                ## First character of the attribute type keyword.
6455            ## XML5: Not defined yet.
6456            $self->{ca}->{type} = chr $self->{nc};
6457            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6458            
6459        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6460          $self->{line_prev} = $self->{line};
6461          $self->{column_prev} = $self->{column};
6462          $self->{column}++;
6463          $self->{nc}
6464              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6465        } else {
6466          $self->{set_nc}->($self);
6467        }
6468      
6469            redo A;
6470          }
6471        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6472          if ($is_space->{$self->{nc}}) {
6473            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6474            
6475        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6476          $self->{line_prev} = $self->{line};
6477          $self->{column_prev} = $self->{column};
6478          $self->{column}++;
6479          $self->{nc}
6480              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6481        } else {
6482          $self->{set_nc}->($self);
6483        }
6484      
6485            redo A;
6486          } elsif ($self->{nc} == 0x0023) { # #
6487            ## XML5: Same as "anything else".
6488            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6489            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6490            
6491        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6492          $self->{line_prev} = $self->{line};
6493          $self->{column_prev} = $self->{column};
6494          $self->{column}++;
6495          $self->{nc}
6496              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6497        } else {
6498          $self->{set_nc}->($self);
6499        }
6500      
6501            redo A;
6502          } elsif ($self->{nc} == 0x0022) { # "
6503            ## XML5: Same as "anything else".
6504            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6505            $self->{ca}->{value} = '';
6506            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6507            
6508        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6509          $self->{line_prev} = $self->{line};
6510          $self->{column_prev} = $self->{column};
6511          $self->{column}++;
6512          $self->{nc}
6513              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6514        } else {
6515          $self->{set_nc}->($self);
6516        }
6517      
6518            redo A;
6519          } elsif ($self->{nc} == 0x0027) { # '
6520            ## XML5: Same as "anything else".
6521            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6522            $self->{ca}->{value} = '';
6523            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6524            
6525        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6526          $self->{line_prev} = $self->{line};
6527          $self->{column_prev} = $self->{column};
6528          $self->{column}++;
6529          $self->{nc}
6530              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6531        } else {
6532          $self->{set_nc}->($self);
6533        }
6534      
6535            redo A;
6536          } elsif ($self->{nc} == 0x003E) { # >
6537            ## XML5: Same as "anything else".
6538            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6539            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6540            
6541        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6542          $self->{line_prev} = $self->{line};
6543          $self->{column_prev} = $self->{column};
6544          $self->{column}++;
6545          $self->{nc}
6546              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6547        } else {
6548          $self->{set_nc}->($self);
6549        }
6550      
6551            return  ($self->{ct}); # ATTLIST
6552            redo A;
6553          } elsif ($self->{nc} == 0x0028) { # (
6554            ## XML5: Same as "anything else".
6555            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6556            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6557            
6558        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6559          $self->{line_prev} = $self->{line};
6560          $self->{column_prev} = $self->{column};
6561          $self->{column}++;
6562          $self->{nc}
6563              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6564        } else {
6565          $self->{set_nc}->($self);
6566        }
6567      
6568            redo A;
6569          } elsif ($self->{nc} == -1) {
6570            ## XML5: No parse error.
6571            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6572            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6573            
6574        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6575          $self->{line_prev} = $self->{line};
6576          $self->{column_prev} = $self->{column};
6577          $self->{column}++;
6578          $self->{nc}
6579              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6580        } else {
6581          $self->{set_nc}->($self);
6582        }
6583      
6584            return  ($self->{ct});
6585            redo A;
6586          } else {
6587            ## XML5: Not defined yet.
6588            $self->{ca}->{type} .= chr $self->{nc};
6589            ## Stay in the state.
6590            
6591        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6592          $self->{line_prev} = $self->{line};
6593          $self->{column_prev} = $self->{column};
6594          $self->{column}++;
6595          $self->{nc}
6596              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6597        } else {
6598          $self->{set_nc}->($self);
6599        }
6600      
6601            redo A;
6602          }
6603        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6604          if ($is_space->{$self->{nc}}) {
6605            ## Stay in the state.
6606            
6607        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6608          $self->{line_prev} = $self->{line};
6609          $self->{column_prev} = $self->{column};
6610          $self->{column}++;
6611          $self->{nc}
6612              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6613        } else {
6614          $self->{set_nc}->($self);
6615        }
6616      
6617            redo A;
6618          } elsif ($self->{nc} == 0x0028) { # (
6619            ## XML5: Same as "anything else".
6620            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6621            
6622        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6623          $self->{line_prev} = $self->{line};
6624          $self->{column_prev} = $self->{column};
6625          $self->{column}++;
6626          $self->{nc}
6627              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6628        } else {
6629          $self->{set_nc}->($self);
6630        }
6631      
6632            redo A;
6633          } elsif ($self->{nc} == 0x0023) { # #
6634            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6635            
6636        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6637          $self->{line_prev} = $self->{line};
6638          $self->{column_prev} = $self->{column};
6639          $self->{column}++;
6640          $self->{nc}
6641              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6642        } else {
6643          $self->{set_nc}->($self);
6644        }
6645      
6646            redo A;
6647          } elsif ($self->{nc} == 0x0022) { # "
6648            ## XML5: Same as "anything else".
6649            $self->{ca}->{value} = '';
6650            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6651            
6652        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6653          $self->{line_prev} = $self->{line};
6654          $self->{column_prev} = $self->{column};
6655          $self->{column}++;
6656          $self->{nc}
6657              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6658        } else {
6659          $self->{set_nc}->($self);
6660        }
6661      
6662            redo A;
6663          } elsif ($self->{nc} == 0x0027) { # '
6664            ## XML5: Same as "anything else".
6665            $self->{ca}->{value} = '';
6666            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6667            
6668        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6669          $self->{line_prev} = $self->{line};
6670          $self->{column_prev} = $self->{column};
6671          $self->{column}++;
6672          $self->{nc}
6673              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6674        } else {
6675          $self->{set_nc}->($self);
6676        }
6677      
6678            redo A;
6679          } elsif ($self->{nc} == 0x003E) { # >
6680            ## XML5: Same as "anything else".
6681            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6682            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6683            
6684        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6685          $self->{line_prev} = $self->{line};
6686          $self->{column_prev} = $self->{column};
6687          $self->{column}++;
6688          $self->{nc}
6689              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6690        } else {
6691          $self->{set_nc}->($self);
6692        }
6693      
6694            return  ($self->{ct}); # ATTLIST
6695            redo A;
6696          } elsif ($self->{nc} == -1) {
6697            ## XML5: No parse error.
6698            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6699            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6700            
6701        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6702          $self->{line_prev} = $self->{line};
6703          $self->{column_prev} = $self->{column};
6704          $self->{column}++;
6705          $self->{nc}
6706              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6707        } else {
6708          $self->{set_nc}->($self);
6709        }
6710      
6711            return  ($self->{ct});
6712            redo A;
6713          } else {
6714            ## XML5: Switch to the "DOCTYPE bogus comment state".
6715            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6716            $self->{ca}->{value} = '';
6717            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6718          ## Reconsume.
6719          redo A;
6720        }
6721        } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6722          if ($is_space->{$self->{nc}}) {
6723            ## Stay in the state.
6724            
6725        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6726          $self->{line_prev} = $self->{line};
6727          $self->{column_prev} = $self->{column};
6728          $self->{column}++;
6729          $self->{nc}
6730              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6731        } else {
6732          $self->{set_nc}->($self);
6733        }
6734      
6735            redo A;
6736          } elsif ($self->{nc} == 0x007C) { # |
6737            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6738            ## Stay in the state.
6739            
6740        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6741          $self->{line_prev} = $self->{line};
6742          $self->{column_prev} = $self->{column};
6743          $self->{column}++;
6744          $self->{nc}
6745              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6746        } else {
6747          $self->{set_nc}->($self);
6748        }
6749      
6750            redo A;
6751          } elsif ($self->{nc} == 0x0029) { # )
6752            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6753            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6754            
6755        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6756          $self->{line_prev} = $self->{line};
6757          $self->{column_prev} = $self->{column};
6758          $self->{column}++;
6759          $self->{nc}
6760              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6761        } else {
6762          $self->{set_nc}->($self);
6763        }
6764      
6765            redo A;
6766          } elsif ($self->{nc} == 0x003E) { # >
6767            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6768            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6769            
6770        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6771          $self->{line_prev} = $self->{line};
6772          $self->{column_prev} = $self->{column};
6773          $self->{column}++;
6774          $self->{nc}
6775              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6776        } else {
6777          $self->{set_nc}->($self);
6778        }
6779      
6780            return  ($self->{ct}); # ATTLIST
6781            redo A;
6782          } elsif ($self->{nc} == -1) {
6783            ## XML5: No parse error.
6784            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6785            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6786            
6787        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6788          $self->{line_prev} = $self->{line};
6789          $self->{column_prev} = $self->{column};
6790          $self->{column}++;
6791          $self->{nc}
6792              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6793        } else {
6794          $self->{set_nc}->($self);
6795        }
6796      
6797            return  ($self->{ct});
6798            redo A;
6799          } else {
6800            push @{$self->{ca}->{tokens}}, chr $self->{nc};
6801            $self->{state} = ALLOWED_TOKEN_STATE;
6802            
6803        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6804          $self->{line_prev} = $self->{line};
6805          $self->{column_prev} = $self->{column};
6806          $self->{column}++;
6807          $self->{nc}
6808              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6809        } else {
6810          $self->{set_nc}->($self);
6811        }
6812      
6813            redo A;
6814          }
6815        } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6816          if ($is_space->{$self->{nc}}) {
6817            $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6818            
6819        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6820          $self->{line_prev} = $self->{line};
6821          $self->{column_prev} = $self->{column};
6822          $self->{column}++;
6823          $self->{nc}
6824              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6825        } else {
6826          $self->{set_nc}->($self);
6827        }
6828      
6829            redo A;
6830          } elsif ($self->{nc} == 0x007C) { # |
6831            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6832            
6833        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6834          $self->{line_prev} = $self->{line};
6835          $self->{column_prev} = $self->{column};
6836          $self->{column}++;
6837          $self->{nc}
6838              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6839        } else {
6840          $self->{set_nc}->($self);
6841        }
6842      
6843            redo A;
6844          } elsif ($self->{nc} == 0x0029) { # )
6845            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6846            
6847        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6848          $self->{line_prev} = $self->{line};
6849          $self->{column_prev} = $self->{column};
6850          $self->{column}++;
6851          $self->{nc}
6852              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6853        } else {
6854          $self->{set_nc}->($self);
6855        }
6856      
6857            redo A;
6858          } elsif ($self->{nc} == 0x003E) { # >
6859            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6860            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6861            
6862        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6863          $self->{line_prev} = $self->{line};
6864          $self->{column_prev} = $self->{column};
6865          $self->{column}++;
6866          $self->{nc}
6867              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6868        } else {
6869          $self->{set_nc}->($self);
6870        }
6871      
6872            return  ($self->{ct}); # ATTLIST
6873            redo A;
6874          } elsif ($self->{nc} == -1) {
6875            ## XML5: No parse error.
6876            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6877            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6878            
6879        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6880          $self->{line_prev} = $self->{line};
6881          $self->{column_prev} = $self->{column};
6882          $self->{column}++;
6883          $self->{nc}
6884              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6885        } else {
6886          $self->{set_nc}->($self);
6887        }
6888      
6889            return  ($self->{ct});
6890            redo A;
6891          } else {
6892            $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6893            ## Stay in the state.
6894            
6895        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6896          $self->{line_prev} = $self->{line};
6897          $self->{column_prev} = $self->{column};
6898          $self->{column}++;
6899          $self->{nc}
6900              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6901        } else {
6902          $self->{set_nc}->($self);
6903        }
6904      
6905            redo A;
6906          }
6907        } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
6908          if ($is_space->{$self->{nc}}) {
6909            ## Stay in the state.
6910            
6911        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6912          $self->{line_prev} = $self->{line};
6913          $self->{column_prev} = $self->{column};
6914          $self->{column}++;
6915          $self->{nc}
6916              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6917        } else {
6918          $self->{set_nc}->($self);
6919        }
6920      
6921            redo A;
6922          } elsif ($self->{nc} == 0x007C) { # |
6923            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6924            
6925        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6926          $self->{line_prev} = $self->{line};
6927          $self->{column_prev} = $self->{column};
6928          $self->{column}++;
6929          $self->{nc}
6930              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6931        } else {
6932          $self->{set_nc}->($self);
6933        }
6934      
6935            redo A;
6936          } elsif ($self->{nc} == 0x0029) { # )
6937            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6938            
6939        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6940          $self->{line_prev} = $self->{line};
6941          $self->{column_prev} = $self->{column};
6942          $self->{column}++;
6943          $self->{nc}
6944              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6945        } else {
6946          $self->{set_nc}->($self);
6947        }
6948      
6949            redo A;
6950          } elsif ($self->{nc} == 0x003E) { # >
6951            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6952            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6953            
6954        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6955          $self->{line_prev} = $self->{line};
6956          $self->{column_prev} = $self->{column};
6957          $self->{column}++;
6958          $self->{nc}
6959              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6960        } else {
6961          $self->{set_nc}->($self);
6962        }
6963      
6964            return  ($self->{ct}); # ATTLIST
6965            redo A;
6966          } elsif ($self->{nc} == -1) {
6967            ## XML5: No parse error.
6968            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6969            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6970            
6971        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6972          $self->{line_prev} = $self->{line};
6973          $self->{column_prev} = $self->{column};
6974          $self->{column}++;
6975          $self->{nc}
6976              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6977        } else {
6978          $self->{set_nc}->($self);
6979        }
6980      
6981            return  ($self->{ct});
6982            redo A;
6983          } else {
6984            $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
6985                            line => $self->{line_prev},
6986                            column => $self->{column_prev});
6987            $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
6988            $self->{state} = ALLOWED_TOKEN_STATE;
6989            
6990        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6991          $self->{line_prev} = $self->{line};
6992          $self->{column_prev} = $self->{column};
6993          $self->{column}++;
6994          $self->{nc}
6995              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6996        } else {
6997          $self->{set_nc}->($self);
6998        }
6999      
7000            redo A;
7001          }
7002        } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7003          if ($is_space->{$self->{nc}}) {
7004            $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7005            
7006        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7007          $self->{line_prev} = $self->{line};
7008          $self->{column_prev} = $self->{column};
7009          $self->{column}++;
7010          $self->{nc}
7011              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7012        } else {
7013          $self->{set_nc}->($self);
7014        }
7015      
7016            redo A;
7017          } elsif ($self->{nc} == 0x0023) { # #
7018            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7019            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7020            
7021        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7022          $self->{line_prev} = $self->{line};
7023          $self->{column_prev} = $self->{column};
7024          $self->{column}++;
7025          $self->{nc}
7026              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7027        } else {
7028          $self->{set_nc}->($self);
7029        }
7030      
7031            redo A;
7032          } elsif ($self->{nc} == 0x0022) { # "
7033            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7034            $self->{ca}->{value} = '';
7035            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7036            
7037        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7038          $self->{line_prev} = $self->{line};
7039          $self->{column_prev} = $self->{column};
7040          $self->{column}++;
7041          $self->{nc}
7042              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7043        } else {
7044          $self->{set_nc}->($self);
7045        }
7046      
7047            redo A;
7048          } elsif ($self->{nc} == 0x0027) { # '
7049            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7050            $self->{ca}->{value} = '';
7051            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7052            
7053        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7054          $self->{line_prev} = $self->{line};
7055          $self->{column_prev} = $self->{column};
7056          $self->{column}++;
7057          $self->{nc}
7058              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7059        } else {
7060          $self->{set_nc}->($self);
7061        }
7062      
7063            redo A;
7064          } elsif ($self->{nc} == 0x003E) { # >
7065            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7066            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7067            
7068        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7069          $self->{line_prev} = $self->{line};
7070          $self->{column_prev} = $self->{column};
7071          $self->{column}++;
7072          $self->{nc}
7073              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7074        } else {
7075          $self->{set_nc}->($self);
7076        }
7077      
7078            return  ($self->{ct}); # ATTLIST
7079            redo A;
7080          } elsif ($self->{nc} == -1) {
7081            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7082            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7083            
7084        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7085          $self->{line_prev} = $self->{line};
7086          $self->{column_prev} = $self->{column};
7087          $self->{column}++;
7088          $self->{nc}
7089              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7090        } else {
7091          $self->{set_nc}->($self);
7092        }
7093      
7094            return  ($self->{ct});
7095            redo A;
7096          } else {
7097            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7098            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7099            ## Reconsume.
7100            redo A;
7101          }
7102        } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7103          if ($is_space->{$self->{nc}}) {
7104            ## Stay in the state.
7105            
7106        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7107          $self->{line_prev} = $self->{line};
7108          $self->{column_prev} = $self->{column};
7109          $self->{column}++;
7110          $self->{nc}
7111              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7112        } else {
7113          $self->{set_nc}->($self);
7114        }
7115      
7116            redo A;
7117          } elsif ($self->{nc} == 0x0023) { # #
7118            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7119            
7120        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7121          $self->{line_prev} = $self->{line};
7122          $self->{column_prev} = $self->{column};
7123          $self->{column}++;
7124          $self->{nc}
7125              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7126        } else {
7127          $self->{set_nc}->($self);
7128        }
7129      
7130            redo A;
7131          } elsif ($self->{nc} == 0x0022) { # "
7132            $self->{ca}->{value} = '';
7133            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7134            
7135        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7136          $self->{line_prev} = $self->{line};
7137          $self->{column_prev} = $self->{column};
7138          $self->{column}++;
7139          $self->{nc}
7140              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7141        } else {
7142          $self->{set_nc}->($self);
7143        }
7144      
7145            redo A;
7146          } elsif ($self->{nc} == 0x0027) { # '
7147            $self->{ca}->{value} = '';
7148            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7149            
7150        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7151          $self->{line_prev} = $self->{line};
7152          $self->{column_prev} = $self->{column};
7153          $self->{column}++;
7154          $self->{nc}
7155              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7156        } else {
7157          $self->{set_nc}->($self);
7158        }
7159      
7160            redo A;
7161          } elsif ($self->{nc} == 0x003E) { # >
7162            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7163            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7164            
7165        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7166          $self->{line_prev} = $self->{line};
7167          $self->{column_prev} = $self->{column};
7168          $self->{column}++;
7169          $self->{nc}
7170              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7171        } else {
7172          $self->{set_nc}->($self);
7173        }
7174      
7175            return  ($self->{ct}); # ATTLIST
7176            redo A;
7177          } elsif ($self->{nc} == -1) {
7178            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7179            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7180            
7181        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7182          $self->{line_prev} = $self->{line};
7183          $self->{column_prev} = $self->{column};
7184          $self->{column}++;
7185          $self->{nc}
7186              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7187        } else {
7188          $self->{set_nc}->($self);
7189        }
7190      
7191            return  ($self->{ct});
7192            redo A;
7193          } else {
7194            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7195            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7196            ## Reconsume.
7197            redo A;
7198          }
7199        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7200          if ($is_space->{$self->{nc}}) {
7201            ## XML5: No parse error.
7202            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7203            $self->{state} = BOGUS_MD_STATE;
7204            ## Reconsume.
7205            redo A;
7206          } elsif ($self->{nc} == 0x0022) { # "
7207            ## XML5: Same as "anything else".
7208            $self->{ca}->{value} = '';
7209            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7210            
7211        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7212          $self->{line_prev} = $self->{line};
7213          $self->{column_prev} = $self->{column};
7214          $self->{column}++;
7215          $self->{nc}
7216              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7217        } else {
7218          $self->{set_nc}->($self);
7219        }
7220      
7221            redo A;
7222          } elsif ($self->{nc} == 0x0027) { # '
7223            ## XML5: Same as "anything else".
7224            $self->{ca}->{value} = '';
7225            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7226            
7227        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7228          $self->{line_prev} = $self->{line};
7229          $self->{column_prev} = $self->{column};
7230          $self->{column}++;
7231          $self->{nc}
7232              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7233        } else {
7234          $self->{set_nc}->($self);
7235        }
7236      
7237            redo A;
7238          } elsif ($self->{nc} == 0x003E) { # >
7239            ## XML5: Same as "anything else".
7240            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7241            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7242            
7243        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7244          $self->{line_prev} = $self->{line};
7245          $self->{column_prev} = $self->{column};
7246          $self->{column}++;
7247          $self->{nc}
7248              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7249        } else {
7250          $self->{set_nc}->($self);
7251        }
7252      
7253            return  ($self->{ct}); # ATTLIST
7254            redo A;
7255          } elsif ($self->{nc} == -1) {
7256            ## XML5: No parse error.
7257            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7258            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7259            
7260        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7261          $self->{line_prev} = $self->{line};
7262          $self->{column_prev} = $self->{column};
7263          $self->{column}++;
7264          $self->{nc}
7265              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7266        } else {
7267          $self->{set_nc}->($self);
7268        }
7269      
7270            return  ($self->{ct});
7271            redo A;
7272          } else {
7273            $self->{ca}->{default} = chr $self->{nc};
7274            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7275            
7276        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7277          $self->{line_prev} = $self->{line};
7278          $self->{column_prev} = $self->{column};
7279          $self->{column}++;
7280          $self->{nc}
7281              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7282        } else {
7283          $self->{set_nc}->($self);
7284        }
7285      
7286            redo A;
7287          }
7288        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7289          if ($is_space->{$self->{nc}}) {
7290            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7291            
7292        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7293          $self->{line_prev} = $self->{line};
7294          $self->{column_prev} = $self->{column};
7295          $self->{column}++;
7296          $self->{nc}
7297              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7298        } else {
7299          $self->{set_nc}->($self);
7300        }
7301      
7302            redo A;
7303          } elsif ($self->{nc} == 0x0022) { # "
7304            ## XML5: Same as "anything else".
7305            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7306            $self->{ca}->{value} = '';
7307            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7308            
7309        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7310          $self->{line_prev} = $self->{line};
7311          $self->{column_prev} = $self->{column};
7312          $self->{column}++;
7313          $self->{nc}
7314              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7315        } else {
7316          $self->{set_nc}->($self);
7317        }
7318      
7319            redo A;
7320          } elsif ($self->{nc} == 0x0027) { # '
7321            ## XML5: Same as "anything else".
7322            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7323            $self->{ca}->{value} = '';
7324            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7325            
7326        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7327          $self->{line_prev} = $self->{line};
7328          $self->{column_prev} = $self->{column};
7329          $self->{column}++;
7330          $self->{nc}
7331              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7332        } else {
7333          $self->{set_nc}->($self);
7334        }
7335      
7336            redo A;
7337          } elsif ($self->{nc} == 0x003E) { # >
7338            ## XML5: Same as "anything else".
7339            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7340            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7341            
7342        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7343          $self->{line_prev} = $self->{line};
7344          $self->{column_prev} = $self->{column};
7345          $self->{column}++;
7346          $self->{nc}
7347              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7348        } else {
7349          $self->{set_nc}->($self);
7350        }
7351      
7352            return  ($self->{ct}); # ATTLIST
7353            redo A;
7354          } elsif ($self->{nc} == -1) {
7355            ## XML5: No parse error.
7356            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7357            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7358            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7359            
7360        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7361          $self->{line_prev} = $self->{line};
7362          $self->{column_prev} = $self->{column};
7363          $self->{column}++;
7364          $self->{nc}
7365              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7366        } else {
7367          $self->{set_nc}->($self);
7368        }
7369      
7370            return  ($self->{ct});
7371            redo A;
7372          } else {
7373            $self->{ca}->{default} .= chr $self->{nc};
7374            ## Stay in the state.
7375            
7376        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7377          $self->{line_prev} = $self->{line};
7378          $self->{column_prev} = $self->{column};
7379          $self->{column}++;
7380          $self->{nc}
7381              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7382        } else {
7383          $self->{set_nc}->($self);
7384        }
7385      
7386            redo A;
7387          }
7388        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7389          if ($is_space->{$self->{nc}}) {
7390            ## Stay in the state.
7391            
7392        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7393          $self->{line_prev} = $self->{line};
7394          $self->{column_prev} = $self->{column};
7395          $self->{column}++;
7396          $self->{nc}
7397              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7398        } else {
7399          $self->{set_nc}->($self);
7400        }
7401      
7402            redo A;
7403          } elsif ($self->{nc} == 0x0022) { # "
7404            $self->{ca}->{value} = '';
7405            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7406            
7407        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7408          $self->{line_prev} = $self->{line};
7409          $self->{column_prev} = $self->{column};
7410          $self->{column}++;
7411          $self->{nc}
7412              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7413        } else {
7414          $self->{set_nc}->($self);
7415        }
7416      
7417            redo A;
7418          } elsif ($self->{nc} == 0x0027) { # '
7419            $self->{ca}->{value} = '';
7420            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7421            
7422        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7423          $self->{line_prev} = $self->{line};
7424          $self->{column_prev} = $self->{column};
7425          $self->{column}++;
7426          $self->{nc}
7427              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7428        } else {
7429          $self->{set_nc}->($self);
7430        }
7431      
7432            redo A;
7433          } elsif ($self->{nc} == 0x003E) { # >
7434            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7435            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7436            
7437        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7438          $self->{line_prev} = $self->{line};
7439          $self->{column_prev} = $self->{column};
7440          $self->{column}++;
7441          $self->{nc}
7442              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7443        } else {
7444          $self->{set_nc}->($self);
7445        }
7446      
7447            return  ($self->{ct}); # ATTLIST
7448            redo A;
7449          } elsif ($self->{nc} == -1) {
7450            ## XML5: No parse error.
7451            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7452            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7453            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7454            
7455        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7456          $self->{line_prev} = $self->{line};
7457          $self->{column_prev} = $self->{column};
7458          $self->{column}++;
7459          $self->{nc}
7460              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7461        } else {
7462          $self->{set_nc}->($self);
7463        }
7464      
7465            return  ($self->{ct});
7466            redo A;
7467          } else {
7468            ## XML5: Not defined yet.
7469            if ($self->{ca}->{default} eq 'FIXED') {
7470              $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7471            } else {
7472              push @{$self->{ct}->{attrdefs}}, $self->{ca};
7473              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7474            }
7475            ## Reconsume.
7476            redo A;
7477          }
7478        } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7479          if ($is_space->{$self->{nc}} or
7480              $self->{nc} == -1 or
7481              $self->{nc} == 0x003E) { # >
7482            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7483            ## Reconsume.
7484            redo A;
7485          } else {
7486            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7487            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7488            ## Reconsume.
7489            redo A;
7490          }
7491        } elsif ($self->{state} == NDATA_STATE) {
7492          ## ASCII case-insensitive
7493          if ($self->{nc} == [
7494                undef,
7495                0x0044, # D
7496                0x0041, # A
7497                0x0054, # T
7498              ]->[length $self->{kwd}] or
7499              $self->{nc} == [
7500                undef,
7501                0x0064, # d
7502                0x0061, # a
7503                0x0074, # t
7504              ]->[length $self->{kwd}]) {
7505            
7506            ## Stay in the state.
7507            $self->{kwd} .= chr $self->{nc};
7508            
7509        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7510          $self->{line_prev} = $self->{line};
7511          $self->{column_prev} = $self->{column};
7512          $self->{column}++;
7513          $self->{nc}
7514              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7515        } else {
7516          $self->{set_nc}->($self);
7517        }
7518      
7519            redo A;
7520          } elsif ((length $self->{kwd}) == 4 and
7521                   ($self->{nc} == 0x0041 or # A
7522                    $self->{nc} == 0x0061)) { # a
7523            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7524              
7525              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7526                              text => 'NDATA',
7527                              line => $self->{line_prev},
7528                              column => $self->{column_prev} - 4);
7529            } else {
7530              
7531            }
7532            $self->{state} = AFTER_NDATA_STATE;
7533            
7534        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7535          $self->{line_prev} = $self->{line};
7536          $self->{column_prev} = $self->{column};
7537          $self->{column}++;
7538          $self->{nc}
7539              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7540        } else {
7541          $self->{set_nc}->($self);
7542        }
7543      
7544            redo A;
7545          } else {
7546            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7547                            line => $self->{line_prev},
7548                            column => $self->{column_prev} + 1
7549                                - length $self->{kwd});
7550            
7551            $self->{state} = BOGUS_MD_STATE;
7552            ## Reconsume.
7553            redo A;
7554          }
7555        } elsif ($self->{state} == AFTER_NDATA_STATE) {
7556          if ($is_space->{$self->{nc}}) {
7557            $self->{state} = BEFORE_NOTATION_NAME_STATE;
7558            
7559        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7560          $self->{line_prev} = $self->{line};
7561          $self->{column_prev} = $self->{column};
7562          $self->{column}++;
7563          $self->{nc}
7564              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7565        } else {
7566          $self->{set_nc}->($self);
7567        }
7568      
7569            redo A;
7570          } elsif ($self->{nc} == 0x003E) { # >
7571            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7572            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7573            
7574        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7575          $self->{line_prev} = $self->{line};
7576          $self->{column_prev} = $self->{column};
7577          $self->{column}++;
7578          $self->{nc}
7579              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7580        } else {
7581          $self->{set_nc}->($self);
7582        }
7583      
7584            return  ($self->{ct}); # ENTITY
7585            redo A;
7586          } elsif ($self->{nc} == -1) {
7587            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7588            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7589            
7590        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7591          $self->{line_prev} = $self->{line};
7592          $self->{column_prev} = $self->{column};
7593          $self->{column}++;
7594          $self->{nc}
7595              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7596        } else {
7597          $self->{set_nc}->($self);
7598        }
7599      
7600            return  ($self->{ct}); # ENTITY
7601            redo A;
7602          } else {
7603            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7604                            line => $self->{line_prev},
7605                            column => $self->{column_prev} + 1
7606                                - length $self->{kwd});
7607            $self->{state} = BOGUS_MD_STATE;
7608            ## Reconsume.
7609            redo A;
7610          }
7611        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7612          if ($is_space->{$self->{nc}}) {
7613            ## Stay in the state.
7614            
7615        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7616          $self->{line_prev} = $self->{line};
7617          $self->{column_prev} = $self->{column};
7618          $self->{column}++;
7619          $self->{nc}
7620              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7621        } else {
7622          $self->{set_nc}->($self);
7623        }
7624      
7625            redo A;
7626          } elsif ($self->{nc} == 0x003E) { # >
7627            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7628            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7629            
7630        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7631          $self->{line_prev} = $self->{line};
7632          $self->{column_prev} = $self->{column};
7633          $self->{column}++;
7634          $self->{nc}
7635              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7636        } else {
7637          $self->{set_nc}->($self);
7638        }
7639      
7640            return  ($self->{ct}); # ENTITY
7641            redo A;
7642          } elsif ($self->{nc} == -1) {
7643            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7644            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7645            
7646        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7647          $self->{line_prev} = $self->{line};
7648          $self->{column_prev} = $self->{column};
7649          $self->{column}++;
7650          $self->{nc}
7651              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7652        } else {
7653          $self->{set_nc}->($self);
7654        }
7655      
7656            return  ($self->{ct}); # ENTITY
7657            redo A;
7658          } else {
7659            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7660            $self->{state} = NOTATION_NAME_STATE;
7661            
7662        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7663          $self->{line_prev} = $self->{line};
7664          $self->{column_prev} = $self->{column};
7665          $self->{column}++;
7666          $self->{nc}
7667              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7668        } else {
7669          $self->{set_nc}->($self);
7670        }
7671      
7672            redo A;
7673          }
7674        } elsif ($self->{state} == NOTATION_NAME_STATE) {
7675          if ($is_space->{$self->{nc}}) {
7676            $self->{state} = AFTER_MD_DEF_STATE;
7677            
7678        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7679          $self->{line_prev} = $self->{line};
7680          $self->{column_prev} = $self->{column};
7681          $self->{column}++;
7682          $self->{nc}
7683              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7684        } else {
7685          $self->{set_nc}->($self);
7686        }
7687      
7688            redo A;
7689          } elsif ($self->{nc} == 0x003E) { # >
7690            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7691            
7692        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7693          $self->{line_prev} = $self->{line};
7694          $self->{column_prev} = $self->{column};
7695          $self->{column}++;
7696          $self->{nc}
7697              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7698        } else {
7699          $self->{set_nc}->($self);
7700        }
7701      
7702            return  ($self->{ct}); # ENTITY
7703            redo A;
7704          } elsif ($self->{nc} == -1) {
7705            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7706            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7707            
7708        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7709          $self->{line_prev} = $self->{line};
7710          $self->{column_prev} = $self->{column};
7711          $self->{column}++;
7712          $self->{nc}
7713              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7714        } else {
7715          $self->{set_nc}->($self);
7716        }
7717      
7718            return  ($self->{ct}); # ENTITY
7719            redo A;
7720          } else {
7721            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7722            ## Stay in the state.
7723            
7724        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7725          $self->{line_prev} = $self->{line};
7726          $self->{column_prev} = $self->{column};
7727          $self->{column}++;
7728          $self->{nc}
7729              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7730        } else {
7731          $self->{set_nc}->($self);
7732        }
7733      
7734            redo A;
7735          }
7736        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7737          if ($self->{nc} == 0x0022) { # "
7738            $self->{state} = AFTER_MD_DEF_STATE;
7739            
7740        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7741          $self->{line_prev} = $self->{line};
7742          $self->{column_prev} = $self->{column};
7743          $self->{column}++;
7744          $self->{nc}
7745              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7746        } else {
7747          $self->{set_nc}->($self);
7748        }
7749      
7750            redo A;
7751          } elsif ($self->{nc} == 0x0026) { # &
7752            $self->{prev_state} = $self->{state};
7753            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7754            $self->{entity_add} = 0x0022; # "
7755            
7756        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7757          $self->{line_prev} = $self->{line};
7758          $self->{column_prev} = $self->{column};
7759          $self->{column}++;
7760          $self->{nc}
7761              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7762        } else {
7763          $self->{set_nc}->($self);
7764        }
7765      
7766            redo A;
7767    ## TODO: %
7768          } elsif ($self->{nc} == -1) {
7769            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7770            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7771            ## Reconsume.
7772            return  ($self->{ct}); # ENTITY
7773            redo A;
7774          } else {
7775            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7776            
7777        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7778          $self->{line_prev} = $self->{line};
7779          $self->{column_prev} = $self->{column};
7780          $self->{column}++;
7781          $self->{nc}
7782              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7783        } else {
7784          $self->{set_nc}->($self);
7785        }
7786      
7787            redo A;
7788          }
7789        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7790          if ($self->{nc} == 0x0027) { # '
7791            $self->{state} = AFTER_MD_DEF_STATE;
7792            
7793        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7794          $self->{line_prev} = $self->{line};
7795          $self->{column_prev} = $self->{column};
7796          $self->{column}++;
7797          $self->{nc}
7798              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7799        } else {
7800          $self->{set_nc}->($self);
7801        }
7802      
7803            redo A;
7804          } elsif ($self->{nc} == 0x0026) { # &
7805            $self->{prev_state} = $self->{state};
7806            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7807            $self->{entity_add} = 0x0027; # '
7808            
7809        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7810          $self->{line_prev} = $self->{line};
7811          $self->{column_prev} = $self->{column};
7812          $self->{column}++;
7813          $self->{nc}
7814              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7815        } else {
7816          $self->{set_nc}->($self);
7817        }
7818      
7819            redo A;
7820    ## TODO: %
7821          } elsif ($self->{nc} == -1) {
7822            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7823            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7824            ## Reconsume.
7825            return  ($self->{ct}); # ENTITY
7826            redo A;
7827          } else {
7828            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7829            
7830        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7831          $self->{line_prev} = $self->{line};
7832          $self->{column_prev} = $self->{column};
7833          $self->{column}++;
7834          $self->{nc}
7835              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7836        } else {
7837          $self->{set_nc}->($self);
7838        }
7839      
7840            redo A;
7841          }
7842        } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
7843          ## TODO: XMLize
7844    
7845          if ($is_space->{$self->{nc}} or
7846              {
7847                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7848                $self->{entity_add} => 1,
7849              }->{$self->{nc}}) {
7850            ## Don't consume
7851            ## No error
7852            ## Return nothing.
7853            #
7854          } elsif ($self->{nc} == 0x0023) { # #
7855            $self->{ca} = $self->{ct};
7856            $self->{state} = ENTITY_HASH_STATE;
7857            $self->{kwd} = '#';
7858            
7859        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7860          $self->{line_prev} = $self->{line};
7861          $self->{column_prev} = $self->{column};
7862          $self->{column}++;
7863          $self->{nc}
7864              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7865        } else {
7866          $self->{set_nc}->($self);
7867        }
7868      
7869            redo A;
7870          } elsif ((0x0041 <= $self->{nc} and
7871                    $self->{nc} <= 0x005A) or # A..Z
7872                   (0x0061 <= $self->{nc} and
7873                    $self->{nc} <= 0x007A)) { # a..z
7874            #
7875          } else {
7876            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
7877            ## Return nothing.
7878            #
7879          }
7880    
7881          $self->{ct}->{value} .= '&';
7882          $self->{state} = $self->{prev_state};
7883          ## Reconsume.
7884          redo A;
7885        } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
7886          if ($is_space->{$self->{nc}}) {
7887            $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
7888            
7889        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7890          $self->{line_prev} = $self->{line};
7891          $self->{column_prev} = $self->{column};
7892          $self->{column}++;
7893          $self->{nc}
7894              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7895        } else {
7896          $self->{set_nc}->($self);
7897        }
7898      
7899            redo A;
7900          } elsif ($self->{nc} == 0x0028) { # (
7901            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
7902            $self->{ct}->{content} = ['('];
7903            $self->{group_depth} = 1;
7904            
7905        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7906          $self->{line_prev} = $self->{line};
7907          $self->{column_prev} = $self->{column};
7908          $self->{column}++;
7909          $self->{nc}
7910              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7911        } else {
7912          $self->{set_nc}->($self);
7913        }
7914      
7915            redo A;
7916          } elsif ($self->{nc} == 0x003E) { # >
7917            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
7918            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7919            
7920        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7921          $self->{line_prev} = $self->{line};
7922          $self->{column_prev} = $self->{column};
7923          $self->{column}++;
7924          $self->{nc}
7925              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7926        } else {
7927          $self->{set_nc}->($self);
7928        }
7929      
7930            return  ($self->{ct}); # ELEMENT
7931            redo A;
7932          } elsif ($self->{nc} == -1) {
7933            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7934            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7935            
7936        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7937          $self->{line_prev} = $self->{line};
7938          $self->{column_prev} = $self->{column};
7939          $self->{column}++;
7940          $self->{nc}
7941              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7942        } else {
7943          $self->{set_nc}->($self);
7944        }
7945      
7946            return  ($self->{ct}); # ELEMENT
7947            redo A;
7948          } else {
7949            $self->{ct}->{content} = [chr $self->{nc}];
7950            $self->{state} = CONTENT_KEYWORD_STATE;
7951            
7952        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7953          $self->{line_prev} = $self->{line};
7954          $self->{column_prev} = $self->{column};
7955          $self->{column}++;
7956          $self->{nc}
7957              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7958        } else {
7959          $self->{set_nc}->($self);
7960        }
7961      
7962            redo A;
7963          }
7964        } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
7965          if ($is_space->{$self->{nc}}) {
7966            $self->{state} = AFTER_MD_DEF_STATE;
7967            
7968        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7969          $self->{line_prev} = $self->{line};
7970          $self->{column_prev} = $self->{column};
7971          $self->{column}++;
7972          $self->{nc}
7973              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7974        } else {
7975          $self->{set_nc}->($self);
7976        }
7977      
7978            redo A;
7979          } elsif ($self->{nc} == 0x003E) { # >
7980            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7981            
7982        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7983          $self->{line_prev} = $self->{line};
7984          $self->{column_prev} = $self->{column};
7985          $self->{column}++;
7986          $self->{nc}
7987              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7988        } else {
7989          $self->{set_nc}->($self);
7990        }
7991      
7992            return  ($self->{ct}); # ELEMENT
7993            redo A;
7994          } elsif ($self->{nc} == -1) {
7995            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7996            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7997            
7998        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7999          $self->{line_prev} = $self->{line};
8000          $self->{column_prev} = $self->{column};
8001          $self->{column}++;
8002          $self->{nc}
8003              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8004        } else {
8005          $self->{set_nc}->($self);
8006        }
8007      
8008            return  ($self->{ct}); # ELEMENT
8009            redo A;
8010          } else {
8011            $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8012            ## Stay in the state.
8013            
8014        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8015          $self->{line_prev} = $self->{line};
8016          $self->{column_prev} = $self->{column};
8017          $self->{column}++;
8018          $self->{nc}
8019              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8020        } else {
8021          $self->{set_nc}->($self);
8022        }
8023      
8024            redo A;
8025          }
8026        } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8027          if ($is_space->{$self->{nc}}) {
8028            ## Stay in the state.
8029            
8030        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8031          $self->{line_prev} = $self->{line};
8032          $self->{column_prev} = $self->{column};
8033          $self->{column}++;
8034          $self->{nc}
8035              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8036        } else {
8037          $self->{set_nc}->($self);
8038        }
8039      
8040            redo A;
8041          } elsif ($self->{nc} == 0x0028) { # (
8042            $self->{group_depth}++;
8043            push @{$self->{ct}->{content}}, chr $self->{nc};
8044            ## Stay in the state.
8045            
8046        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8047          $self->{line_prev} = $self->{line};
8048          $self->{column_prev} = $self->{column};
8049          $self->{column}++;
8050          $self->{nc}
8051              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8052        } else {
8053          $self->{set_nc}->($self);
8054        }
8055      
8056            redo A;
8057          } elsif ($self->{nc} == 0x007C or # |
8058                   $self->{nc} == 0x002C) { # ,
8059            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8060            ## Stay in the state.
8061            
8062        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8063          $self->{line_prev} = $self->{line};
8064          $self->{column_prev} = $self->{column};
8065          $self->{column}++;
8066          $self->{nc}
8067              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8068        } else {
8069          $self->{set_nc}->($self);
8070        }
8071      
8072            redo A;
8073          } elsif ($self->{nc} == 0x0029) { # )
8074            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8075            push @{$self->{ct}->{content}}, chr $self->{nc};
8076            $self->{group_depth}--;
8077            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8078            
8079        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8080          $self->{line_prev} = $self->{line};
8081          $self->{column_prev} = $self->{column};
8082          $self->{column}++;
8083          $self->{nc}
8084              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8085        } else {
8086          $self->{set_nc}->($self);
8087        }
8088      
8089            redo A;
8090          } elsif ($self->{nc} == 0x003E) { # >
8091            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8092            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8093            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8094            
8095        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8096          $self->{line_prev} = $self->{line};
8097          $self->{column_prev} = $self->{column};
8098          $self->{column}++;
8099          $self->{nc}
8100              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8101        } else {
8102          $self->{set_nc}->($self);
8103        }
8104      
8105            return  ($self->{ct}); # ELEMENT
8106            redo A;
8107          } elsif ($self->{nc} == -1) {
8108            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8109            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8110            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8111            
8112        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8113          $self->{line_prev} = $self->{line};
8114          $self->{column_prev} = $self->{column};
8115          $self->{column}++;
8116          $self->{nc}
8117              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8118        } else {
8119          $self->{set_nc}->($self);
8120        }
8121      
8122            return  ($self->{ct}); # ELEMENT
8123            redo A;
8124          } else {
8125            push @{$self->{ct}->{content}}, chr $self->{nc};
8126            $self->{state} = CM_ELEMENT_NAME_STATE;
8127            
8128        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8129          $self->{line_prev} = $self->{line};
8130          $self->{column_prev} = $self->{column};
8131          $self->{column}++;
8132          $self->{nc}
8133              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8134        } else {
8135          $self->{set_nc}->($self);
8136        }
8137      
8138            redo A;
8139          }
8140        } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8141          if ($is_space->{$self->{nc}}) {
8142            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8143            
8144        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8145          $self->{line_prev} = $self->{line};
8146          $self->{column_prev} = $self->{column};
8147          $self->{column}++;
8148          $self->{nc}
8149              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8150        } else {
8151          $self->{set_nc}->($self);
8152        }
8153      
8154            redo A;
8155          } elsif ($self->{nc} == 0x002A or # *
8156                   $self->{nc} == 0x002B or # +
8157                   $self->{nc} == 0x003F) { # ?
8158            push @{$self->{ct}->{content}}, chr $self->{nc};
8159            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8160            
8161        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8162          $self->{line_prev} = $self->{line};
8163          $self->{column_prev} = $self->{column};
8164          $self->{column}++;
8165          $self->{nc}
8166              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8167        } else {
8168          $self->{set_nc}->($self);
8169        }
8170      
8171            redo A;
8172          } elsif ($self->{nc} == 0x007C or # |
8173                   $self->{nc} == 0x002C) { # ,
8174            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8175            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8176            
8177        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8178          $self->{line_prev} = $self->{line};
8179          $self->{column_prev} = $self->{column};
8180          $self->{column}++;
8181          $self->{nc}
8182              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8183        } else {
8184          $self->{set_nc}->($self);
8185        }
8186      
8187            redo A;
8188          } elsif ($self->{nc} == 0x0029) { # )
8189            $self->{group_depth}--;
8190            push @{$self->{ct}->{content}}, chr $self->{nc};
8191            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8192            
8193        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8194          $self->{line_prev} = $self->{line};
8195          $self->{column_prev} = $self->{column};
8196          $self->{column}++;
8197          $self->{nc}
8198              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8199        } else {
8200          $self->{set_nc}->($self);
8201        }
8202      
8203            redo A;
8204          } elsif ($self->{nc} == 0x003E) { # >
8205            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8206            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8207            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8208            
8209        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8210          $self->{line_prev} = $self->{line};
8211          $self->{column_prev} = $self->{column};
8212          $self->{column}++;
8213          $self->{nc}
8214              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8215        } else {
8216          $self->{set_nc}->($self);
8217        }
8218      
8219            return  ($self->{ct}); # ELEMENT
8220            redo A;
8221          } elsif ($self->{nc} == -1) {
8222            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8223            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8224            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8225            
8226        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8227          $self->{line_prev} = $self->{line};
8228          $self->{column_prev} = $self->{column};
8229          $self->{column}++;
8230          $self->{nc}
8231              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8232        } else {
8233          $self->{set_nc}->($self);
8234        }
8235      
8236            return  ($self->{ct}); # ELEMENT
8237            redo A;
8238          } else {
8239            $self->{ct}->{content}->[-1] .= chr $self->{nc};
8240            ## Stay in the state.
8241            
8242        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8243          $self->{line_prev} = $self->{line};
8244          $self->{column_prev} = $self->{column};
8245          $self->{column}++;
8246          $self->{nc}
8247              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8248        } else {
8249          $self->{set_nc}->($self);
8250        }
8251      
8252            redo A;
8253          }
8254        } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8255          if ($is_space->{$self->{nc}}) {
8256            ## Stay in the state.
8257            
8258        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8259          $self->{line_prev} = $self->{line};
8260          $self->{column_prev} = $self->{column};
8261          $self->{column}++;
8262          $self->{nc}
8263              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8264        } else {
8265          $self->{set_nc}->($self);
8266        }
8267      
8268            redo A;
8269          } elsif ($self->{nc} == 0x007C or # |
8270                   $self->{nc} == 0x002C) { # ,
8271            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8272            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8273            
8274        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8275          $self->{line_prev} = $self->{line};
8276          $self->{column_prev} = $self->{column};
8277          $self->{column}++;
8278          $self->{nc}
8279              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8280        } else {
8281          $self->{set_nc}->($self);
8282        }
8283      
8284            redo A;
8285          } elsif ($self->{nc} == 0x0029) { # )
8286            $self->{group_depth}--;
8287            push @{$self->{ct}->{content}}, chr $self->{nc};
8288            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8289            
8290        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8291          $self->{line_prev} = $self->{line};
8292          $self->{column_prev} = $self->{column};
8293          $self->{column}++;
8294          $self->{nc}
8295              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8296        } else {
8297          $self->{set_nc}->($self);
8298        }
8299      
8300            redo A;
8301          } elsif ($self->{nc} == 0x003E) { # >
8302            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8303            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8304            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8305            
8306        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8307          $self->{line_prev} = $self->{line};
8308          $self->{column_prev} = $self->{column};
8309          $self->{column}++;
8310          $self->{nc}
8311              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8312        } else {
8313          $self->{set_nc}->($self);
8314        }
8315      
8316            return  ($self->{ct}); # ELEMENT
8317            redo A;
8318          } elsif ($self->{nc} == -1) {
8319            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8320            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8321            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8322            
8323        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8324          $self->{line_prev} = $self->{line};
8325          $self->{column_prev} = $self->{column};
8326          $self->{column}++;
8327          $self->{nc}
8328              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8329        } else {
8330          $self->{set_nc}->($self);
8331        }
8332      
8333            return  ($self->{ct}); # ELEMENT
8334            redo A;
8335          } else {
8336            $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8337            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8338            $self->{state} = BOGUS_MD_STATE;
8339            
8340        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8341          $self->{line_prev} = $self->{line};
8342          $self->{column_prev} = $self->{column};
8343          $self->{column}++;
8344          $self->{nc}
8345              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8346        } else {
8347          $self->{set_nc}->($self);
8348        }
8349      
8350            redo A;
8351          }
8352        } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8353          if ($is_space->{$self->{nc}}) {
8354            if ($self->{group_depth}) {
8355              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8356            } else {
8357              $self->{state} = AFTER_MD_DEF_STATE;
8358            }
8359            
8360        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8361          $self->{line_prev} = $self->{line};
8362          $self->{column_prev} = $self->{column};
8363          $self->{column}++;
8364          $self->{nc}
8365              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8366        } else {
8367          $self->{set_nc}->($self);
8368        }
8369      
8370            redo A;
8371          } elsif ($self->{nc} == 0x002A or # *
8372                   $self->{nc} == 0x002B or # +
8373                   $self->{nc} == 0x003F) { # ?
8374            push @{$self->{ct}->{content}}, chr $self->{nc};
8375            if ($self->{group_depth}) {
8376              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8377            } else {
8378              $self->{state} = AFTER_MD_DEF_STATE;
8379            }
8380            
8381        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8382          $self->{line_prev} = $self->{line};
8383          $self->{column_prev} = $self->{column};
8384          $self->{column}++;
8385          $self->{nc}
8386              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8387        } else {
8388          $self->{set_nc}->($self);
8389        }
8390      
8391            redo A;
8392          } elsif ($self->{nc} == 0x0029) { # )
8393            if ($self->{group_depth}) {
8394              $self->{group_depth}--;
8395              push @{$self->{ct}->{content}}, chr $self->{nc};
8396              ## Stay in the state.
8397              
8398        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8399          $self->{line_prev} = $self->{line};
8400          $self->{column_prev} = $self->{column};
8401          $self->{column}++;
8402          $self->{nc}
8403              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8404        } else {
8405          $self->{set_nc}->($self);
8406        }
8407      
8408              redo A;
8409            } else {
8410              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8411              $self->{state} = BOGUS_MD_STATE;
8412              ## Reconsume.
8413              redo A;
8414            }
8415          } elsif ($self->{nc} == 0x003E) { # >
8416            if ($self->{group_depth}) {
8417              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8418              push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8419            }
8420            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8421            
8422        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8423          $self->{line_prev} = $self->{line};
8424          $self->{column_prev} = $self->{column};
8425          $self->{column}++;
8426          $self->{nc}
8427              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8428        } else {
8429          $self->{set_nc}->($self);
8430        }
8431      
8432            return  ($self->{ct}); # ELEMENT
8433            redo A;
8434          } elsif ($self->{nc} == -1) {
8435            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8436            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8437            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8438            
8439        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8440          $self->{line_prev} = $self->{line};
8441          $self->{column_prev} = $self->{column};
8442          $self->{column}++;
8443          $self->{nc}
8444              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8445        } else {
8446          $self->{set_nc}->($self);
8447        }
8448      
8449            return  ($self->{ct}); # ELEMENT
8450            redo A;
8451          } else {
8452            if ($self->{group_depth}) {
8453              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8454            } else {
8455              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8456              $self->{state} = BOGUS_MD_STATE;
8457            }
8458            ## Reconsume.
8459            redo A;
8460          }
8461        } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8462          if ($is_space->{$self->{nc}}) {
8463            ## Stay in the state.
8464            
8465        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8466          $self->{line_prev} = $self->{line};
8467          $self->{column_prev} = $self->{column};
8468          $self->{column}++;
8469          $self->{nc}
8470              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8471        } else {
8472          $self->{set_nc}->($self);
8473        }
8474      
8475            redo A;
8476          } elsif ($self->{nc} == 0x003E) { # >
8477            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8478            
8479        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8480          $self->{line_prev} = $self->{line};
8481          $self->{column_prev} = $self->{column};
8482          $self->{column}++;
8483          $self->{nc}
8484              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8485        } else {
8486          $self->{set_nc}->($self);
8487        }
8488      
8489            return  ($self->{ct}); # ENTITY/ELEMENT
8490            redo A;
8491          } elsif ($self->{nc} == -1) {
8492            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8493            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8494            
8495        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8496          $self->{line_prev} = $self->{line};
8497          $self->{column_prev} = $self->{column};
8498          $self->{column}++;
8499          $self->{nc}
8500              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8501        } else {
8502          $self->{set_nc}->($self);
8503        }
8504      
8505            return  ($self->{ct}); # ENTITY/ELEMENT
8506            redo A;
8507          } else {
8508            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8509            $self->{state} = BOGUS_MD_STATE;
8510            ## Reconsume.
8511            redo A;
8512          }
8513        } elsif ($self->{state} == BOGUS_MD_STATE) {
8514          if ($self->{nc} == 0x003E) { # >
8515            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8516            
8517        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8518          $self->{line_prev} = $self->{line};
8519          $self->{column_prev} = $self->{column};
8520          $self->{column}++;
8521          $self->{nc}
8522              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8523        } else {
8524          $self->{set_nc}->($self);
8525        }
8526      
8527            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8528            redo A;
8529          } elsif ($self->{nc} == -1) {
8530            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8531            ## Reconsume.
8532            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8533            redo A;
8534          } else {
8535            ## Stay in the state.
8536            
8537        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8538          $self->{line_prev} = $self->{line};
8539          $self->{column_prev} = $self->{column};
8540          $self->{column}++;
8541          $self->{nc}
8542              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8543        } else {
8544          $self->{set_nc}->($self);
8545        }
8546      
8547            redo A;
8548          }
8549      } else {      } else {
8550        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
8551      }      }
# Line 4152  sub _get_next_token ($) { Line 8556  sub _get_next_token ($) {
8556    
8557  1;  1;
8558  ## $Date$  ## $Date$
8559                                    

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.20

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24