/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.5 by wakaba, Tue Oct 14 14:38:59 2008 UTC revision 1.30 by wakaba, Sun Aug 16 05:24:47 2009 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24    );    );
25        
26    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 33  BEGIN {
33        CHARACTER_TOKEN        CHARACTER_TOKEN
34        PI_TOKEN        PI_TOKEN
35        ABORT_TOKEN        ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42      )],      )],
43    );    );
44  }  }
45    
46    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48  ## Token types  ## Token types
49    
50  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
52  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
53  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
54  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
55  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
56  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
57  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65    ## XML5: XML5 has "empty tag token".  In this implementation, it is
66    ## represented as a start tag token with $self->{self_closing} flag
67    ## set to true.
68    
69    ## XML5: XML5 has "short end tag token".  In this implementation, it
70    ## is represented as an end tag token with $token->{tag_name} set
71    ## to an empty string.
72    
73  package Whatpm::HTML;  package Whatpm::HTML;
74    
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 142  sub HEXREF_HEX_STATE () { 48 }
142  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
143  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145    ## XML-only states
146    sub PI_STATE () { 51 }
147    sub PI_TARGET_STATE () { 52 }
148    sub PI_TARGET_AFTER_STATE () { 53 }
149    sub PI_DATA_STATE () { 54 }
150    sub PI_AFTER_STATE () { 55 }
151    sub PI_DATA_AFTER_STATE () { 56 }
152    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155    sub DOCTYPE_TAG_STATE () { 60 }
156    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157    sub MD_ATTLIST_STATE () { 62 }
158    sub MD_E_STATE () { 63 }
159    sub MD_ELEMENT_STATE () { 64 }
160    sub MD_ENTITY_STATE () { 65 }
161    sub MD_NOTATION_STATE () { 66 }
162    sub DOCTYPE_MD_STATE () { 67 }
163    sub BEFORE_MD_NAME_STATE () { 68 }
164    sub MD_NAME_STATE () { 69 }
165    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171    sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172    sub ALLOWED_TOKEN_STATE () { 77 }
173    sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174    sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175    sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179    sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180    sub BEFORE_NDATA_STATE () { 85 }
181    sub NDATA_STATE () { 86 }
182    sub AFTER_NDATA_STATE () { 87 }
183    sub BEFORE_NOTATION_NAME_STATE () { 88 }
184    sub NOTATION_NAME_STATE () { 89 }
185    sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186    sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187    sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188    sub AFTER_ELEMENT_NAME_STATE () { 93 }
189    sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190    sub CONTENT_KEYWORD_STATE () { 95 }
191    sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192    sub CM_ELEMENT_NAME_STATE () { 97 }
193    sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194    sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195    sub AFTER_MD_DEF_STATE () { 100 }
196    sub BOGUS_MD_STATE () { 101 }
197    
198  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
199  ## list and descriptions)  ## list and descriptions)
200    
# Line 178  sub _initialize_tokenizer ($) { Line 259  sub _initialize_tokenizer ($) {
259    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
260    
261    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
262    $self->{s_kwd} = ''; # state keyword    $self->{s_kwd} = ''; # Data state keyword
263      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
265    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
266    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 208  sub _initialize_tokenizer ($) { Line 290  sub _initialize_tokenizer ($) {
290    
291  ## A token has:  ## A token has:
292  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
295  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296    ##   ->{target} (PI_TOKEN)
297  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
298  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
299  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 218  sub _initialize_tokenizer ($) { Line 301  sub _initialize_tokenizer ($) {
301  ##        ->{name}  ##        ->{name}
302  ##        ->{value}  ##        ->{value}
303  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
304  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
305    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308    ##   ->{has_internal_subset} == 1 or 0 (DOCTYPE_TOKEN)
309    
310  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
311  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
312  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 238  my $is_space = { Line 326  my $is_space = {
326    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
327    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
328    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
329    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
330    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
331    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
332  };  };
# Line 498  sub _get_next_token ($) { Line 586  sub _get_next_token ($) {
586        return  ($token);        return  ($token);
587        redo A;        redo A;
588      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
589          ## XML5: "tag state".
590    
591        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
593                        
# Line 516  sub _get_next_token ($) { Line 606  sub _get_next_token ($) {
606            redo A;            redo A;
607          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
608                        
609            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
610            #            #
611          } else {          } else {
612                        
613              $self->{s_kwd} = '';
614            #            #
615          }          }
616    
617          ## reconsume          ## reconsume
618          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
         $self->{s_kwd} = '';  
619          return  ({type => CHARACTER_TOKEN, data => '<',          return  ({type => CHARACTER_TOKEN, data => '<',
620                    line => $self->{line_prev},                    line => $self->{line_prev},
621                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 629  sub _get_next_token ($) { Line 719  sub _get_next_token ($) {
719    
720            redo A;            redo A;
721          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
722                        if ($self->{is_xml}) {
723            $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',              
724                            line => $self->{line_prev},              $self->{state} = PI_STATE;
725                            column => $self->{column_prev});              
726            $self->{state} = BOGUS_COMMENT_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727            $self->{ct} = {type => COMMENT_TOKEN, data => '',        $self->{line_prev} = $self->{line};
728                                      line => $self->{line_prev},        $self->{column_prev} = $self->{column};
729                                      column => $self->{column_prev},        $self->{column}++;
730                                     };        $self->{nc}
731            ## $self->{nc} is intentionally left as is            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732            redo A;      } else {
733          } else {        $self->{set_nc}->($self);
734        }
735      
736                redo A;
737              } else {
738                
739                $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740                                line => $self->{line_prev},
741                                column => $self->{column_prev});
742                $self->{state} = BOGUS_COMMENT_STATE;
743                $self->{ct} = {type => COMMENT_TOKEN, data => '',
744                               line => $self->{line_prev},
745                               column => $self->{column_prev},
746                              };
747                ## $self->{nc} is intentionally left as is
748                redo A;
749              }
750            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751                        
752            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753                            line => $self->{line_prev},                            line => $self->{line_prev},
# Line 655  sub _get_next_token ($) { Line 762  sub _get_next_token ($) {
762                     });                     });
763    
764            redo A;            redo A;
765            } else {
766              ## XML5: "<:" is a parse error.
767              
768              $self->{ct} = {type => START_TAG_TOKEN,
769                                        tag_name => chr ($self->{nc}),
770                                        line => $self->{line_prev},
771                                        column => $self->{column_prev}};
772              $self->{state} = TAG_NAME_STATE;
773              
774        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775          $self->{line_prev} = $self->{line};
776          $self->{column_prev} = $self->{column};
777          $self->{column}++;
778          $self->{nc}
779              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780        } else {
781          $self->{set_nc}->($self);
782        }
783      
784              redo A;
785          }          }
786        } else {        } else {
787          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 663  sub _get_next_token ($) { Line 790  sub _get_next_token ($) {
790        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
791        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792    
793          ## XML5: "end tag state".
794    
795        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
798            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799            $self->{s_kwd} = '';            $self->{kwd} = '';
800            ## Reconsume.            ## Reconsume.
801            redo A;            redo A;
802          } else {          } else {
# Line 724  sub _get_next_token ($) { Line 853  sub _get_next_token ($) {
853        
854          redo A;          redo A;
855        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
856          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
858                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
859          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
860          $self->{s_kwd} = '';          $self->{s_kwd} = '';
861                    if ($self->{is_xml}) {
862              
863              ## XML5: No parse error.
864              
865              ## NOTE: This parser raises a parse error, since it supports
866              ## XML1, not XML5.
867    
868              ## NOTE: A short end tag token.
869              my $ct = {type => END_TAG_TOKEN,
870                        tag_name => '',
871                        line => $self->{line_prev},
872                        column => $self->{column_prev} - 1,
873                       };
874              
875      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
877        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 741  sub _get_next_token ($) { Line 882  sub _get_next_token ($) {
882        $self->{set_nc}->($self);        $self->{set_nc}->($self);
883      }      }
884        
885              return  ($ct);
886            } else {
887              
888              
889        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890          $self->{line_prev} = $self->{line};
891          $self->{column_prev} = $self->{column};
892          $self->{column}++;
893          $self->{nc}
894              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895        } else {
896          $self->{set_nc}->($self);
897        }
898      
899            }
900          redo A;          redo A;
901        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
902                    
# Line 754  sub _get_next_token ($) { Line 910  sub _get_next_token ($) {
910                   });                   });
911    
912          redo A;          redo A;
913        } else {        } elsif (not $self->{is_xml} or
914                   $is_space->{$self->{nc}}) {
915                    
916          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917                            line => $self->{line_prev}, # "<" of "</"
918                            column => $self->{column_prev} - 1);
919          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
920          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
921                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 769  sub _get_next_token ($) { Line 928  sub _get_next_token ($) {
928          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
929          ## "bogus comment state" entry.          ## "bogus comment state" entry.
930          redo A;          redo A;
931          } else {
932            ## XML5: "</:" is a parse error.
933            
934            $self->{ct} = {type => END_TAG_TOKEN,
935                           tag_name => chr ($self->{nc}),
936                           line => $l, column => $c};
937            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938            
939        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940          $self->{line_prev} = $self->{line};
941          $self->{column_prev} = $self->{column};
942          $self->{column}++;
943          $self->{nc}
944              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945        } else {
946          $self->{set_nc}->($self);
947        }
948      
949            redo A;
950        }        }
951      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953        if (length $ch) {        if (length $ch) {
954          my $CH = $ch;          my $CH = $ch;
955          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 779  sub _get_next_token ($) { Line 957  sub _get_next_token ($) {
957          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
958                        
959            ## Stay in the state.            ## Stay in the state.
960            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
961                        
962      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 798  sub _get_next_token ($) { Line 976  sub _get_next_token ($) {
976            $self->{s_kwd} = '';            $self->{s_kwd} = '';
977            ## Reconsume.            ## Reconsume.
978            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
979                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
980                      line => $self->{line_prev},                      line => $self->{line_prev},
981                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
982                     });                     });
983            redo A;            redo A;
984          }          }
# Line 816  sub _get_next_token ($) { Line 994  sub _get_next_token ($) {
994            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
995            $self->{s_kwd} = '';            $self->{s_kwd} = '';
996            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
997                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
998                      line => $self->{line_prev},                      line => $self->{line_prev},
999                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
1000                     });                     });
1001            redo A;            redo A;
1002          } else {          } else {
# Line 827  sub _get_next_token ($) { Line 1005  sub _get_next_token ($) {
1005                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
1006                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
1007                   line => $self->{line_prev},                   line => $self->{line_prev},
1008                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
1009            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
1010            ## Reconsume.            ## Reconsume.
1011            redo A;            redo A;
# Line 959  sub _get_next_token ($) { Line 1137  sub _get_next_token ($) {
1137          redo A;          redo A;
1138        }        }
1139      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140          ## XML5: "Tag attribute name before state".
1141    
1142        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1143                    
1144          ## Stay in the state          ## Stay in the state
# Line 1068  sub _get_next_token ($) { Line 1248  sub _get_next_token ($) {
1248          if ({          if ({
1249               0x0022 => 1, # "               0x0022 => 1, # "
1250               0x0027 => 1, # '               0x0027 => 1, # '
1251                 0x003C => 1, # <
1252               0x003D => 1, # =               0x003D => 1, # =
1253              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1254                        
1255              ## XML5: Not a parse error.
1256            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1257          } else {          } else {
1258                        
1259              ## XML5: ":" raises a parse error and is ignored.
1260          }          }
1261          $self->{ca}          $self->{ca}
1262              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 1094  sub _get_next_token ($) { Line 1277  sub _get_next_token ($) {
1277          redo A;          redo A;
1278        }        }
1279      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1280          ## XML5: "Tag attribute name state".
1281    
1282        my $before_leave = sub {        my $before_leave = sub {
1283          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
1284              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 1104  sub _get_next_token ($) { Line 1289  sub _get_next_token ($) {
1289                        
1290            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
1291              = $self->{ca};              = $self->{ca};
1292              $self->{ca}->{index} = ++$self->{ct}->{last_index};
1293          }          }
1294        }; # $before_leave        }; # $before_leave
1295    
# Line 1140  sub _get_next_token ($) { Line 1326  sub _get_next_token ($) {
1326        
1327          redo A;          redo A;
1328        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1329            if ($self->{is_xml}) {
1330              
1331              ## XML5: Not a parse error.
1332              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1333            } else {
1334              
1335            }
1336    
1337          $before_leave->();          $before_leave->();
1338          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1339                        
# Line 1189  sub _get_next_token ($) { Line 1383  sub _get_next_token ($) {
1383        
1384          redo A;          redo A;
1385        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1386            if ($self->{is_xml}) {
1387              
1388              ## XML5: Not a parse error.
1389              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1390            } else {
1391              
1392            }
1393                    
1394          $before_leave->();          $before_leave->();
1395          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
# Line 1230  sub _get_next_token ($) { Line 1431  sub _get_next_token ($) {
1431    
1432          redo A;          redo A;
1433        } else {        } else {
1434          if ($self->{nc} == 0x0022 or # "          if ({
1435              $self->{nc} == 0x0027) { # '               0x0022 => 1, # "
1436                 0x0027 => 1, # '
1437                 0x003C => 1, # <
1438                }->{$self->{nc}}) {
1439                        
1440              ## XML5: Not a parse error.
1441            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1442          } else {          } else {
1443                        
# Line 1253  sub _get_next_token ($) { Line 1458  sub _get_next_token ($) {
1458          redo A;          redo A;
1459        }        }
1460      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1461          ## XML5: "Tag attribute name after state".
1462          
1463        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1464                    
1465          ## Stay in the state          ## Stay in the state
# Line 1284  sub _get_next_token ($) { Line 1491  sub _get_next_token ($) {
1491        
1492          redo A;          redo A;
1493        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1494            if ($self->{is_xml}) {
1495              
1496              ## XML5: Not a parse error.
1497              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1498            } else {
1499              
1500            }
1501    
1502          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1503                        
1504            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1337  sub _get_next_token ($) { Line 1552  sub _get_next_token ($) {
1552        
1553          redo A;          redo A;
1554        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1555            if ($self->{is_xml}) {
1556              
1557              ## XML5: Not a parse error.
1558              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1559            } else {
1560              
1561            }
1562                    
1563          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1564                    
# Line 1376  sub _get_next_token ($) { Line 1598  sub _get_next_token ($) {
1598    
1599          redo A;          redo A;
1600        } else {        } else {
1601          if ($self->{nc} == 0x0022 or # "          if ($self->{is_xml}) {
1602              $self->{nc} == 0x0027) { # '            
1603              ## XML5: Not a parse error.
1604              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1605            } else {
1606                        
1607            }
1608    
1609            if ({
1610                 0x0022 => 1, # "
1611                 0x0027 => 1, # '
1612                 0x003C => 1, # <
1613                }->{$self->{nc}}) {
1614              
1615              ## XML5: Not a parse error.
1616            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1617          } else {          } else {
1618                        
# Line 1402  sub _get_next_token ($) { Line 1636  sub _get_next_token ($) {
1636          redo A;                  redo A;        
1637        }        }
1638      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1639          ## XML5: "Tag attribute value before state".
1640    
1641        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1642                    
1643          ## Stay in the state          ## Stay in the state
# Line 1511  sub _get_next_token ($) { Line 1747  sub _get_next_token ($) {
1747    
1748          redo A;          redo A;
1749        } else {        } else {
1750          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1751                        
1752              ## XML5: Not a parse error.
1753            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1754            } elsif ($self->{is_xml}) {
1755              
1756              ## XML5: No parse error.
1757              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1758          } else {          } else {
1759                        
1760          }          }
# Line 1533  sub _get_next_token ($) { Line 1774  sub _get_next_token ($) {
1774          redo A;          redo A;
1775        }        }
1776      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1777          ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1778          ## ATTLIST attribute value double quoted state".
1779          
1780        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1781                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1782          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1783              ## XML5: "DOCTYPE ATTLIST name after state".
1784              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1785              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1786            } else {
1787              
1788              ## XML5: "Tag attribute name before state".
1789              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1790            }
1791                    
1792      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1793        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1550  sub _get_next_token ($) { Line 1802  sub _get_next_token ($) {
1802          redo A;          redo A;
1803        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1804                    
1805            ## XML5: Not defined yet.
1806    
1807          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1808          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1809          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1569  sub _get_next_token ($) { Line 1823  sub _get_next_token ($) {
1823      }      }
1824        
1825          redo A;          redo A;
1826          } elsif ($self->{is_xml} and
1827                   $is_space->{$self->{nc}}) {
1828            
1829            $self->{ca}->{value} .= ' ';
1830            ## Stay in the state.
1831            
1832        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1833          $self->{line_prev} = $self->{line};
1834          $self->{column_prev} = $self->{column};
1835          $self->{column}++;
1836          $self->{nc}
1837              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1838        } else {
1839          $self->{set_nc}->($self);
1840        }
1841      
1842            redo A;
1843        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1844          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1845          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1846                        
1847            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1848    
1849              $self->{state} = DATA_STATE;
1850              $self->{s_kwd} = '';
1851              ## reconsume
1852              return  ($self->{ct}); # start tag
1853              redo A;
1854          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1855            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1856            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1583  sub _get_next_token ($) { Line 1860  sub _get_next_token ($) {
1860              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1861                            
1862            }            }
1863    
1864              $self->{state} = DATA_STATE;
1865              $self->{s_kwd} = '';
1866              ## reconsume
1867              return  ($self->{ct}); # end tag
1868              redo A;
1869            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1870              ## XML5: No parse error above; not defined yet.
1871              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1872              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1873              ## Reconsume.
1874              return  ($self->{ct}); # ATTLIST
1875              redo A;
1876          } else {          } else {
1877            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1878          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
1879        } else {        } else {
1880                    ## XML5 [ATTLIST]: Not defined yet.
1881            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1882              
1883              ## XML5: Not a parse error.
1884              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1885            } else {
1886              
1887            }
1888          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1889          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1890                                q["&],                                qq["&<\x09\x0C\x20],
1891                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1892    
1893          ## Stay in the state          ## Stay in the state
# Line 1615  sub _get_next_token ($) { Line 1905  sub _get_next_token ($) {
1905          redo A;          redo A;
1906        }        }
1907      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1908          ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1909          ## ATTLIST attribute value single quoted state".
1910    
1911        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1912                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1913          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1914              ## XML5: "DOCTYPE ATTLIST name after state".
1915              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1916              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1917            } else {
1918              
1919              ## XML5: "Before attribute name state" (sic).
1920              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1921            }
1922                    
1923      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1924        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1632  sub _get_next_token ($) { Line 1933  sub _get_next_token ($) {
1933          redo A;          redo A;
1934        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1935                    
1936            ## XML5: Not defined yet.
1937    
1938          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1939          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1940          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1651  sub _get_next_token ($) { Line 1954  sub _get_next_token ($) {
1954      }      }
1955        
1956          redo A;          redo A;
1957          } elsif ($self->{is_xml} and
1958                   $is_space->{$self->{nc}}) {
1959            
1960            $self->{ca}->{value} .= ' ';
1961            ## Stay in the state.
1962            
1963        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1964          $self->{line_prev} = $self->{line};
1965          $self->{column_prev} = $self->{column};
1966          $self->{column}++;
1967          $self->{nc}
1968              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1969        } else {
1970          $self->{set_nc}->($self);
1971        }
1972      
1973            redo A;
1974        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1975          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1976          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1977                        
1978            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1979    
1980              $self->{state} = DATA_STATE;
1981              $self->{s_kwd} = '';
1982              ## reconsume
1983              return  ($self->{ct}); # start tag
1984              redo A;
1985          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1986            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1987            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1665  sub _get_next_token ($) { Line 1991  sub _get_next_token ($) {
1991              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1992                            
1993            }            }
1994    
1995              $self->{state} = DATA_STATE;
1996              $self->{s_kwd} = '';
1997              ## reconsume
1998              return  ($self->{ct}); # end tag
1999              redo A;
2000            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2001              ## XML5: No parse error above; not defined yet.
2002              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2003              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2004              ## Reconsume.
2005              return  ($self->{ct}); # ATTLIST
2006              redo A;
2007          } else {          } else {
2008            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2009          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
2010        } else {        } else {
2011                    ## XML5 [ATTLIST]: Not defined yet.
2012            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2013              
2014              ## XML5: Not a parse error.
2015              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2016            } else {
2017              
2018            }
2019          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2020          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2021                                q['&],                                qq['&<\x09\x0C\x20],
2022                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2023    
2024          ## Stay in the state          ## Stay in the state
# Line 1697  sub _get_next_token ($) { Line 2036  sub _get_next_token ($) {
2036          redo A;          redo A;
2037        }        }
2038      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2039          ## XML5: "Tag attribute value unquoted state".
2040    
2041        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2042                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2043          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;            
2044              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2045              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2046            } else {
2047              
2048              ## XML5: "Tag attribute name before state".
2049              $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2050            }
2051                    
2052      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2053        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1714  sub _get_next_token ($) { Line 2062  sub _get_next_token ($) {
2062          redo A;          redo A;
2063        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
2064                    
2065    
2066            ## XML5: Not defined yet.
2067    
2068          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
2069          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
2070          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1737  sub _get_next_token ($) { Line 2088  sub _get_next_token ($) {
2088          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2089                        
2090            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2091    
2092              $self->{state} = DATA_STATE;
2093              $self->{s_kwd} = '';
2094              
2095        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2096          $self->{line_prev} = $self->{line};
2097          $self->{column_prev} = $self->{column};
2098          $self->{column}++;
2099          $self->{nc}
2100              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2101        } else {
2102          $self->{set_nc}->($self);
2103        }
2104      
2105              return  ($self->{ct}); # start tag
2106              redo A;
2107          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2108            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2109            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1746  sub _get_next_token ($) { Line 2113  sub _get_next_token ($) {
2113              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2114                            
2115            }            }
2116          } else {  
2117            die "$0: $self->{ct}->{type}: Unknown token type";            $self->{state} = DATA_STATE;
2118          }            $self->{s_kwd} = '';
2119          $self->{state} = DATA_STATE;            
         $self->{s_kwd} = '';  
           
2120      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2121        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
2122        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 1762  sub _get_next_token ($) { Line 2127  sub _get_next_token ($) {
2127        $self->{set_nc}->($self);        $self->{set_nc}->($self);
2128      }      }
2129        
2130              return  ($self->{ct}); # end tag
2131          return  ($self->{ct}); # start tag or end tag            redo A;
2132            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2133          redo A;            push @{$self->{ct}->{attrdefs}}, $self->{ca};
2134              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2135              
2136        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2137          $self->{line_prev} = $self->{line};
2138          $self->{column_prev} = $self->{column};
2139          $self->{column}++;
2140          $self->{nc}
2141              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2142        } else {
2143          $self->{set_nc}->($self);
2144        }
2145      
2146              return  ($self->{ct}); # ATTLIST
2147              redo A;
2148            } else {
2149              die "$0: $self->{ct}->{type}: Unknown token type";
2150            }
2151        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');  
2152          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2153                        
2154              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2155            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2156    
2157              $self->{state} = DATA_STATE;
2158              $self->{s_kwd} = '';
2159              ## reconsume
2160              return  ($self->{ct}); # start tag
2161              redo A;
2162          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2163              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2164            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2165            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
2166                            
# Line 1780  sub _get_next_token ($) { Line 2169  sub _get_next_token ($) {
2169              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2170                            
2171            }            }
2172    
2173              $self->{state} = DATA_STATE;
2174              $self->{s_kwd} = '';
2175              ## reconsume
2176              return  ($self->{ct}); # end tag
2177              redo A;
2178            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2179              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2180              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2181              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2182              ## Reconsume.
2183              return  ($self->{ct}); # ATTLIST
2184              redo A;
2185          } else {          } else {
2186            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2187          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
2188        } else {        } else {
2189          if ({          if ({
2190               0x0022 => 1, # "               0x0022 => 1, # "
2191               0x0027 => 1, # '               0x0027 => 1, # '
2192               0x003D => 1, # =               0x003D => 1, # =
2193                 0x003C => 1, # <
2194              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2195                        
2196              ## XML5: Not a parse error.
2197            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2198          } else {          } else {
2199                        
2200          }          }
2201          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2202          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2203                                q["'=& >],                                qq["'=& \x09\x0C>],
2204                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2205    
2206          ## Stay in the state          ## Stay in the state
# Line 1913  sub _get_next_token ($) { Line 2310  sub _get_next_token ($) {
2310          redo A;          redo A;
2311        }        }
2312      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2313          ## XML5: "Empty tag state".
2314    
2315        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2316          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
2317                        
# Line 1964  sub _get_next_token ($) { Line 2363  sub _get_next_token ($) {
2363          } else {          } else {
2364            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2365          }          }
2366            ## XML5: "Tag attribute name before state".
2367          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2368          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2369          ## Reconsume.          ## Reconsume.
# Line 1978  sub _get_next_token ($) { Line 2378  sub _get_next_token ($) {
2378          redo A;          redo A;
2379        }        }
2380      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2381        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2382    
2383        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
2384        ## consumes characters one-by-one basis.        ## consumes characters one-by-one basis.
2385                
2386        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2387                    if ($self->{in_subset}) {
2388          $self->{state} = DATA_STATE;            
2389          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2390            } else {
2391              
2392              $self->{state} = DATA_STATE;
2393              $self->{s_kwd} = '';
2394            }
2395                    
2396      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2397        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2002  sub _get_next_token ($) { Line 2407  sub _get_next_token ($) {
2407          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
2408          redo A;          redo A;
2409        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2410                    if ($self->{in_subset}) {
2411          $self->{state} = DATA_STATE;            
2412          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2413            } else {
2414              
2415              $self->{state} = DATA_STATE;
2416              $self->{s_kwd} = '';
2417            }
2418          ## reconsume          ## reconsume
2419    
2420          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2031  sub _get_next_token ($) { Line 2441  sub _get_next_token ($) {
2441          redo A;          redo A;
2442        }        }
2443      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2444        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
2445                
2446        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2447                    
# Line 2053  sub _get_next_token ($) { Line 2463  sub _get_next_token ($) {
2463          ## ASCII case-insensitive.          ## ASCII case-insensitive.
2464                    
2465          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
2466          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2467                    
2468      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2469        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2072  sub _get_next_token ($) { Line 2482  sub _get_next_token ($) {
2482                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2483                                                    
2484          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
2485          $self->{s_kwd} = '[';          $self->{kwd} = '[';
2486                    
2487      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2488        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2106  sub _get_next_token ($) { Line 2516  sub _get_next_token ($) {
2516                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2517                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
2518                                   };                                   };
2519          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2520                    
2521      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2522        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2142  sub _get_next_token ($) { Line 2552  sub _get_next_token ($) {
2552              0x0054, # T              0x0054, # T
2553              0x0059, # Y              0x0059, # Y
2554              0x0050, # P              0x0050, # P
2555            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2556            $self->{nc} == [            $self->{nc} == [
2557              undef,              undef,
2558              0x006F, # o              0x006F, # o
# Line 2150  sub _get_next_token ($) { Line 2560  sub _get_next_token ($) {
2560              0x0074, # t              0x0074, # t
2561              0x0079, # y              0x0079, # y
2562              0x0070, # p              0x0070, # p
2563            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2564                    
2565          ## Stay in the state.          ## Stay in the state.
2566          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2567                    
2568      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2569        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2166  sub _get_next_token ($) { Line 2576  sub _get_next_token ($) {
2576      }      }
2577        
2578          redo A;          redo A;
2579        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
2580                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2581                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2582                    if ($self->{is_xml} and
2583                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2584              
2585              ## XML5: case-sensitive.
2586              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2587                              text => 'DOCTYPE',
2588                              line => $self->{line_prev},
2589                              column => $self->{column_prev} - 5);
2590            } else {
2591              
2592            }
2593          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
2594          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
2595                                    quirks => 1,                                    quirks => 1,
# Line 2192  sub _get_next_token ($) { Line 2612  sub _get_next_token ($) {
2612                                    
2613          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2614                          line => $self->{line_prev},                          line => $self->{line_prev},
2615                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2616          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2617          ## Reconsume.          ## Reconsume.
2618          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2619                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2620                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2621                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2622                                   };                                   };
2623          redo A;          redo A;
2624        }        }
# Line 2209  sub _get_next_token ($) { Line 2629  sub _get_next_token ($) {
2629              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
2630              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
2631              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
2632            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
2633                    
2634          ## Stay in the state.          ## Stay in the state.
2635          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2636                    
2637      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2638        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2225  sub _get_next_token ($) { Line 2645  sub _get_next_token ($) {
2645      }      }
2646        
2647          redo A;          redo A;
2648        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
2649                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2650                    if ($self->{is_xml} and
2651                not $self->{tainted} and
2652                @{$self->{open_elements} or []} == 0) {
2653              
2654              $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2655                              line => $self->{line_prev},
2656                              column => $self->{column_prev} - 7);
2657              $self->{tainted} = 1;
2658            } else {
2659              
2660            }
2661    
2662          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
2663                                    data => '',                                    data => '',
2664                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 2249  sub _get_next_token ($) { Line 2680  sub _get_next_token ($) {
2680                    
2681          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2682                          line => $self->{line_prev},                          line => $self->{line_prev},
2683                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2684          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2685          ## Reconsume.          ## Reconsume.
2686          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2687                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2688                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2689                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2690                                   };                                   };
2691          redo A;          redo A;
2692        }        }
# Line 2276  sub _get_next_token ($) { Line 2707  sub _get_next_token ($) {
2707        
2708          redo A;          redo A;
2709        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2710          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2711          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2712          $self->{s_kwd} = '';            
2713              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2714            } else {
2715              
2716              $self->{state} = DATA_STATE;
2717              $self->{s_kwd} = '';
2718            }
2719                    
2720      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2721        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2296  sub _get_next_token ($) { Line 2732  sub _get_next_token ($) {
2732    
2733          redo A;          redo A;
2734        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2735          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2736          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2737          $self->{s_kwd} = '';            
2738              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2739            } else {
2740              
2741              $self->{state} = DATA_STATE;
2742              $self->{s_kwd} = '';
2743            }
2744          ## reconsume          ## reconsume
2745    
2746          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2340  sub _get_next_token ($) { Line 2781  sub _get_next_token ($) {
2781        
2782          redo A;          redo A;
2783        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2784          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2785          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2786          $self->{s_kwd} = '';            
2787              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2788            } else {
2789              
2790              $self->{state} = DATA_STATE;
2791              $self->{s_kwd} = '';
2792            }
2793                    
2794      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2795        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2360  sub _get_next_token ($) { Line 2806  sub _get_next_token ($) {
2806    
2807          redo A;          redo A;
2808        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2809          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2810          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2811          $self->{s_kwd} = '';            
2812              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2813            } else {
2814              
2815              $self->{state} = DATA_STATE;
2816              $self->{s_kwd} = '';
2817            }
2818          ## reconsume          ## reconsume
2819    
2820          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2388  sub _get_next_token ($) { Line 2839  sub _get_next_token ($) {
2839          redo A;          redo A;
2840        }        }
2841      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
2842          ## XML5: "Comment state" and "DOCTYPE comment state".
2843    
2844        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2845                    
2846          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
# Line 2404  sub _get_next_token ($) { Line 2857  sub _get_next_token ($) {
2857        
2858          redo A;          redo A;
2859        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2860          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2861          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2862          $self->{s_kwd} = '';            
2863              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2864            } else {
2865              
2866              $self->{state} = DATA_STATE;
2867              $self->{s_kwd} = '';
2868            }
2869          ## reconsume          ## reconsume
2870    
2871          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2435  sub _get_next_token ($) { Line 2893  sub _get_next_token ($) {
2893          redo A;          redo A;
2894        }        }
2895      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2896          ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2897    
2898        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2899                    
2900          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 2451  sub _get_next_token ($) { Line 2911  sub _get_next_token ($) {
2911        
2912          redo A;          redo A;
2913        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2914          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2915          $self->{s_kwd} = '';          if ($self->{in_subset}) {
2916          $self->{state} = DATA_STATE;            
2917          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2918            } else {
2919              
2920              $self->{state} = DATA_STATE;
2921              $self->{s_kwd} = '';
2922            }
2923          ## reconsume          ## reconsume
2924    
2925          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2479  sub _get_next_token ($) { Line 2943  sub _get_next_token ($) {
2943          redo A;          redo A;
2944        }        }
2945      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
2946          ## XML5: "Comment end state" and "DOCTYPE comment end state".
2947    
2948        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2949                    if ($self->{in_subset}) {
2950          $self->{state} = DATA_STATE;            
2951          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2952            } else {
2953              
2954              $self->{state} = DATA_STATE;
2955              $self->{s_kwd} = '';
2956            }
2957                    
2958      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2959        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2500  sub _get_next_token ($) { Line 2971  sub _get_next_token ($) {
2971          redo A;          redo A;
2972        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2973                    
2974            ## XML5: Not a parse error.
2975          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2976                          line => $self->{line_prev},                          line => $self->{line_prev},
2977                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2518  sub _get_next_token ($) { Line 2990  sub _get_next_token ($) {
2990        
2991          redo A;          redo A;
2992        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2993          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2994          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2995          $self->{s_kwd} = '';            
2996              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2997            } else {
2998              
2999              $self->{state} = DATA_STATE;
3000              $self->{s_kwd} = '';
3001            }
3002          ## reconsume          ## reconsume
3003    
3004          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2529  sub _get_next_token ($) { Line 3006  sub _get_next_token ($) {
3006          redo A;          redo A;
3007        } else {        } else {
3008                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',  
                         line => $self->{line_prev},  
                         column => $self->{column_prev});  
3009          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3010          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
3011                    
# Line 2563  sub _get_next_token ($) { Line 3037  sub _get_next_token ($) {
3037      }      }
3038        
3039          redo A;          redo A;
3040          } elsif ($self->{nc} == -1) {
3041            
3042            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3043            $self->{ct}->{quirks} = 1;
3044    
3045            $self->{state} = DATA_STATE;
3046            ## Reconsume.
3047            return  ($self->{ct}); # DOCTYPE (quirks)
3048    
3049            redo A;
3050        } else {        } else {
3051                    
3052            ## XML5: Switch to the bogus comment state.
3053          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3054          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3055          ## reconsume          ## reconsume
3056          redo A;          redo A;
3057        }        }
3058      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3059          ## XML5: "DOCTYPE root name before state".
3060    
3061        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3062                    
3063          ## Stay in the state          ## Stay in the state
# Line 2588  sub _get_next_token ($) { Line 3075  sub _get_next_token ($) {
3075          redo A;          redo A;
3076        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3077                    
3078            ## XML5: No parse error.
3079          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3080          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3081          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 2606  sub _get_next_token ($) { Line 3094  sub _get_next_token ($) {
3094          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3095    
3096          redo A;          redo A;
3097          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3098            
3099            $self->{ct}->{name} # DOCTYPE
3100                = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3101            delete $self->{ct}->{quirks};
3102            $self->{state} = DOCTYPE_NAME_STATE;
3103            
3104        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3105          $self->{line_prev} = $self->{line};
3106          $self->{column_prev} = $self->{column};
3107          $self->{column}++;
3108          $self->{nc}
3109              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3110        } else {
3111          $self->{set_nc}->($self);
3112        }
3113      
3114            redo A;
3115        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3116                    
3117          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
# Line 2616  sub _get_next_token ($) { Line 3122  sub _get_next_token ($) {
3122          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3123    
3124          redo A;          redo A;
3125          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3126            
3127            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3128            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3129            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3130            $self->{in_subset} = 1;
3131            
3132        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3133          $self->{line_prev} = $self->{line};
3134          $self->{column_prev} = $self->{column};
3135          $self->{column}++;
3136          $self->{nc}
3137              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3138        } else {
3139          $self->{set_nc}->($self);
3140        }
3141      
3142            return  ($self->{ct}); # DOCTYPE
3143            redo A;
3144        } else {        } else {
3145                    
3146          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 2635  sub _get_next_token ($) { Line 3160  sub _get_next_token ($) {
3160          redo A;          redo A;
3161        }        }
3162      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3163  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
3164    
3165          ## ISSUE: Redundant "First," in the spec.
3166    
3167        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3168                    
3169          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 2670  sub _get_next_token ($) { Line 3198  sub _get_next_token ($) {
3198          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3199    
3200          redo A;          redo A;
3201          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3202            
3203            $self->{ct}->{name} # DOCTYPE
3204                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3205            delete $self->{ct}->{quirks};
3206            ## Stay in the state.
3207            
3208        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3209          $self->{line_prev} = $self->{line};
3210          $self->{column_prev} = $self->{column};
3211          $self->{column}++;
3212          $self->{nc}
3213              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3214        } else {
3215          $self->{set_nc}->($self);
3216        }
3217      
3218            redo A;
3219        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3220                    
3221          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
# Line 2681  sub _get_next_token ($) { Line 3227  sub _get_next_token ($) {
3227          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3228    
3229          redo A;          redo A;
3230          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3231            
3232            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3233            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3234            $self->{in_subset} = 1;
3235            
3236        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3237          $self->{line_prev} = $self->{line};
3238          $self->{column_prev} = $self->{column};
3239          $self->{column}++;
3240          $self->{nc}
3241              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3242        } else {
3243          $self->{set_nc}->($self);
3244        }
3245      
3246            return  ($self->{ct}); # DOCTYPE
3247            redo A;
3248        } else {        } else {
3249                    
3250          $self->{ct}->{name}          $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3251            .= chr ($self->{nc}); # DOCTYPE          ## Stay in the state.
         ## Stay in the state  
3252                    
3253      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3254        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2700  sub _get_next_token ($) { Line 3263  sub _get_next_token ($) {
3263          redo A;          redo A;
3264        }        }
3265      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3266          ## XML5: Corresponding to XML5's "DOCTYPE root name after
3267          ## state", but implemented differently.
3268    
3269        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3270                    
3271          ## Stay in the state          ## Stay in the state
# Line 2716  sub _get_next_token ($) { Line 3282  sub _get_next_token ($) {
3282        
3283          redo A;          redo A;
3284        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3285            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3286              
3287              $self->{state} = DATA_STATE;
3288              $self->{s_kwd} = '';
3289            } else {
3290              
3291              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3292              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3293            }
3294                    
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3295                    
3296      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3297        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2730  sub _get_next_token ($) { Line 3303  sub _get_next_token ($) {
3303        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3304      }      }
3305        
3306            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3307          redo A;          redo A;
3308        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3309            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3310              
3311              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3312              $self->{state} = DATA_STATE;
3313              $self->{s_kwd} = '';
3314              $self->{ct}->{quirks} = 1;
3315            } else {
3316              
3317              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3318              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3319            }
3320                    
3321          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          ## Reconsume.
3322          $self->{state} = DATA_STATE;          return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{s_kwd} = '';  
         ## reconsume  
   
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3323          redo A;          redo A;
3324        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
3325                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
3326            
3327          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
3328          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3329                    
3330      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3331        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2763  sub _get_next_token ($) { Line 3340  sub _get_next_token ($) {
3340          redo A;          redo A;
3341        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
3342                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
3343            
3344          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
3345          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3346                    
3347      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3348        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2777  sub _get_next_token ($) { Line 3355  sub _get_next_token ($) {
3355      }      }
3356        
3357          redo A;          redo A;
3358        } else {        } elsif ($self->{nc} == 0x0022 and # "
3359                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3360                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3361                    
3362          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');          $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3363          $self->{ct}->{quirks} = 1;          $self->{ct}->{value} = ''; # ENTITY
3364            
3365        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3366          $self->{line_prev} = $self->{line};
3367          $self->{column_prev} = $self->{column};
3368          $self->{column}++;
3369          $self->{nc}
3370              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3371        } else {
3372          $self->{set_nc}->($self);
3373        }
3374      
3375            redo A;
3376          } elsif ($self->{nc} == 0x0027 and # '
3377                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3378                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3379            
3380            $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3381            $self->{ct}->{value} = ''; # ENTITY
3382            
3383        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3384          $self->{line_prev} = $self->{line};
3385          $self->{column_prev} = $self->{column};
3386          $self->{column}++;
3387          $self->{nc}
3388              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3389        } else {
3390          $self->{set_nc}->($self);
3391        }
3392      
3393            redo A;
3394          } elsif ($self->{is_xml} and
3395                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3396                   $self->{nc} == 0x005B) { # [
3397            
3398            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3399            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3400            $self->{in_subset} = 1;
3401            
3402        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3403          $self->{line_prev} = $self->{line};
3404          $self->{column_prev} = $self->{column};
3405          $self->{column}++;
3406          $self->{nc}
3407              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3408        } else {
3409          $self->{set_nc}->($self);
3410        }
3411      
3412            return  ($self->{ct}); # DOCTYPE
3413            redo A;
3414          } else {
3415            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3416    
3417            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3418              
3419              $self->{ct}->{quirks} = 1;
3420              $self->{state} = BOGUS_DOCTYPE_STATE;
3421            } else {
3422              
3423              $self->{state} = BOGUS_MD_STATE;
3424            }
3425    
         $self->{state} = BOGUS_DOCTYPE_STATE;  
3426                    
3427      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3428        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2804  sub _get_next_token ($) { Line 3444  sub _get_next_token ($) {
3444              0x0042, # B              0x0042, # B
3445              0x004C, # L              0x004C, # L
3446              0x0049, # I              0x0049, # I
3447            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3448            $self->{nc} == [            $self->{nc} == [
3449              undef,              undef,
3450              0x0075, # u              0x0075, # u
3451              0x0062, # b              0x0062, # b
3452              0x006C, # l              0x006C, # l
3453              0x0069, # i              0x0069, # i
3454            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3455                    
3456          ## Stay in the state.          ## Stay in the state.
3457          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3458                    
3459      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3460        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2827  sub _get_next_token ($) { Line 3467  sub _get_next_token ($) {
3467      }      }
3468        
3469          redo A;          redo A;
3470        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3471                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
3472                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
3473                    if ($self->{is_xml} and
3474                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3475              
3476              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3477                              text => 'PUBLIC',
3478                              line => $self->{line_prev},
3479                              column => $self->{column_prev} - 4);
3480            } else {
3481              
3482            }
3483          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3484                    
3485      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2845  sub _get_next_token ($) { Line 3494  sub _get_next_token ($) {
3494        
3495          redo A;          redo A;
3496        } else {        } else {
3497                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3498                          line => $self->{line_prev},                          line => $self->{line_prev},
3499                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3500          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3501              
3502          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3503              $self->{state} = BOGUS_DOCTYPE_STATE;
3504            } else {
3505              
3506              $self->{state} = BOGUS_MD_STATE;
3507            }
3508          ## Reconsume.          ## Reconsume.
3509          redo A;          redo A;
3510        }        }
# Line 2863  sub _get_next_token ($) { Line 3516  sub _get_next_token ($) {
3516              0x0053, # S              0x0053, # S
3517              0x0054, # T              0x0054, # T
3518              0x0045, # E              0x0045, # E
3519            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3520            $self->{nc} == [            $self->{nc} == [
3521              undef,              undef,
3522              0x0079, # y              0x0079, # y
3523              0x0073, # s              0x0073, # s
3524              0x0074, # t              0x0074, # t
3525              0x0065, # e              0x0065, # e
3526            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3527                    
3528          ## Stay in the state.          ## Stay in the state.
3529          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3530                    
3531      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3532        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2886  sub _get_next_token ($) { Line 3539  sub _get_next_token ($) {
3539      }      }
3540        
3541          redo A;          redo A;
3542        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3543                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
3544                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
3545                    if ($self->{is_xml} and
3546                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3547              
3548              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3549                              text => 'SYSTEM',
3550                              line => $self->{line_prev},
3551                              column => $self->{column_prev} - 4);
3552            } else {
3553              
3554            }
3555          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3556                    
3557      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2904  sub _get_next_token ($) { Line 3566  sub _get_next_token ($) {
3566        
3567          redo A;          redo A;
3568        } else {        } else {
3569                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3570                          line => $self->{line_prev},                          line => $self->{line_prev},
3571                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3572          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3573              
3574          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3575              $self->{state} = BOGUS_DOCTYPE_STATE;
3576            } else {
3577              
3578              $self->{state} = BOGUS_MD_STATE;
3579            }
3580          ## Reconsume.          ## Reconsume.
3581          redo A;          redo A;
3582        }        }
# Line 2963  sub _get_next_token ($) { Line 3629  sub _get_next_token ($) {
3629        
3630          redo A;          redo A;
3631        } elsif ($self->{nc} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
           
3632          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3633            
3634          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3635          $self->{s_kwd} = '';            
3636              $self->{state} = DATA_STATE;
3637              $self->{s_kwd} = '';
3638              $self->{ct}->{quirks} = 1;
3639            } else {
3640              
3641              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3642            }
3643            
3644                    
3645      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3646        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2979  sub _get_next_token ($) { Line 3652  sub _get_next_token ($) {
3652        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3653      }      }
3654        
3655            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3656          redo A;          redo A;
3657        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3658            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3659              
3660              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3661              $self->{state} = DATA_STATE;
3662              $self->{s_kwd} = '';
3663              $self->{ct}->{quirks} = 1;
3664            } else {
3665              
3666              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3667              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3668            }
3669                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3670          ## reconsume          ## reconsume
   
         $self->{ct}->{quirks} = 1;  
3671          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3672          redo A;          redo A;
3673        } else {        } elsif ($self->{is_xml} and
3674                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3675                   $self->{nc} == 0x005B) { # [
3676            
3677            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3678            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3679            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3680            $self->{in_subset} = 1;
3681                    
3682        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3683          $self->{line_prev} = $self->{line};
3684          $self->{column_prev} = $self->{column};
3685          $self->{column}++;
3686          $self->{nc}
3687              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3688        } else {
3689          $self->{set_nc}->($self);
3690        }
3691      
3692            return  ($self->{ct}); # DOCTYPE
3693            redo A;
3694          } else {
3695          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
         $self->{ct}->{quirks} = 1;  
3696    
3697          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3698              
3699              $self->{ct}->{quirks} = 1;
3700              $self->{state} = BOGUS_DOCTYPE_STATE;
3701            } else {
3702              
3703              $self->{state} = BOGUS_MD_STATE;
3704            }
3705    
3706                    
3707      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3708        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3032  sub _get_next_token ($) { Line 3733  sub _get_next_token ($) {
3733        
3734          redo A;          redo A;
3735        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3736          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3737    
3738          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3739          $self->{s_kwd} = '';            
3740              $self->{state} = DATA_STATE;
3741              $self->{s_kwd} = '';
3742              $self->{ct}->{quirks} = 1;
3743            } else {
3744              
3745              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3746            }
3747    
3748                    
3749      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3750        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3048  sub _get_next_token ($) { Line 3756  sub _get_next_token ($) {
3756        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3757      }      }
3758        
3759            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3760          redo A;          redo A;
3761        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3762          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3763    
3764          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3765          $self->{s_kwd} = '';            
3766          ## reconsume            $self->{state} = DATA_STATE;
3767              $self->{s_kwd} = '';
3768          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
3769            } else {
3770              
3771              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3772            }
3773            
3774            ## Reconsume.
3775          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3776          redo A;          redo A;
3777        } else {        } else {
3778                    
3779          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3780          $self->{read_until}->($self->{ct}->{pubid}, q[">],          $self->{read_until}->($self->{ct}->{pubid}, q[">],
3781                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3782    
# Line 3103  sub _get_next_token ($) { Line 3811  sub _get_next_token ($) {
3811        
3812          redo A;          redo A;
3813        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3814          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3815    
3816          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3817          $self->{s_kwd} = '';            
3818              $self->{state} = DATA_STATE;
3819              $self->{s_kwd} = '';
3820              $self->{ct}->{quirks} = 1;
3821            } else {
3822              
3823              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3824            }
3825    
3826                    
3827      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3828        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3119  sub _get_next_token ($) { Line 3834  sub _get_next_token ($) {
3834        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3835      }      }
3836        
3837            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3838          redo A;          redo A;
3839        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3840          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3841    
3842          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3843          $self->{s_kwd} = '';            
3844              $self->{state} = DATA_STATE;
3845              $self->{s_kwd} = '';
3846              $self->{ct}->{quirks} = 1;
3847            } else {
3848              
3849              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3850            }
3851          
3852          ## reconsume          ## reconsume
3853            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3854          redo A;          redo A;
3855        } else {        } else {
3856                    
3857          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3858          $self->{read_until}->($self->{ct}->{pubid}, q['>],          $self->{read_until}->($self->{ct}->{pubid}, q['>],
3859                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3860    
# Line 3175  sub _get_next_token ($) { Line 3890  sub _get_next_token ($) {
3890          redo A;          redo A;
3891        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
3892                    
3893          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3894          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3895                    
3896      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3191  sub _get_next_token ($) { Line 3906  sub _get_next_token ($) {
3906          redo A;          redo A;
3907        } elsif ($self->{nc} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
3908                    
3909          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3910          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3911                    
3912      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3206  sub _get_next_token ($) { Line 3921  sub _get_next_token ($) {
3921        
3922          redo A;          redo A;
3923        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3924            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3925              if ($self->{is_xml}) {
3926                
3927                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3928              } else {
3929                
3930              }
3931              $self->{state} = DATA_STATE;
3932              $self->{s_kwd} = '';
3933            } else {
3934              if ($self->{ct}->{type} == NOTATION_TOKEN) {
3935                
3936              } else {
3937                
3938                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');            
3939              }
3940              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3941            }
3942                    
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3943                    
3944      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3945        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3220  sub _get_next_token ($) { Line 3951  sub _get_next_token ($) {
3951        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3952      }      }
3953        
3954            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3955          redo A;          redo A;
3956        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3957            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3958              
3959              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3960              
3961              $self->{state} = DATA_STATE;
3962              $self->{s_kwd} = '';
3963              $self->{ct}->{quirks} = 1;
3964            } else {
3965              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3966              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3967            }
3968                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3969          ## reconsume          ## reconsume
3970            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3971          $self->{ct}->{quirks} = 1;          redo A;
3972          } elsif ($self->{is_xml} and
3973                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3974                   $self->{nc} == 0x005B) { # [
3975            
3976            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3977            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3978            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3979            $self->{in_subset} = 1;
3980            
3981        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3982          $self->{line_prev} = $self->{line};
3983          $self->{column_prev} = $self->{column};
3984          $self->{column}++;
3985          $self->{nc}
3986              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3987        } else {
3988          $self->{set_nc}->($self);
3989        }
3990      
3991          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3992          redo A;          redo A;
3993        } else {        } else {
           
3994          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
         $self->{ct}->{quirks} = 1;  
3995    
3996          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3997              
3998              $self->{ct}->{quirks} = 1;
3999              $self->{state} = BOGUS_DOCTYPE_STATE;
4000            } else {
4001              
4002              $self->{state} = BOGUS_MD_STATE;
4003            }
4004    
4005                    
4006      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4007        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3304  sub _get_next_token ($) { Line 4064  sub _get_next_token ($) {
4064        
4065          redo A;          redo A;
4066        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
4067          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
4068                    
4069      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4070        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3320  sub _get_next_token ($) { Line 4077  sub _get_next_token ($) {
4077      }      }
4078        
4079    
4080          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4081          return  ($self->{ct}); # DOCTYPE            
4082              $self->{state} = DATA_STATE;
4083              $self->{s_kwd} = '';
4084              $self->{ct}->{quirks} = 1;
4085            } else {
4086              
4087              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4088            }
4089    
4090            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4091          redo A;          redo A;
4092        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4093            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4094              
4095              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4096              $self->{state} = DATA_STATE;
4097              $self->{s_kwd} = '';
4098              $self->{ct}->{quirks} = 1;
4099            } else {
4100              
4101              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4102              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4103            }
4104                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
4105          ## reconsume          ## reconsume
4106            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4107            redo A;
4108          } elsif ($self->{is_xml} and
4109                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4110                   $self->{nc} == 0x005B) { # [
4111            
4112            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4113    
4114          $self->{ct}->{quirks} = 1;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4115            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4116            $self->{in_subset} = 1;
4117            
4118        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4119          $self->{line_prev} = $self->{line};
4120          $self->{column_prev} = $self->{column};
4121          $self->{column}++;
4122          $self->{nc}
4123              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4124        } else {
4125          $self->{set_nc}->($self);
4126        }
4127      
4128          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
4129          redo A;          redo A;
4130        } else {        } else {
           
4131          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
         $self->{ct}->{quirks} = 1;  
4132    
4133          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4134                        
4135              $self->{ct}->{quirks} = 1;
4136              $self->{state} = BOGUS_DOCTYPE_STATE;
4137            } else {
4138              
4139              $self->{state} = BOGUS_MD_STATE;
4140            }
4141    
4142                    
4143      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4144        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3371  sub _get_next_token ($) { Line 4168  sub _get_next_token ($) {
4168      }      }
4169        
4170          redo A;          redo A;
4171        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
           
4172          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4173    
4174          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4175          $self->{s_kwd} = '';            
4176              $self->{state} = DATA_STATE;
4177              $self->{s_kwd} = '';
4178              $self->{ct}->{quirks} = 1;
4179            } else {
4180              
4181              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4182            }
4183            
4184                    
4185      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4186        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3388  sub _get_next_token ($) { Line 4192  sub _get_next_token ($) {
4192        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4193      }      }
4194        
4195            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4196          redo A;          redo A;
4197        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4198          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4199    
4200          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4201          $self->{s_kwd} = '';            
4202              $self->{state} = DATA_STATE;
4203              $self->{s_kwd} = '';
4204              $self->{ct}->{quirks} = 1;
4205            } else {
4206              
4207              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4208            }
4209            
4210          ## reconsume          ## reconsume
4211            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4212          redo A;          redo A;
4213        } else {        } else {
4214                    
4215          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4216          $self->{read_until}->($self->{ct}->{sysid}, q[">],          $self->{read_until}->($self->{ct}->{sysid}, q[">],
4217                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4218    
# Line 3442  sub _get_next_token ($) { Line 4246  sub _get_next_token ($) {
4246      }      }
4247        
4248          redo A;          redo A;
4249        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4250                    
4251          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4252    
# Line 3465  sub _get_next_token ($) { Line 4269  sub _get_next_token ($) {
4269    
4270          redo A;          redo A;
4271        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4272          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4273    
4274          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4275          $self->{s_kwd} = '';            
4276          ## reconsume            $self->{state} = DATA_STATE;
4277              $self->{s_kwd} = '';
4278          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
4279          return  ($self->{ct}); # DOCTYPE          } else {
4280              
4281              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4282            }
4283    
4284            ## reconsume
4285            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4286          redo A;          redo A;
4287        } else {        } else {
4288                    
4289          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4290          $self->{read_until}->($self->{ct}->{sysid}, q['>],          $self->{read_until}->($self->{ct}->{sysid}, q['>],
4291                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4292    
# Line 3499  sub _get_next_token ($) { Line 4306  sub _get_next_token ($) {
4306        }        }
4307      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4308        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4309                    if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4310          ## Stay in the state            
4311              $self->{state} = BEFORE_NDATA_STATE;
4312            } else {
4313              
4314              ## Stay in the state
4315            }
4316                    
4317      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4318        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3514  sub _get_next_token ($) { Line 4326  sub _get_next_token ($) {
4326        
4327          redo A;          redo A;
4328        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4329            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4330              
4331              $self->{state} = DATA_STATE;
4332              $self->{s_kwd} = '';
4333            } else {
4334              
4335              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4336            }
4337    
4338                    
4339          $self->{state} = DATA_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4340          $self->{s_kwd} = '';        $self->{line_prev} = $self->{line};
4341          $self->{column_prev} = $self->{column};
4342          $self->{column}++;
4343          $self->{nc}
4344              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4345        } else {
4346          $self->{set_nc}->($self);
4347        }
4348      
4349            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4350            redo A;
4351          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4352                   ($self->{nc} == 0x004E or # N
4353                    $self->{nc} == 0x006E)) { # n
4354            
4355            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4356            $self->{state} = NDATA_STATE;
4357            $self->{kwd} = chr $self->{nc};
4358                    
4359      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4360        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3528  sub _get_next_token ($) { Line 4366  sub _get_next_token ($) {
4366        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4367      }      }
4368        
4369            redo A;
4370          } elsif ($self->{nc} == -1) {
4371            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4372              
4373              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4374              $self->{state} = DATA_STATE;
4375              $self->{s_kwd} = '';
4376              $self->{ct}->{quirks} = 1;
4377            } else {
4378              
4379              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4380              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4381            }
4382    
4383            ## reconsume
4384            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4385            redo A;
4386          } elsif ($self->{is_xml} and
4387                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4388                   $self->{nc} == 0x005B) { # [
4389            
4390            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4391            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4392            $self->{in_subset} = 1;
4393            
4394        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4395          $self->{line_prev} = $self->{line};
4396          $self->{column_prev} = $self->{column};
4397          $self->{column}++;
4398          $self->{nc}
4399              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4400        } else {
4401          $self->{set_nc}->($self);
4402        }
4403      
4404          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4405            redo A;
4406          } else {
4407            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4408    
4409            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4410              
4411              #$self->{ct}->{quirks} = 1;
4412              $self->{state} = BOGUS_DOCTYPE_STATE;
4413            } else {
4414              
4415              $self->{state} = BOGUS_MD_STATE;
4416            }
4417    
4418            
4419        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4420          $self->{line_prev} = $self->{line};
4421          $self->{column_prev} = $self->{column};
4422          $self->{column}++;
4423          $self->{nc}
4424              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4425        } else {
4426          $self->{set_nc}->($self);
4427        }
4428      
4429            redo A;
4430          }
4431        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4432          if ($is_space->{$self->{nc}}) {
4433            
4434            ## Stay in the state.
4435            
4436        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4437          $self->{line_prev} = $self->{line};
4438          $self->{column_prev} = $self->{column};
4439          $self->{column}++;
4440          $self->{nc}
4441              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4442        } else {
4443          $self->{set_nc}->($self);
4444        }
4445      
4446            redo A;
4447          } elsif ($self->{nc} == 0x003E) { # >
4448            
4449            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4450            
4451        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4452          $self->{line_prev} = $self->{line};
4453          $self->{column_prev} = $self->{column};
4454          $self->{column}++;
4455          $self->{nc}
4456              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4457        } else {
4458          $self->{set_nc}->($self);
4459        }
4460      
4461            return  ($self->{ct}); # ENTITY
4462            redo A;
4463          } elsif ($self->{nc} == 0x004E or # N
4464                   $self->{nc} == 0x006E) { # n
4465            
4466            $self->{state} = NDATA_STATE;
4467            $self->{kwd} = chr $self->{nc};
4468            
4469        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4470          $self->{line_prev} = $self->{line};
4471          $self->{column_prev} = $self->{column};
4472          $self->{column}++;
4473          $self->{nc}
4474              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4475        } else {
4476          $self->{set_nc}->($self);
4477        }
4478      
4479          redo A;          redo A;
4480        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4481                    
4482          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4483          $self->{state} = DATA_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
         $self->{s_kwd} = '';  
4484          ## reconsume          ## reconsume
4485            return  ($self->{ct}); # ENTITY
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4486          redo A;          redo A;
4487        } else {        } else {
4488                    
4489          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4490          #$self->{ct}->{quirks} = 1;          $self->{state} = BOGUS_MD_STATE;
   
         $self->{state} = BOGUS_DOCTYPE_STATE;  
4491                    
4492      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4493        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3582  sub _get_next_token ($) { Line 4521  sub _get_next_token ($) {
4521          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4522    
4523          redo A;          redo A;
4524          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4525            
4526            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4527            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4528            $self->{in_subset} = 1;
4529            
4530        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4531          $self->{line_prev} = $self->{line};
4532          $self->{column_prev} = $self->{column};
4533          $self->{column}++;
4534          $self->{nc}
4535              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4536        } else {
4537          $self->{set_nc}->($self);
4538        }
4539      
4540            return  ($self->{ct}); # DOCTYPE
4541            redo A;
4542        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4543                    
4544          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 3594  sub _get_next_token ($) { Line 4551  sub _get_next_token ($) {
4551        } else {        } else {
4552                    
4553          my $s = '';          my $s = '';
4554          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
4555    
4556          ## Stay in the state          ## Stay in the state
4557                    
# Line 3614  sub _get_next_token ($) { Line 4571  sub _get_next_token ($) {
4571        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
4572        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4573        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
4574    
4575          ## XML5: "CDATA state".
4576                
4577        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4578                    
# Line 3631  sub _get_next_token ($) { Line 4590  sub _get_next_token ($) {
4590        
4591          redo A;          redo A;
4592        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4593            if ($self->{is_xml}) {
4594              
4595              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4596            } else {
4597              
4598            }
4599    
4600          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4601          $self->{s_kwd} = '';          $self->{s_kwd} = '';
4602                    ## Reconsume.
     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {  
       $self->{line_prev} = $self->{line};  
       $self->{column_prev} = $self->{column};  
       $self->{column}++;  
       $self->{nc}  
           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);  
     } else {  
       $self->{set_nc}->($self);  
     }  
     
4603          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
4604                        
4605            return  ($self->{ct}); # character            return  ($self->{ct}); # character
# Line 3676  sub _get_next_token ($) { Line 4632  sub _get_next_token ($) {
4632    
4633        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
4634      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4635          ## XML5: "CDATA bracket state".
4636    
4637        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4638                    
4639          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 3693  sub _get_next_token ($) { Line 4651  sub _get_next_token ($) {
4651          redo A;          redo A;
4652        } else {        } else {
4653                    
4654            ## XML5: If EOF, "]" is not appended and changed to the data state.
4655          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
4656          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4657          ## Reconsume.          ## Reconsume.
4658          redo A;          redo A;
4659        }        }
4660      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4661          ## XML5: "CDATA end state".
4662    
4663        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4664          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4665          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 3741  sub _get_next_token ($) { Line 4702  sub _get_next_token ($) {
4702                    
4703          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
4704          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
4705          ## Reconsume.          ## Reconsume. ## XML5: Emit.
4706          redo A;          redo A;
4707        }        }
4708      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 3750  sub _get_next_token ($) { Line 4711  sub _get_next_token ($) {
4711              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4712              $self->{entity_add} => 1,              $self->{entity_add} => 1,
4713            }->{$self->{nc}}) {            }->{$self->{nc}}) {
4714                    if ($self->{is_xml}) {
4715              
4716              $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4717                              line => $self->{line_prev},
4718                              column => $self->{column_prev}
4719                                  + ($self->{nc} == -1 ? 1 : 0));
4720            } else {
4721              
4722              ## No error
4723            }
4724          ## Don't consume          ## Don't consume
         ## No error  
4725          ## Return nothing.          ## Return nothing.
4726          #          #
4727        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
4728                    
4729          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
4730          $self->{s_kwd} = '#';          $self->{kwd} = '#';
4731                    
4732      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4733        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3771  sub _get_next_token ($) { Line 4740  sub _get_next_token ($) {
4740      }      }
4741        
4742          redo A;          redo A;
4743        } elsif ((0x0041 <= $self->{nc} and        } elsif ($self->{is_xml} or
4744                   (0x0041 <= $self->{nc} and
4745                  $self->{nc} <= 0x005A) or # A..Z                  $self->{nc} <= 0x005A) or # A..Z
4746                 (0x0061 <= $self->{nc} and                 (0x0061 <= $self->{nc} and
4747                  $self->{nc} <= 0x007A)) { # a..z                  $self->{nc} <= 0x007A)) { # a..z
4748                    
4749          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
4750          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
4751          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
4752          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
4753          $self->{entity__match} = 0;          $self->{entity__match} = 0;
4754                    
4755      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3825  sub _get_next_token ($) { Line 4795  sub _get_next_token ($) {
4795          redo A;          redo A;
4796        }        }
4797      } elsif ($self->{state} == ENTITY_HASH_STATE) {      } elsif ($self->{state} == ENTITY_HASH_STATE) {
4798        if ($self->{nc} == 0x0078 or # x        if ($self->{nc} == 0x0078) { # x
           $self->{nc} == 0x0058) { # X  
4799                    
4800          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
4801          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4802            
4803        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4804          $self->{line_prev} = $self->{line};
4805          $self->{column_prev} = $self->{column};
4806          $self->{column}++;
4807          $self->{nc}
4808              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4809        } else {
4810          $self->{set_nc}->($self);
4811        }
4812      
4813            redo A;
4814          } elsif ($self->{nc} == 0x0058) { # X
4815            
4816            if ($self->{is_xml}) {
4817              $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4818            }
4819            $self->{state} = HEXREF_X_STATE;
4820            $self->{kwd} .= chr $self->{nc};
4821                    
4822      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4823        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3846  sub _get_next_token ($) { Line 4834  sub _get_next_token ($) {
4834                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
4835                    
4836          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
4837          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
4838                    
4839      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4840        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3892  sub _get_next_token ($) { Line 4880  sub _get_next_token ($) {
4880        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
4881            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
4882                    
4883          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
4884          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4885                    
4886          ## Stay in the state.          ## Stay in the state.
4887                    
# Line 3929  sub _get_next_token ($) { Line 4917  sub _get_next_token ($) {
4917          #          #
4918        }        }
4919    
4920        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4921        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4922        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4923        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
4924              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4925              ($self->{is_xml} and $code == 0x0000)) {
4926                    
4927          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4928                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 3952  sub _get_next_token ($) { Line 4942  sub _get_next_token ($) {
4942          $self->{s_kwd} = '';          $self->{s_kwd} = '';
4943          ## Reconsume.          ## Reconsume.
4944          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4945                      has_reference => 1,
4946                    line => $l, column => $c,                    line => $l, column => $c,
4947                   });                   });
4948          redo A;          redo A;
# Line 3971  sub _get_next_token ($) { Line 4962  sub _get_next_token ($) {
4962          # 0..9, A..F, a..f          # 0..9, A..F, a..f
4963                    
4964          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
4965          $self->{s_kwd} = 0;          $self->{kwd} = 0;
4966          ## Reconsume.          ## Reconsume.
4967          redo A;          redo A;
4968        } else {        } else {
# Line 3989  sub _get_next_token ($) { Line 4980  sub _get_next_token ($) {
4980            $self->{s_kwd} = '';            $self->{s_kwd} = '';
4981            ## Reconsume.            ## Reconsume.
4982            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4983                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
4984                      line => $self->{line_prev},                      line => $self->{line_prev},
4985                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
4986                     });                     });
4987            redo A;            redo A;
4988          } else {          } else {
4989                        
4990            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
4991            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4992            $self->{s_kwd} = '';            $self->{s_kwd} = '';
4993            ## Reconsume.            ## Reconsume.
# Line 4007  sub _get_next_token ($) { Line 4998  sub _get_next_token ($) {
4998        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4999          # 0..9          # 0..9
5000                    
5001          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
5002          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
5003          ## Stay in the state.          ## Stay in the state.
5004                    
5005      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4025  sub _get_next_token ($) { Line 5016  sub _get_next_token ($) {
5016        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
5017                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
5018                    
5019          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
5020          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
5021          ## Stay in the state.          ## Stay in the state.
5022                    
5023      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4043  sub _get_next_token ($) { Line 5034  sub _get_next_token ($) {
5034        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
5035                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
5036                    
5037          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
5038          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
5039          ## Stay in the state.          ## Stay in the state.
5040                    
5041      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4081  sub _get_next_token ($) { Line 5072  sub _get_next_token ($) {
5072          #          #
5073        }        }
5074    
5075        my $code = $self->{s_kwd};        my $code = $self->{kwd};
5076        my $l = $self->{line_prev};        my $l = $self->{line_prev};
5077        my $c = $self->{column_prev};        my $c = $self->{column_prev};
5078        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
5079              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5080              ($self->{is_xml} and $code == 0x0000)) {
5081                    
5082          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5083                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 4104  sub _get_next_token ($) { Line 5097  sub _get_next_token ($) {
5097          $self->{s_kwd} = '';          $self->{s_kwd} = '';
5098          ## Reconsume.          ## Reconsume.
5099          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
5100                      has_reference => 1,
5101                    line => $l, column => $c,                    line => $l, column => $c,
5102                   });                   });
5103          redo A;          redo A;
# Line 4117  sub _get_next_token ($) { Line 5111  sub _get_next_token ($) {
5111          redo A;          redo A;
5112        }        }
5113      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
5114        if (length $self->{s_kwd} < 30 and        if ((0x0041 <= $self->{nc} and # a
5115            ## NOTE: Some number greater than the maximum length of entity name             $self->{nc} <= 0x005A) or # x
5116            ((0x0041 <= $self->{nc} and # a            (0x0061 <= $self->{nc} and # a
5117              $self->{nc} <= 0x005A) or # x             $self->{nc} <= 0x007A) or # z
5118             (0x0061 <= $self->{nc} and # a            (0x0030 <= $self->{nc} and # 0
5119              $self->{nc} <= 0x007A) or # z             $self->{nc} <= 0x0039) or # 9
5120             (0x0030 <= $self->{nc} and # 0            $self->{nc} == 0x003B or # ;
5121              $self->{nc} <= 0x0039) or # 9            ($self->{is_xml} and
5122             $self->{nc} == 0x003B)) { # ;             not ($is_space->{$self->{nc}} or
5123                    {
5124                      0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5125                      $self->{entity_add} => 1,
5126                    }->{$self->{nc}}))) {
5127          our $EntityChar;          our $EntityChar;
5128          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
5129          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}} or
5130                $self->{ge}->{$self->{kwd}}) {
5131            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
5132                            if (defined $self->{ge}->{$self->{kwd}}) {
5133              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};                if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5134                    
5135                    $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5136                  } else {
5137                    if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5138                      
5139                      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5140                                      value => $self->{kwd});
5141                    } else {
5142                      
5143                    }
5144                    $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5145                  }
5146                } else {
5147                  if ($self->{is_xml}) {
5148                    
5149                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5150                                    value => $self->{kwd},
5151                                    level => {
5152                                              'amp;' => $self->{level}->{warn},
5153                                              'quot;' => $self->{level}->{warn},
5154                                              'lt;' => $self->{level}->{warn},
5155                                              'gt;' => $self->{level}->{warn},
5156                                              'apos;' => $self->{level}->{warn},
5157                                             }->{$self->{kwd}} ||
5158                                             $self->{level}->{must});
5159                  } else {
5160                    
5161                  }
5162                  $self->{entity__value} = $EntityChar->{$self->{kwd}};
5163                }
5164              $self->{entity__match} = 1;              $self->{entity__match} = 1;
5165                            
5166      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4147  sub _get_next_token ($) { Line 5176  sub _get_next_token ($) {
5176              #              #
5177            } else {            } else {
5178                            
5179              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
5180              $self->{entity__match} = -1;              $self->{entity__match} = -1;
5181              ## Stay in the state.              ## Stay in the state.
5182                            
# Line 4195  sub _get_next_token ($) { Line 5224  sub _get_next_token ($) {
5224          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
5225              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
5226                        
5227            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
5228            #            #
5229          } else {          } else {
5230                        
# Line 4207  sub _get_next_token ($) { Line 5236  sub _get_next_token ($) {
5236                    
5237          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5238                          line => $self->{line_prev},                          line => $self->{line_prev},
5239                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
5240          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
5241          #          #
5242        }        }
5243        
# Line 4229  sub _get_next_token ($) { Line 5258  sub _get_next_token ($) {
5258          ## Reconsume.          ## Reconsume.
5259          return  ({type => CHARACTER_TOKEN,          return  ({type => CHARACTER_TOKEN,
5260                    data => $data,                    data => $data,
5261                      has_reference => $has_ref,
5262                    line => $self->{line_prev},                    line => $self->{line_prev},
5263                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
5264                   });                   });
5265          redo A;          redo A;
5266        } else {        } else {
# Line 4242  sub _get_next_token ($) { Line 5272  sub _get_next_token ($) {
5272          ## Reconsume.          ## Reconsume.
5273          redo A;          redo A;
5274        }        }
5275    
5276        ## XML-only states
5277    
5278        } elsif ($self->{state} == PI_STATE) {
5279          ## XML5: "Pi state" and "DOCTYPE pi state".
5280    
5281          if ($is_space->{$self->{nc}} or
5282              $self->{nc} == 0x003F or # ?
5283              $self->{nc} == -1) {
5284            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5285            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
5286            ## "DOCTYPE pi state": Parse error, switch to the "data
5287            ## state".
5288            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5289                            line => $self->{line_prev},
5290                            column => $self->{column_prev}
5291                                - 1 * ($self->{nc} != -1));
5292            $self->{state} = BOGUS_COMMENT_STATE;
5293            ## Reconsume.
5294            $self->{ct} = {type => COMMENT_TOKEN,
5295                           data => '?',
5296                           line => $self->{line_prev},
5297                           column => $self->{column_prev}
5298                               - 1 * ($self->{nc} != -1),
5299                          };
5300            redo A;
5301          } else {
5302            ## XML5: "DOCTYPE pi state": Stay in the state.
5303            $self->{ct} = {type => PI_TOKEN,
5304                           target => chr $self->{nc},
5305                           data => '',
5306                           line => $self->{line_prev},
5307                           column => $self->{column_prev} - 1,
5308                          };
5309            $self->{state} = PI_TARGET_STATE;
5310            
5311        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5312          $self->{line_prev} = $self->{line};
5313          $self->{column_prev} = $self->{column};
5314          $self->{column}++;
5315          $self->{nc}
5316              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5317        } else {
5318          $self->{set_nc}->($self);
5319        }
5320      
5321            redo A;
5322          }
5323        } elsif ($self->{state} == PI_TARGET_STATE) {
5324          if ($is_space->{$self->{nc}}) {
5325            $self->{state} = PI_TARGET_AFTER_STATE;
5326            
5327        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5328          $self->{line_prev} = $self->{line};
5329          $self->{column_prev} = $self->{column};
5330          $self->{column}++;
5331          $self->{nc}
5332              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5333        } else {
5334          $self->{set_nc}->($self);
5335        }
5336      
5337            redo A;
5338          } elsif ($self->{nc} == -1) {
5339            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5340            if ($self->{in_subset}) {
5341              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5342            } else {
5343              $self->{state} = DATA_STATE;
5344              $self->{s_kwd} = '';
5345            }
5346            ## Reconsume.
5347            return  ($self->{ct}); # pi
5348            redo A;
5349          } elsif ($self->{nc} == 0x003F) { # ?
5350            $self->{state} = PI_AFTER_STATE;
5351            
5352        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5353          $self->{line_prev} = $self->{line};
5354          $self->{column_prev} = $self->{column};
5355          $self->{column}++;
5356          $self->{nc}
5357              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5358        } else {
5359          $self->{set_nc}->($self);
5360        }
5361      
5362            redo A;
5363          } else {
5364            ## XML5: typo ("tag name" -> "target")
5365            $self->{ct}->{target} .= chr $self->{nc}; # pi
5366            
5367        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5368          $self->{line_prev} = $self->{line};
5369          $self->{column_prev} = $self->{column};
5370          $self->{column}++;
5371          $self->{nc}
5372              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5373        } else {
5374          $self->{set_nc}->($self);
5375        }
5376      
5377            redo A;
5378          }
5379        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5380          if ($is_space->{$self->{nc}}) {
5381            ## Stay in the state.
5382            
5383        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5384          $self->{line_prev} = $self->{line};
5385          $self->{column_prev} = $self->{column};
5386          $self->{column}++;
5387          $self->{nc}
5388              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5389        } else {
5390          $self->{set_nc}->($self);
5391        }
5392      
5393            redo A;
5394          } else {
5395            $self->{state} = PI_DATA_STATE;
5396            ## Reprocess.
5397            redo A;
5398          }
5399        } elsif ($self->{state} == PI_DATA_STATE) {
5400          if ($self->{nc} == 0x003F) { # ?
5401            $self->{state} = PI_DATA_AFTER_STATE;
5402            
5403        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5404          $self->{line_prev} = $self->{line};
5405          $self->{column_prev} = $self->{column};
5406          $self->{column}++;
5407          $self->{nc}
5408              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5409        } else {
5410          $self->{set_nc}->($self);
5411        }
5412      
5413            redo A;
5414          } elsif ($self->{nc} == -1) {
5415            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5416            if ($self->{in_subset}) {
5417              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5418            } else {
5419              $self->{state} = DATA_STATE;
5420              $self->{s_kwd} = '';
5421            }
5422            ## Reprocess.
5423            return  ($self->{ct}); # pi
5424            redo A;
5425          } else {
5426            $self->{ct}->{data} .= chr $self->{nc}; # pi
5427            $self->{read_until}->($self->{ct}->{data}, q[?],
5428                                  length $self->{ct}->{data});
5429            ## Stay in the state.
5430            
5431        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5432          $self->{line_prev} = $self->{line};
5433          $self->{column_prev} = $self->{column};
5434          $self->{column}++;
5435          $self->{nc}
5436              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5437        } else {
5438          $self->{set_nc}->($self);
5439        }
5440      
5441            ## Continue in this state with the newly consumed character.
5442            redo A;
5443          }
5444        } elsif ($self->{state} == PI_AFTER_STATE) {
5445          ## XML5: Part of "Pi after state".
5446    
5447          if ($self->{nc} == 0x003E) { # >
5448            if ($self->{in_subset}) {
5449              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5450            } else {
5451              $self->{state} = DATA_STATE;
5452              $self->{s_kwd} = '';
5453            }
5454            
5455        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5456          $self->{line_prev} = $self->{line};
5457          $self->{column_prev} = $self->{column};
5458          $self->{column}++;
5459          $self->{nc}
5460              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5461        } else {
5462          $self->{set_nc}->($self);
5463        }
5464      
5465            return  ($self->{ct}); # pi
5466            redo A;
5467          } elsif ($self->{nc} == 0x003F) { # ?
5468            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5469                            line => $self->{line_prev},
5470                            column => $self->{column_prev}); ## XML5: no error
5471            $self->{ct}->{data} .= '?';
5472            $self->{state} = PI_DATA_AFTER_STATE;
5473            
5474        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5475          $self->{line_prev} = $self->{line};
5476          $self->{column_prev} = $self->{column};
5477          $self->{column}++;
5478          $self->{nc}
5479              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5480        } else {
5481          $self->{set_nc}->($self);
5482        }
5483      
5484            redo A;
5485          } else {
5486            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5487                            line => $self->{line_prev},
5488                            column => $self->{column_prev}
5489                                + 1 * ($self->{nc} == -1)); ## XML5: no error
5490            $self->{ct}->{data} .= '?'; ## XML5: not appended
5491            $self->{state} = PI_DATA_STATE;
5492            ## Reprocess.
5493            redo A;
5494          }
5495        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5496          ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5497    
5498          if ($self->{nc} == 0x003E) { # >
5499            if ($self->{in_subset}) {
5500              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5501            } else {
5502              $self->{state} = DATA_STATE;
5503              $self->{s_kwd} = '';
5504            }
5505            
5506        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5507          $self->{line_prev} = $self->{line};
5508          $self->{column_prev} = $self->{column};
5509          $self->{column}++;
5510          $self->{nc}
5511              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5512        } else {
5513          $self->{set_nc}->($self);
5514        }
5515      
5516            return  ($self->{ct}); # pi
5517            redo A;
5518          } elsif ($self->{nc} == 0x003F) { # ?
5519            $self->{ct}->{data} .= '?';
5520            ## Stay in the state.
5521            
5522        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5523          $self->{line_prev} = $self->{line};
5524          $self->{column_prev} = $self->{column};
5525          $self->{column}++;
5526          $self->{nc}
5527              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5528        } else {
5529          $self->{set_nc}->($self);
5530        }
5531      
5532            redo A;
5533          } else {
5534            $self->{ct}->{data} .= '?'; ## XML5: not appended
5535            $self->{state} = PI_DATA_STATE;
5536            ## Reprocess.
5537            redo A;
5538          }
5539    
5540        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5541          if ($self->{nc} == 0x003C) { # <
5542            $self->{state} = DOCTYPE_TAG_STATE;
5543            
5544        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5545          $self->{line_prev} = $self->{line};
5546          $self->{column_prev} = $self->{column};
5547          $self->{column}++;
5548          $self->{nc}
5549              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5550        } else {
5551          $self->{set_nc}->($self);
5552        }
5553      
5554            redo A;
5555          } elsif ($self->{nc} == 0x0025) { # %
5556            ## XML5: Not defined yet.
5557    
5558            ## TODO:
5559    
5560            if (not $self->{stop_processing} and
5561                not $self->{document}->xml_standalone) {
5562              $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5563                              level => $self->{level}->{info});
5564              $self->{stop_processing} = 1;
5565            }
5566    
5567            
5568        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5569          $self->{line_prev} = $self->{line};
5570          $self->{column_prev} = $self->{column};
5571          $self->{column}++;
5572          $self->{nc}
5573              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5574        } else {
5575          $self->{set_nc}->($self);
5576        }
5577      
5578            redo A;
5579          } elsif ($self->{nc} == 0x005D) { # ]
5580            delete $self->{in_subset};
5581            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5582            
5583        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5584          $self->{line_prev} = $self->{line};
5585          $self->{column_prev} = $self->{column};
5586          $self->{column}++;
5587          $self->{nc}
5588              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5589        } else {
5590          $self->{set_nc}->($self);
5591        }
5592      
5593            redo A;
5594          } elsif ($is_space->{$self->{nc}}) {
5595            ## Stay in the state.
5596            
5597        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5598          $self->{line_prev} = $self->{line};
5599          $self->{column_prev} = $self->{column};
5600          $self->{column}++;
5601          $self->{nc}
5602              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5603        } else {
5604          $self->{set_nc}->($self);
5605        }
5606      
5607            redo A;
5608          } elsif ($self->{nc} == -1) {
5609            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5610            delete $self->{in_subset};
5611            $self->{state} = DATA_STATE;
5612            $self->{s_kwd} = '';
5613            ## Reconsume.
5614            return  ({type => END_OF_DOCTYPE_TOKEN});
5615            redo A;
5616          } else {
5617            unless ($self->{internal_subset_tainted}) {
5618              ## XML5: No parse error.
5619              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5620              $self->{internal_subset_tainted} = 1;
5621            }
5622            ## Stay in the state.
5623            
5624        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5625          $self->{line_prev} = $self->{line};
5626          $self->{column_prev} = $self->{column};
5627          $self->{column}++;
5628          $self->{nc}
5629              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5630        } else {
5631          $self->{set_nc}->($self);
5632        }
5633      
5634            redo A;
5635          }
5636        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5637          if ($self->{nc} == 0x003E) { # >
5638            $self->{state} = DATA_STATE;
5639            $self->{s_kwd} = '';
5640            
5641        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5642          $self->{line_prev} = $self->{line};
5643          $self->{column_prev} = $self->{column};
5644          $self->{column}++;
5645          $self->{nc}
5646              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5647        } else {
5648          $self->{set_nc}->($self);
5649        }
5650      
5651            return  ({type => END_OF_DOCTYPE_TOKEN});
5652            redo A;
5653          } elsif ($self->{nc} == -1) {
5654            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5655            $self->{state} = DATA_STATE;
5656            $self->{s_kwd} = '';
5657            ## Reconsume.
5658            return  ({type => END_OF_DOCTYPE_TOKEN});
5659            redo A;
5660          } else {
5661            ## XML5: No parse error and stay in the state.
5662            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5663    
5664            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5665            
5666        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5667          $self->{line_prev} = $self->{line};
5668          $self->{column_prev} = $self->{column};
5669          $self->{column}++;
5670          $self->{nc}
5671              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5672        } else {
5673          $self->{set_nc}->($self);
5674        }
5675      
5676            redo A;
5677          }
5678        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5679          if ($self->{nc} == 0x003E) { # >
5680            $self->{state} = DATA_STATE;
5681            $self->{s_kwd} = '';
5682            
5683        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5684          $self->{line_prev} = $self->{line};
5685          $self->{column_prev} = $self->{column};
5686          $self->{column}++;
5687          $self->{nc}
5688              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5689        } else {
5690          $self->{set_nc}->($self);
5691        }
5692      
5693            return  ({type => END_OF_DOCTYPE_TOKEN});
5694            redo A;
5695          } elsif ($self->{nc} == -1) {
5696            $self->{state} = DATA_STATE;
5697            $self->{s_kwd} = '';
5698            ## Reconsume.
5699            return  ({type => END_OF_DOCTYPE_TOKEN});
5700            redo A;
5701          } else {
5702            ## Stay in the state.
5703            
5704        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5705          $self->{line_prev} = $self->{line};
5706          $self->{column_prev} = $self->{column};
5707          $self->{column}++;
5708          $self->{nc}
5709              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5710        } else {
5711          $self->{set_nc}->($self);
5712        }
5713      
5714            redo A;
5715          }
5716        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5717          if ($self->{nc} == 0x0021) { # !
5718            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5719            
5720        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5721          $self->{line_prev} = $self->{line};
5722          $self->{column_prev} = $self->{column};
5723          $self->{column}++;
5724          $self->{nc}
5725              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5726        } else {
5727          $self->{set_nc}->($self);
5728        }
5729      
5730            redo A;
5731          } elsif ($self->{nc} == 0x003F) { # ?
5732            $self->{state} = PI_STATE;
5733            
5734        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5735          $self->{line_prev} = $self->{line};
5736          $self->{column_prev} = $self->{column};
5737          $self->{column}++;
5738          $self->{nc}
5739              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5740        } else {
5741          $self->{set_nc}->($self);
5742        }
5743      
5744            redo A;
5745          } elsif ($self->{nc} == -1) {
5746            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5747            $self->{state} = DATA_STATE;
5748            $self->{s_kwd} = '';
5749            ## Reconsume.
5750            redo A;
5751          } else {
5752            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5753                            line => $self->{line_prev},
5754                            column => $self->{column_prev});
5755            $self->{state} = BOGUS_COMMENT_STATE;
5756            $self->{ct} = {type => COMMENT_TOKEN,
5757                           data => '',
5758                          }; ## NOTE: Will be discarded.
5759            
5760        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5761          $self->{line_prev} = $self->{line};
5762          $self->{column_prev} = $self->{column};
5763          $self->{column}++;
5764          $self->{nc}
5765              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5766        } else {
5767          $self->{set_nc}->($self);
5768        }
5769      
5770            redo A;
5771          }
5772        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5773          ## XML5: "DOCTYPE markup declaration state".
5774          
5775          if ($self->{nc} == 0x002D) { # -
5776            $self->{state} = MD_HYPHEN_STATE;
5777            
5778        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5779          $self->{line_prev} = $self->{line};
5780          $self->{column_prev} = $self->{column};
5781          $self->{column}++;
5782          $self->{nc}
5783              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5784        } else {
5785          $self->{set_nc}->($self);
5786        }
5787      
5788            redo A;
5789          } elsif ($self->{nc} == 0x0045 or # E
5790                   $self->{nc} == 0x0065) { # e
5791            $self->{state} = MD_E_STATE;
5792            $self->{kwd} = chr $self->{nc};
5793            
5794        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5795          $self->{line_prev} = $self->{line};
5796          $self->{column_prev} = $self->{column};
5797          $self->{column}++;
5798          $self->{nc}
5799              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5800        } else {
5801          $self->{set_nc}->($self);
5802        }
5803      
5804            redo A;
5805          } elsif ($self->{nc} == 0x0041 or # A
5806                   $self->{nc} == 0x0061) { # a
5807            $self->{state} = MD_ATTLIST_STATE;
5808            $self->{kwd} = chr $self->{nc};
5809            
5810        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5811          $self->{line_prev} = $self->{line};
5812          $self->{column_prev} = $self->{column};
5813          $self->{column}++;
5814          $self->{nc}
5815              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5816        } else {
5817          $self->{set_nc}->($self);
5818        }
5819      
5820            redo A;
5821          } elsif ($self->{nc} == 0x004E or # N
5822                   $self->{nc} == 0x006E) { # n
5823            $self->{state} = MD_NOTATION_STATE;
5824            $self->{kwd} = chr $self->{nc};
5825            
5826        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5827          $self->{line_prev} = $self->{line};
5828          $self->{column_prev} = $self->{column};
5829          $self->{column}++;
5830          $self->{nc}
5831              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5832        } else {
5833          $self->{set_nc}->($self);
5834        }
5835      
5836            redo A;
5837          } else {
5838            #
5839          }
5840          
5841          ## XML5: No parse error.
5842          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5843                          line => $self->{line_prev},
5844                          column => $self->{column_prev} - 1);
5845          ## Reconsume.
5846          $self->{state} = BOGUS_COMMENT_STATE;
5847          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5848          redo A;
5849        } elsif ($self->{state} == MD_E_STATE) {
5850          if ($self->{nc} == 0x004E or # N
5851              $self->{nc} == 0x006E) { # n
5852            $self->{state} = MD_ENTITY_STATE;
5853            $self->{kwd} .= chr $self->{nc};
5854            
5855        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5856          $self->{line_prev} = $self->{line};
5857          $self->{column_prev} = $self->{column};
5858          $self->{column}++;
5859          $self->{nc}
5860              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5861        } else {
5862          $self->{set_nc}->($self);
5863        }
5864      
5865            redo A;
5866          } elsif ($self->{nc} == 0x004C or # L
5867                   $self->{nc} == 0x006C) { # l
5868            ## XML5: <!ELEMENT> not supported.
5869            $self->{state} = MD_ELEMENT_STATE;
5870            $self->{kwd} .= chr $self->{nc};
5871            
5872        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5873          $self->{line_prev} = $self->{line};
5874          $self->{column_prev} = $self->{column};
5875          $self->{column}++;
5876          $self->{nc}
5877              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5878        } else {
5879          $self->{set_nc}->($self);
5880        }
5881      
5882            redo A;
5883          } else {
5884            ## XML5: No parse error.
5885            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5886                            line => $self->{line_prev},
5887                            column => $self->{column_prev} - 2
5888                                + 1 * ($self->{nc} == -1));
5889            ## Reconsume.
5890            $self->{state} = BOGUS_COMMENT_STATE;
5891            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5892            redo A;
5893          }
5894        } elsif ($self->{state} == MD_ENTITY_STATE) {
5895          if ($self->{nc} == [
5896                undef,
5897                undef,
5898                0x0054, # T
5899                0x0049, # I
5900                0x0054, # T
5901              ]->[length $self->{kwd}] or
5902              $self->{nc} == [
5903                undef,
5904                undef,
5905                0x0074, # t
5906                0x0069, # i
5907                0x0074, # t
5908              ]->[length $self->{kwd}]) {
5909            ## Stay in the state.
5910            $self->{kwd} .= chr $self->{nc};
5911            
5912        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5913          $self->{line_prev} = $self->{line};
5914          $self->{column_prev} = $self->{column};
5915          $self->{column}++;
5916          $self->{nc}
5917              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5918        } else {
5919          $self->{set_nc}->($self);
5920        }
5921      
5922            redo A;
5923          } elsif ((length $self->{kwd}) == 5 and
5924                   ($self->{nc} == 0x0059 or # Y
5925                    $self->{nc} == 0x0079)) { # y
5926            if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5927              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5928                              text => 'ENTITY',
5929                              line => $self->{line_prev},
5930                              column => $self->{column_prev} - 4);
5931            }
5932            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5933                           line => $self->{line_prev},
5934                           column => $self->{column_prev} - 6};
5935            $self->{state} = DOCTYPE_MD_STATE;
5936            
5937        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5938          $self->{line_prev} = $self->{line};
5939          $self->{column_prev} = $self->{column};
5940          $self->{column}++;
5941          $self->{nc}
5942              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5943        } else {
5944          $self->{set_nc}->($self);
5945        }
5946      
5947            redo A;
5948          } else {
5949            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5950                            line => $self->{line_prev},
5951                            column => $self->{column_prev} - 1
5952                                - (length $self->{kwd})
5953                                + 1 * ($self->{nc} == -1));
5954            $self->{state} = BOGUS_COMMENT_STATE;
5955            ## Reconsume.
5956            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5957            redo A;
5958          }
5959        } elsif ($self->{state} == MD_ELEMENT_STATE) {
5960          if ($self->{nc} == [
5961               undef,
5962               undef,
5963               0x0045, # E
5964               0x004D, # M
5965               0x0045, # E
5966               0x004E, # N
5967              ]->[length $self->{kwd}] or
5968              $self->{nc} == [
5969               undef,
5970               undef,
5971               0x0065, # e
5972               0x006D, # m
5973               0x0065, # e
5974               0x006E, # n
5975              ]->[length $self->{kwd}]) {
5976            ## Stay in the state.
5977            $self->{kwd} .= chr $self->{nc};
5978            
5979        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5980          $self->{line_prev} = $self->{line};
5981          $self->{column_prev} = $self->{column};
5982          $self->{column}++;
5983          $self->{nc}
5984              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5985        } else {
5986          $self->{set_nc}->($self);
5987        }
5988      
5989            redo A;
5990          } elsif ((length $self->{kwd}) == 6 and
5991                   ($self->{nc} == 0x0054 or # T
5992                    $self->{nc} == 0x0074)) { # t
5993            if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5994              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5995                              text => 'ELEMENT',
5996                              line => $self->{line_prev},
5997                              column => $self->{column_prev} - 5);
5998            }
5999            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6000                           line => $self->{line_prev},
6001                           column => $self->{column_prev} - 7};
6002            $self->{state} = DOCTYPE_MD_STATE;
6003            
6004        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6005          $self->{line_prev} = $self->{line};
6006          $self->{column_prev} = $self->{column};
6007          $self->{column}++;
6008          $self->{nc}
6009              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6010        } else {
6011          $self->{set_nc}->($self);
6012        }
6013      
6014            redo A;
6015          } else {
6016            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6017                            line => $self->{line_prev},
6018                            column => $self->{column_prev} - 1
6019                                - (length $self->{kwd})
6020                                + 1 * ($self->{nc} == -1));
6021            $self->{state} = BOGUS_COMMENT_STATE;
6022            ## Reconsume.
6023            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6024            redo A;
6025          }
6026        } elsif ($self->{state} == MD_ATTLIST_STATE) {
6027          if ($self->{nc} == [
6028               undef,
6029               0x0054, # T
6030               0x0054, # T
6031               0x004C, # L
6032               0x0049, # I
6033               0x0053, # S
6034              ]->[length $self->{kwd}] or
6035              $self->{nc} == [
6036               undef,
6037               0x0074, # t
6038               0x0074, # t
6039               0x006C, # l
6040               0x0069, # i
6041               0x0073, # s
6042              ]->[length $self->{kwd}]) {
6043            ## Stay in the state.
6044            $self->{kwd} .= chr $self->{nc};
6045            
6046        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6047          $self->{line_prev} = $self->{line};
6048          $self->{column_prev} = $self->{column};
6049          $self->{column}++;
6050          $self->{nc}
6051              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6052        } else {
6053          $self->{set_nc}->($self);
6054        }
6055      
6056            redo A;
6057          } elsif ((length $self->{kwd}) == 6 and
6058                   ($self->{nc} == 0x0054 or # T
6059                    $self->{nc} == 0x0074)) { # t
6060            if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6061              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6062                              text => 'ATTLIST',
6063                              line => $self->{line_prev},
6064                              column => $self->{column_prev} - 5);
6065            }
6066            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6067                           attrdefs => [],
6068                           line => $self->{line_prev},
6069                           column => $self->{column_prev} - 7};
6070            $self->{state} = DOCTYPE_MD_STATE;
6071            
6072        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6073          $self->{line_prev} = $self->{line};
6074          $self->{column_prev} = $self->{column};
6075          $self->{column}++;
6076          $self->{nc}
6077              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6078        } else {
6079          $self->{set_nc}->($self);
6080        }
6081      
6082            redo A;
6083          } else {
6084            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6085                            line => $self->{line_prev},
6086                            column => $self->{column_prev} - 1
6087                                 - (length $self->{kwd})
6088                                 + 1 * ($self->{nc} == -1));
6089            $self->{state} = BOGUS_COMMENT_STATE;
6090            ## Reconsume.
6091            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6092            redo A;
6093          }
6094        } elsif ($self->{state} == MD_NOTATION_STATE) {
              ## Matches the rest of the "NOTATION" keyword, ignoring case,
              ## one character per pass.  |$self->{kwd}| holds what has been
              ## matched so far; its length is used as the index of the
              ## character expected next in the two tables below (index 0 is
              ## unused; presumably |kwd| already holds the leading "N" on
              ## entry -- TODO confirm).
              ## NOTE(review): The tables have no entry for index 7, so once
              ## |kwd| is seven characters long this first test compares |nc|
              ## with |undef| (numerically 0) before the final "N" branch is
              ## tried.
6095          if ($self->{nc} == [
6096               undef,
6097               0x004F, # O
6098               0x0054, # T
6099               0x0041, # A
6100               0x0054, # T
6101               0x0049, # I
6102               0x004F, # O
6103              ]->[length $self->{kwd}] or
6104              $self->{nc} == [
6105               undef,
6106               0x006F, # o
6107               0x0074, # t
6108               0x0061, # a
6109               0x0074, # t
6110               0x0069, # i
6111               0x006F, # o
6112              ]->[length $self->{kwd}]) {
6113            ## Stay in the state.
6114            $self->{kwd} .= chr $self->{nc};
6115            
6116        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6117          $self->{line_prev} = $self->{line};
6118          $self->{column_prev} = $self->{column};
6119          $self->{column}++;
6120          $self->{nc}
6121              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6122        } else {
6123          $self->{set_nc}->($self);
6124        }
6125      
6126            redo A;
6127          } elsif ((length $self->{kwd}) == 7 and
6128                   ($self->{nc} == 0x004E or # N
6129                    $self->{nc} == 0x006E)) { # n
                ## Final "N" or "n": the keyword is complete.  Any spelling
                ## other than the all-uppercase "NOTATION" is reported as a
                ## parse error, then an empty-named NOTATION token is started
                ## and the tokenizer moves on to |DOCTYPE_MD_STATE|.
6130            if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6131              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6132                              text => 'NOTATION',
6133                              line => $self->{line_prev},
6134                              column => $self->{column_prev} - 6);
6135            }
6136            $self->{ct} = {type => NOTATION_TOKEN, name => '',
6137                           line => $self->{line_prev},
6138                           column => $self->{column_prev} - 8};
6139            $self->{state} = DOCTYPE_MD_STATE;
6140            
6141        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6142          $self->{line_prev} = $self->{line};
6143          $self->{column_prev} = $self->{column};
6144          $self->{column}++;
6145          $self->{nc}
6146              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6147        } else {
6148          $self->{set_nc}->($self);
6149        }
6150      
6151            redo A;
6152          } else {
                ## Not the keyword: report a 'bogus comment' error and handle
                ## the current character again in the bogus comment state.
                ## The reported column is moved one to the right when the
                ## current character is EOF (-1).
6153            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6154                            line => $self->{line_prev},
6155                            column => $self->{column_prev} - 1
6156                                - (length $self->{kwd})
6157                                + 1 * ($self->{nc} == -1));
6158            $self->{state} = BOGUS_COMMENT_STATE;
6159            ## Reconsume.
6160            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6161            redo A;
6162          }
6163        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6164          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6165          ## "DOCTYPE NOTATION state".
              ##
              ## Handles the character that directly follows a markup
              ## declaration keyword (the ATTLIST and NOTATION keyword states
              ## switch to this state once the keyword is complete).  White
              ## space is expected here; every other character is an error.
6166    
6167          if ($is_space->{$self->{nc}}) {
6168            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6169            $self->{state} = BEFORE_MD_NAME_STATE;
6170            
6171        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6172          $self->{line_prev} = $self->{line};
6173          $self->{column_prev} = $self->{column};
6174          $self->{column}++;
6175          $self->{nc}
6176              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6177        } else {
6178          $self->{set_nc}->($self);
6179        }
6180      
6181            redo A;
6182          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6183                   $self->{nc} == 0x0025) { # %
                ## "%" directly after the keyword of an entity declaration:
                ## the missing white space is reported, then the "%" is
                ## consumed as in the before-name state.
6184            ## XML5: Switch to the "DOCTYPE bogus comment state".
6185            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6186            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6187            
6188        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6189          $self->{line_prev} = $self->{line};
6190          $self->{column_prev} = $self->{column};
6191          $self->{column}++;
6192          $self->{nc}
6193              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6194        } else {
6195          $self->{set_nc}->($self);
6196        }
6197      
6198            redo A;
6199          } elsif ($self->{nc} == -1) {
                ## EOF: the error is reported and EOF is handled again in the
                ## internal subset state.  No token is returned from here.
6200            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6201            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6202            ## Reconsume.
6203            redo A;
6204          } elsif ($self->{nc} == 0x003E) { # >
                ## The declaration ends without a name.  No token is returned
                ## from here.
6205            ## XML5: Switch to the "DOCTYPE bogus comment state".
6206            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6207            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6208            
6209        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6210          $self->{line_prev} = $self->{line};
6211          $self->{column_prev} = $self->{column};
6212          $self->{column}++;
6213          $self->{nc}
6214              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6215        } else {
6216          $self->{set_nc}->($self);
6217        }
6218      
6219            redo A;
6220          } else {
                ## Any other character: the missing white space is reported
                ## and the same character is handled again (no new character
                ## is read) in the before-name state.
6221            ## XML5: Switch to the "DOCTYPE bogus comment state".
6222            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6223            $self->{state} = BEFORE_MD_NAME_STATE;
6224            redo A;
6225          }
6226        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6227          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6228          ## before state", "DOCTYPE ATTLIST name before state".
              ##
              ## Skips white space in front of the name of a markup
              ## declaration and starts collecting the name into
              ## |$self->{ct}->{name}| at the first other character.
6229    
6230          if ($is_space->{$self->{nc}}) {
6231            ## Stay in the state.
6232            
6233        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6234          $self->{line_prev} = $self->{line};
6235          $self->{column_prev} = $self->{column};
6236          $self->{column}++;
6237          $self->{nc}
6238              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6239        } else {
6240          $self->{set_nc}->($self);
6241        }
6242      
6243            redo A;
6244          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6245                   $self->{nc} == 0x0025) { # %
                ## "%" in an entity declaration: consume it and let the
                ## parameter-before state decide what follows.
6246            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6247            
6248        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6249          $self->{line_prev} = $self->{line};
6250          $self->{column_prev} = $self->{column};
6251          $self->{column}++;
6252          $self->{nc}
6253              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6254        } else {
6255          $self->{set_nc}->($self);
6256        }
6257      
6258            redo A;
6259          } elsif ($self->{nc} == 0x003E) { # >
                ## The declaration ends without a name.  No token is returned
                ## from here.
6260            ## XML5: Same as "Anything else".
6261            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6262            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6263            
6264        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6265          $self->{line_prev} = $self->{line};
6266          $self->{column_prev} = $self->{column};
6267          $self->{column}++;
6268          $self->{nc}
6269              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6270        } else {
6271          $self->{set_nc}->($self);
6272        }
6273      
6274            redo A;
6275          } elsif ($self->{nc} == -1) {
                ## EOF: the error is reported and EOF is handled again in the
                ## internal subset state.  No token is returned from here.
6276            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6277            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6278            ## Reconsume.
6279            redo A;
6280          } else {
                ## First character of the name.  No check of the character
                ## itself is made here.
6281            ## XML5: [ATTLIST] Not defined yet.
6282            $self->{ct}->{name} .= chr $self->{nc};
6283            $self->{state} = MD_NAME_STATE;
6284            
6285        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6286          $self->{line_prev} = $self->{line};
6287          $self->{column_prev} = $self->{column};
6288          $self->{column}++;
6289          $self->{nc}
6290              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6291        } else {
6292          $self->{set_nc}->($self);
6293        }
6294      
6295            redo A;
6296          }
6297        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
              ## Handles the character after the "%" of an entity
              ## declaration.  Only white space turns the current token into
              ## a parameter entity token.
6298          if ($is_space->{$self->{nc}}) {
6299            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6300            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6301            $self->{state} = BEFORE_MD_NAME_STATE;
6302            
6303        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6304          $self->{line_prev} = $self->{line};
6305          $self->{column_prev} = $self->{column};
6306          $self->{column}++;
6307          $self->{nc}
6308              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6309        } else {
6310          $self->{set_nc}->($self);
6311        }
6312      
6313            redo A;
6314          } elsif ($self->{nc} == 0x003E) { # >
                ## The declaration ends without a name.  No token is returned
                ## from here.
6315            ## XML5: Same as "Anything else".
6316            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6317            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6318            
6319        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6320          $self->{line_prev} = $self->{line};
6321          $self->{column_prev} = $self->{column};
6322          $self->{column}++;
6323          $self->{nc}
6324              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6325        } else {
6326          $self->{set_nc}->($self);
6327        }
6328      
6329            redo A;
6330          } elsif ($self->{nc} == -1) {
                ## EOF: the error is reported and EOF is handled again in the
                ## internal subset state.  No token is returned from here.
6331            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6332            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6333            ## Reconsume.
6334            redo A;
6335          } else {
                ## "%" followed directly by another character: the current
                ## token is replaced by a placeholder comment token and the
                ## character is handled again in the bogus comment state.
6336            ## XML5: No parse error.
6337            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6338            $self->{state} = BOGUS_COMMENT_STATE;
6339            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6340            ## Reconsume.
6341            redo A;
6342          }
6343        } elsif ($self->{state} == MD_NAME_STATE) {
6344          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
              ##
              ## Collects the name of a markup declaration into
              ## |$self->{ct}->{name}| until white space, ">" or EOF is seen.
6345          
6346          if ($is_space->{$self->{nc}}) {
                ## End of the name; the state that follows depends on the
                ## kind of declaration being read.
6347            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6348              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6349            } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6350              $self->{state} = AFTER_ELEMENT_NAME_STATE;
6351            } else { # ENTITY/NOTATION
6352              $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6353            }
6354            
6355        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6356          $self->{line_prev} = $self->{line};
6357          $self->{column_prev} = $self->{column};
6358          $self->{column}++;
6359          $self->{nc}
6360              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6361        } else {
6362          $self->{set_nc}->($self);
6363        }
6364      
6365            redo A;
6366          } elsif ($self->{nc} == 0x003E) { # >
                ## The declaration ends right after its name.  This is
                ## accepted silently for ATTLIST and reported as an error for
                ## every other kind; the token is returned in both cases.
6367            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6368              #
6369            } else {
6370              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6371            }
6372            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6373            
6374        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6375          $self->{line_prev} = $self->{line};
6376          $self->{column_prev} = $self->{column};
6377          $self->{column}++;
6378          $self->{nc}
6379              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6380        } else {
6381          $self->{set_nc}->($self);
6382        }
6383      
6384            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
                ## NOTE: The |redo| below is never reached because of the
                ## |return| above.
6385            redo A;
6386          } elsif ($self->{nc} == -1) {
                ## EOF: the token read so far is returned; EOF itself is
                ## left to be handled in the internal subset state.
6387            ## XML5: [ATTLIST] No parse error.
6388            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6389            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6390            ## Reconsume.
6391            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6392            redo A;
6393          } else {
                ## Any other character is appended to the name as is.
6394            ## XML5: [ATTLIST] Not defined yet.
6395            $self->{ct}->{name} .= chr $self->{nc};
6396            ## Stay in the state.
6397            
6398        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6399          $self->{line_prev} = $self->{line};
6400          $self->{column_prev} = $self->{column};
6401          $self->{column}++;
6402          $self->{nc}
6403              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6404        } else {
6405          $self->{set_nc}->($self);
6406        }
6407      
6408            redo A;
6409          }
6410        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
              ## After the name of an ATTLIST declaration: skips white space,
              ## ends the declaration at ">" or EOF, and otherwise starts a
              ## new attribute definition in |$self->{ca}|.
6411          if ($is_space->{$self->{nc}}) {
6412            ## Stay in the state.
6413            
6414        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6415          $self->{line_prev} = $self->{line};
6416          $self->{column_prev} = $self->{column};
6417          $self->{column}++;
6418          $self->{nc}
6419              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6420        } else {
6421          $self->{set_nc}->($self);
6422        }
6423      
6424            redo A;
6425          } elsif ($self->{nc} == 0x003E) { # >
                ## End of the declaration; the token is returned without any
                ## error being reported.
6426            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6427            
6428        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6429          $self->{line_prev} = $self->{line};
6430          $self->{column_prev} = $self->{column};
6431          $self->{column}++;
6432          $self->{nc}
6433              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6434        } else {
6435          $self->{set_nc}->($self);
6436        }
6437      
6438            return  ($self->{ct}); # ATTLIST
                ## NOTE: The |redo| below is never reached because of the
                ## |return| above.
6439            redo A;
6440          } elsif ($self->{nc} == -1) {
                ## EOF: the token is returned without reading any further
                ## character, so EOF stays the current character for the
                ## internal subset state.
6441            ## XML5: No parse error.
6442            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6443            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6444            return  ($self->{ct});
6445            redo A;
6446          } else {
                ## First character of an attribute name: a fresh definition
                ## record is created with that character as its name, an
                ## empty |tokens| list and the current line and column.
6447            ## XML5: Not defined yet.
6448            $self->{ca} = {name => chr ($self->{nc}), # attrdef
6449                           tokens => [],
6450                           line => $self->{line}, column => $self->{column}};
6451            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6452            
6453        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6454          $self->{line_prev} = $self->{line};
6455          $self->{column_prev} = $self->{column};
6456          $self->{column}++;
6457          $self->{nc}
6458              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6459        } else {
6460          $self->{set_nc}->($self);
6461        }
6462      
6463            redo A;
6464          }
6465        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
              ## Collects the name of an attribute definition into
              ## |$self->{ca}->{name}| until white space, ">", "(" or EOF is
              ## seen.
6466          if ($is_space->{$self->{nc}}) {
6467            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6468            
6469        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6470          $self->{line_prev} = $self->{line};
6471          $self->{column_prev} = $self->{column};
6472          $self->{column}++;
6473          $self->{nc}
6474              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6475        } else {
6476          $self->{set_nc}->($self);
6477        }
6478      
6479            redo A;
6480          } elsif ($self->{nc} == 0x003E) { # >
                ## The declaration ends before any attribute type was given.
                ## The ATTLIST token is returned; this branch does not add
                ## the definition under construction in |$self->{ca}| to it.
6481            ## XML5: Same as "anything else".
6482            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6483            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6484            
6485        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6486          $self->{line_prev} = $self->{line};
6487          $self->{column_prev} = $self->{column};
6488          $self->{column}++;
6489          $self->{nc}
6490              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6491        } else {
6492          $self->{set_nc}->($self);
6493        }
6494      
6495            return  ($self->{ct}); # ATTLIST
                ## NOTE: The |redo| below is never reached because of the
                ## |return| above.
6496            redo A;
6497          } elsif ($self->{nc} == 0x0028) { # (
                ## "(" directly after the name: the missing white space is
                ## reported and the "(" is consumed.
6498            ## XML5: Same as "anything else".
6499            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6500            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6501            
6502        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6503          $self->{line_prev} = $self->{line};
6504          $self->{column_prev} = $self->{column};
6505          $self->{column}++;
6506          $self->{nc}
6507              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6508        } else {
6509          $self->{set_nc}->($self);
6510        }
6511      
6512            redo A;
6513          } elsif ($self->{nc} == -1) {
                ## EOF: the error is reported and the ATTLIST token returned.
                ## NOTE(review): Unlike the EOF branch of |MD_NAME_STATE|,
                ## which reconsumes, this branch reads the next input
                ## character after EOF was already seen -- confirm that this
                ## is intended.
6514            ## XML5: No parse error.
6515            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6516            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6517            
6518        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6519          $self->{line_prev} = $self->{line};
6520          $self->{column_prev} = $self->{column};
6521          $self->{column}++;
6522          $self->{nc}
6523              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6524        } else {
6525          $self->{set_nc}->($self);
6526        }
6527      
6528            return  ($self->{ct}); # ATTLIST
6529            redo A;
6530          } else {
                ## Any other character is appended to the attribute name.
6531            ## XML5: Not defined yet.
6532            $self->{ca}->{name} .= chr $self->{nc};
6533            ## Stay in the state.
6534            
6535        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6536          $self->{line_prev} = $self->{line};
6537          $self->{column_prev} = $self->{column};
6538          $self->{column}++;
6539          $self->{nc}
6540              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6541        } else {
6542          $self->{set_nc}->($self);
6543        }
6544      
6545            redo A;
6546          }
6547        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
              ## After the name of an attribute definition: skips white
              ## space, then expects either "(" or the first character of
              ## the attribute type.
6548          if ($is_space->{$self->{nc}}) {
6549            ## Stay in the state.
6550            
6551        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6552          $self->{line_prev} = $self->{line};
6553          $self->{column_prev} = $self->{column};
6554          $self->{column}++;
6555          $self->{nc}
6556              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6557        } else {
6558          $self->{set_nc}->($self);
6559        }
6560      
6561            redo A;
6562          } elsif ($self->{nc} == 0x003E) { # >
                ## The declaration ends before any attribute type was given.
                ## The ATTLIST token is returned; this branch does not add
                ## the definition under construction in |$self->{ca}| to it.
6563            ## XML5: Same as "anything else".
6564            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6565            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6566            
6567        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6568          $self->{line_prev} = $self->{line};
6569          $self->{column_prev} = $self->{column};
6570          $self->{column}++;
6571          $self->{nc}
6572              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6573        } else {
6574          $self->{set_nc}->($self);
6575        }
6576      
6577            return  ($self->{ct}); # ATTLIST
                ## NOTE: The |redo| below is never reached because of the
                ## |return| above.
6578            redo A;
6579          } elsif ($self->{nc} == 0x0028) { # (
                ## "(" after white space: consumed without any error.
6580            ## XML5: Same as "anything else".
6581            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6582            
6583        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6584          $self->{line_prev} = $self->{line};
6585          $self->{column_prev} = $self->{column};
6586          $self->{column}++;
6587          $self->{nc}
6588              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6589        } else {
6590          $self->{set_nc}->($self);
6591        }
6592      
6593            redo A;
6594          } elsif ($self->{nc} == -1) {
                ## EOF: the error is reported and the ATTLIST token returned.
                ## NOTE(review): Unlike the EOF branch of |MD_NAME_STATE|,
                ## which reconsumes, this branch reads the next input
                ## character after EOF was already seen -- confirm that this
                ## is intended.
6595            ## XML5: No parse error.
6596            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6597            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6598            
6599        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6600          $self->{line_prev} = $self->{line};
6601          $self->{column_prev} = $self->{column};
6602          $self->{column}++;
6603          $self->{nc}
6604              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6605        } else {
6606          $self->{set_nc}->($self);
6607        }
6608      
6609            return  ($self->{ct});
6610            redo A;
6611          } else {
                ## First character of the attribute type; it replaces any
                ## earlier value of |$self->{ca}->{type}|.
6612            ## XML5: Not defined yet.
6613            $self->{ca}->{type} = chr $self->{nc};
6614            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6615            
6616        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6617          $self->{line_prev} = $self->{line};
6618          $self->{column_prev} = $self->{column};
6619          $self->{column}++;
6620          $self->{nc}
6621              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6622        } else {
6623          $self->{set_nc}->($self);
6624        }
6625      
6626            redo A;
6627          }
6628        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
              ## Collects the type keyword of an attribute definition into
              ## |$self->{ca}->{type}|.  White space ends the keyword
              ## normally; "#", a quotation mark, ">" or "(" directly after
              ## the keyword is handled too, but reported as a parse error.
6629          if ($is_space->{$self->{nc}}) {
6630            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6631            
6632        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6633          $self->{line_prev} = $self->{line};
6634          $self->{column_prev} = $self->{column};
6635          $self->{column}++;
6636          $self->{nc}
6637              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6638        } else {
6639          $self->{set_nc}->($self);
6640        }
6641      
6642            redo A;
6643          } elsif ($self->{nc} == 0x0023) { # #
                ## "#" directly after the type: the missing white space is
                ## reported and the "#" is consumed.
6644            ## XML5: Same as "anything else".
6645            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6646            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6647            
6648        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6649          $self->{line_prev} = $self->{line};
6650          $self->{column_prev} = $self->{column};
6651          $self->{column}++;
6652          $self->{nc}
6653              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6654        } else {
6655          $self->{set_nc}->($self);
6656        }
6657      
6658            redo A;
6659          } elsif ($self->{nc} == 0x0022) { # "
                ## Quotation mark directly after the type: the missing white
                ## space is reported, the value is initialized to the empty
                ## string and the double-quoted attribute value state is
                ## entered.
6660            ## XML5: Same as "anything else".
6661            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6662            $self->{ca}->{value} = '';
6663            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6664            
6665        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6666          $self->{line_prev} = $self->{line};
6667          $self->{column_prev} = $self->{column};
6668          $self->{column}++;
6669          $self->{nc}
6670              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6671        } else {
6672          $self->{set_nc}->($self);
6673        }
6674      
6675            redo A;
6676          } elsif ($self->{nc} == 0x0027) { # '
                ## Same as for the double quotation mark, but the
                ## single-quoted attribute value state is entered.
6677            ## XML5: Same as "anything else".
6678            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6679            $self->{ca}->{value} = '';
6680            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6681            
6682        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6683          $self->{line_prev} = $self->{line};
6684          $self->{column_prev} = $self->{column};
6685          $self->{column}++;
6686          $self->{nc}
6687              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6688        } else {
6689          $self->{set_nc}->($self);
6690        }
6691      
6692            redo A;
6693          } elsif ($self->{nc} == 0x003E) { # >
                ## The declaration ends before any default was given.  The
                ## ATTLIST token is returned; this branch does not add the
                ## definition under construction in |$self->{ca}| to it.
6694            ## XML5: Same as "anything else".
6695            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6696            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6697            
6698        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6699          $self->{line_prev} = $self->{line};
6700          $self->{column_prev} = $self->{column};
6701          $self->{column}++;
6702          $self->{nc}
6703              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6704        } else {
6705          $self->{set_nc}->($self);
6706        }
6707      
6708            return  ($self->{ct}); # ATTLIST
                ## NOTE: The |redo| below is never reached because of the
                ## |return| above.
6709            redo A;
6710          } elsif ($self->{nc} == 0x0028) { # (
                ## "(" directly after the type: the missing white space is
                ## reported and the "(" is consumed.
6711            ## XML5: Same as "anything else".
6712            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6713            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6714            
6715        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6716          $self->{line_prev} = $self->{line};
6717          $self->{column_prev} = $self->{column};
6718          $self->{column}++;
6719          $self->{nc}
6720              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6721        } else {
6722          $self->{set_nc}->($self);
6723        }
6724      
6725            redo A;
6726          } elsif ($self->{nc} == -1) {
                ## EOF: the error is reported and the ATTLIST token returned.
                ## NOTE(review): Unlike the EOF branch of |MD_NAME_STATE|,
                ## which reconsumes, this branch reads the next input
                ## character after EOF was already seen -- confirm that this
                ## is intended.
6727            ## XML5: No parse error.
6728            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6729            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6730            
6731        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6732          $self->{line_prev} = $self->{line};
6733          $self->{column_prev} = $self->{column};
6734          $self->{column}++;
6735          $self->{nc}
6736              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6737        } else {
6738          $self->{set_nc}->($self);
6739        }
6740      
6741            return  ($self->{ct});
6742            redo A;
6743          } else {
                ## Any other character is appended to the type keyword.
6744            ## XML5: Not defined yet.
6745            $self->{ca}->{type} .= chr $self->{nc};
6746            ## Stay in the state.
6747            
6748        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6749          $self->{line_prev} = $self->{line};
6750          $self->{column_prev} = $self->{column};
6751          $self->{column}++;
6752          $self->{nc}
6753              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6754        } else {
6755          $self->{set_nc}->($self);
6756        }
6757      
6758            redo A;
6759          }
6760        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6761          if ($is_space->{$self->{nc}}) {
6762            ## Stay in the state.
6763            
6764        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6765          $self->{line_prev} = $self->{line};
6766          $self->{column_prev} = $self->{column};
6767          $self->{column}++;
6768          $self->{nc}
6769              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6770        } else {
6771          $self->{set_nc}->($self);
6772        }
6773      
6774            redo A;
6775          } elsif ($self->{nc} == 0x0028) { # (
6776            ## XML5: Same as "anything else".
6777            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6778            
6779        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6780          $self->{line_prev} = $self->{line};
6781          $self->{column_prev} = $self->{column};
6782          $self->{column}++;
6783          $self->{nc}
6784              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6785        } else {
6786          $self->{set_nc}->($self);
6787        }
6788      
6789            redo A;
6790          } elsif ($self->{nc} == 0x0023) { # #
6791            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6792            
6793        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6794          $self->{line_prev} = $self->{line};
6795          $self->{column_prev} = $self->{column};
6796          $self->{column}++;
6797          $self->{nc}
6798              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6799        } else {
6800          $self->{set_nc}->($self);
6801        }
6802      
6803            redo A;
6804          } elsif ($self->{nc} == 0x0022) { # "
6805            ## XML5: Same as "anything else".
6806            $self->{ca}->{value} = '';
6807            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6808            
6809        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6810          $self->{line_prev} = $self->{line};
6811          $self->{column_prev} = $self->{column};
6812          $self->{column}++;
6813          $self->{nc}
6814              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6815        } else {
6816          $self->{set_nc}->($self);
6817        }
6818      
6819            redo A;
6820          } elsif ($self->{nc} == 0x0027) { # '
6821            ## XML5: Same as "anything else".
6822            $self->{ca}->{value} = '';
6823            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6824            
6825        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6826          $self->{line_prev} = $self->{line};
6827          $self->{column_prev} = $self->{column};
6828          $self->{column}++;
6829          $self->{nc}
6830              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6831        } else {
6832          $self->{set_nc}->($self);
6833        }
6834      
6835            redo A;
6836          } elsif ($self->{nc} == 0x003E) { # >
6837            ## XML5: Same as "anything else".
6838            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6839            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6840            
6841        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6842          $self->{line_prev} = $self->{line};
6843          $self->{column_prev} = $self->{column};
6844          $self->{column}++;
6845          $self->{nc}
6846              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6847        } else {
6848          $self->{set_nc}->($self);
6849        }
6850      
6851            return  ($self->{ct}); # ATTLIST
6852            redo A;
6853          } elsif ($self->{nc} == -1) {
6854            ## XML5: No parse error.
6855            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6856            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6857            
6858        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6859          $self->{line_prev} = $self->{line};
6860          $self->{column_prev} = $self->{column};
6861          $self->{column}++;
6862          $self->{nc}
6863              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6864        } else {
6865          $self->{set_nc}->($self);
6866        }
6867      
6868            return  ($self->{ct});
6869            redo A;
6870          } else {
6871            ## XML5: Switch to the "DOCTYPE bogus comment state".
6872            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6873            $self->{ca}->{value} = '';
6874            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6875            ## Reconsume.
6876            redo A;
6877          }
6878        } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6879          if ($is_space->{$self->{nc}}) {
6880            ## Stay in the state.
6881            
6882        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6883          $self->{line_prev} = $self->{line};
6884          $self->{column_prev} = $self->{column};
6885          $self->{column}++;
6886          $self->{nc}
6887              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6888        } else {
6889          $self->{set_nc}->($self);
6890        }
6891      
6892            redo A;
6893          } elsif ($self->{nc} == 0x007C) { # |
6894            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6895            ## Stay in the state.
6896            
6897        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6898          $self->{line_prev} = $self->{line};
6899          $self->{column_prev} = $self->{column};
6900          $self->{column}++;
6901          $self->{nc}
6902              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6903        } else {
6904          $self->{set_nc}->($self);
6905        }
6906      
6907            redo A;
6908          } elsif ($self->{nc} == 0x0029) { # )
6909            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6910            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6911            
6912        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6913          $self->{line_prev} = $self->{line};
6914          $self->{column_prev} = $self->{column};
6915          $self->{column}++;
6916          $self->{nc}
6917              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6918        } else {
6919          $self->{set_nc}->($self);
6920        }
6921      
6922            redo A;
6923          } elsif ($self->{nc} == 0x003E) { # >
6924            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6925            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6926            
6927        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6928          $self->{line_prev} = $self->{line};
6929          $self->{column_prev} = $self->{column};
6930          $self->{column}++;
6931          $self->{nc}
6932              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6933        } else {
6934          $self->{set_nc}->($self);
6935        }
6936      
6937            return  ($self->{ct}); # ATTLIST
6938            redo A;
6939          } elsif ($self->{nc} == -1) {
6940            ## XML5: No parse error.
6941            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6942            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6943            
6944        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6945          $self->{line_prev} = $self->{line};
6946          $self->{column_prev} = $self->{column};
6947          $self->{column}++;
6948          $self->{nc}
6949              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6950        } else {
6951          $self->{set_nc}->($self);
6952        }
6953      
6954            return  ($self->{ct});
6955            redo A;
6956          } else {
6957            push @{$self->{ca}->{tokens}}, chr $self->{nc};
6958            $self->{state} = ALLOWED_TOKEN_STATE;
6959            
6960        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6961          $self->{line_prev} = $self->{line};
6962          $self->{column_prev} = $self->{column};
6963          $self->{column}++;
6964          $self->{nc}
6965              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6966        } else {
6967          $self->{set_nc}->($self);
6968        }
6969      
6970            redo A;
6971          }
6972        } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6973          if ($is_space->{$self->{nc}}) {
6974            $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6975            
6976        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6977          $self->{line_prev} = $self->{line};
6978          $self->{column_prev} = $self->{column};
6979          $self->{column}++;
6980          $self->{nc}
6981              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6982        } else {
6983          $self->{set_nc}->($self);
6984        }
6985      
6986            redo A;
6987          } elsif ($self->{nc} == 0x007C) { # |
6988            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6989            
6990        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6991          $self->{line_prev} = $self->{line};
6992          $self->{column_prev} = $self->{column};
6993          $self->{column}++;
6994          $self->{nc}
6995              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6996        } else {
6997          $self->{set_nc}->($self);
6998        }
6999      
7000            redo A;
7001          } elsif ($self->{nc} == 0x0029) { # )
7002            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7003            
7004        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7005          $self->{line_prev} = $self->{line};
7006          $self->{column_prev} = $self->{column};
7007          $self->{column}++;
7008          $self->{nc}
7009              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7010        } else {
7011          $self->{set_nc}->($self);
7012        }
7013      
7014            redo A;
7015          } elsif ($self->{nc} == 0x003E) { # >
7016            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7017            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7018            
7019        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7020          $self->{line_prev} = $self->{line};
7021          $self->{column_prev} = $self->{column};
7022          $self->{column}++;
7023          $self->{nc}
7024              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7025        } else {
7026          $self->{set_nc}->($self);
7027        }
7028      
7029            return  ($self->{ct}); # ATTLIST
7030            redo A;
7031          } elsif ($self->{nc} == -1) {
7032            ## XML5: No parse error.
7033            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7034            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7035            
7036        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7037          $self->{line_prev} = $self->{line};
7038          $self->{column_prev} = $self->{column};
7039          $self->{column}++;
7040          $self->{nc}
7041              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7042        } else {
7043          $self->{set_nc}->($self);
7044        }
7045      
7046            return  ($self->{ct});
7047            redo A;
7048          } else {
7049            $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7050            ## Stay in the state.
7051            
7052        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7053          $self->{line_prev} = $self->{line};
7054          $self->{column_prev} = $self->{column};
7055          $self->{column}++;
7056          $self->{nc}
7057              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7058        } else {
7059          $self->{set_nc}->($self);
7060        }
7061      
7062            redo A;
7063          }
7064        } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7065          if ($is_space->{$self->{nc}}) {
7066            ## Stay in the state.
7067            
7068        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7069          $self->{line_prev} = $self->{line};
7070          $self->{column_prev} = $self->{column};
7071          $self->{column}++;
7072          $self->{nc}
7073              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7074        } else {
7075          $self->{set_nc}->($self);
7076        }
7077      
7078            redo A;
7079          } elsif ($self->{nc} == 0x007C) { # |
7080            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7081            
7082        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7083          $self->{line_prev} = $self->{line};
7084          $self->{column_prev} = $self->{column};
7085          $self->{column}++;
7086          $self->{nc}
7087              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7088        } else {
7089          $self->{set_nc}->($self);
7090        }
7091      
7092            redo A;
7093          } elsif ($self->{nc} == 0x0029) { # )
7094            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7095            
7096        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7097          $self->{line_prev} = $self->{line};
7098          $self->{column_prev} = $self->{column};
7099          $self->{column}++;
7100          $self->{nc}
7101              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7102        } else {
7103          $self->{set_nc}->($self);
7104        }
7105      
7106            redo A;
7107          } elsif ($self->{nc} == 0x003E) { # >
7108            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7109            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7110            
7111        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7112          $self->{line_prev} = $self->{line};
7113          $self->{column_prev} = $self->{column};
7114          $self->{column}++;
7115          $self->{nc}
7116              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7117        } else {
7118          $self->{set_nc}->($self);
7119        }
7120      
7121            return  ($self->{ct}); # ATTLIST
7122            redo A;
7123          } elsif ($self->{nc} == -1) {
7124            ## XML5: No parse error.
7125            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7126            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7127            
7128        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7129          $self->{line_prev} = $self->{line};
7130          $self->{column_prev} = $self->{column};
7131          $self->{column}++;
7132          $self->{nc}
7133              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7134        } else {
7135          $self->{set_nc}->($self);
7136        }
7137      
7138            return  ($self->{ct});
7139            redo A;
7140          } else {
7141            $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7142                            line => $self->{line_prev},
7143                            column => $self->{column_prev});
7144            $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7145            $self->{state} = ALLOWED_TOKEN_STATE;
7146            
7147        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7148          $self->{line_prev} = $self->{line};
7149          $self->{column_prev} = $self->{column};
7150          $self->{column}++;
7151          $self->{nc}
7152              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7153        } else {
7154          $self->{set_nc}->($self);
7155        }
7156      
7157            redo A;
7158          }
7159        } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7160          if ($is_space->{$self->{nc}}) {
7161            $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7162            
7163        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7164          $self->{line_prev} = $self->{line};
7165          $self->{column_prev} = $self->{column};
7166          $self->{column}++;
7167          $self->{nc}
7168              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7169        } else {
7170          $self->{set_nc}->($self);
7171        }
7172      
7173            redo A;
7174          } elsif ($self->{nc} == 0x0023) { # #
7175            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7176            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7177            
7178        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7179          $self->{line_prev} = $self->{line};
7180          $self->{column_prev} = $self->{column};
7181          $self->{column}++;
7182          $self->{nc}
7183              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7184        } else {
7185          $self->{set_nc}->($self);
7186        }
7187      
7188            redo A;
7189          } elsif ($self->{nc} == 0x0022) { # "
7190            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7191            $self->{ca}->{value} = '';
7192            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7193            
7194        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7195          $self->{line_prev} = $self->{line};
7196          $self->{column_prev} = $self->{column};
7197          $self->{column}++;
7198          $self->{nc}
7199              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7200        } else {
7201          $self->{set_nc}->($self);
7202        }
7203      
7204            redo A;
7205          } elsif ($self->{nc} == 0x0027) { # '
7206            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7207            $self->{ca}->{value} = '';
7208            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7209            
7210        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7211          $self->{line_prev} = $self->{line};
7212          $self->{column_prev} = $self->{column};
7213          $self->{column}++;
7214          $self->{nc}
7215              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7216        } else {
7217          $self->{set_nc}->($self);
7218        }
7219      
7220            redo A;
7221          } elsif ($self->{nc} == 0x003E) { # >
7222            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7223            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7224            
7225        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7226          $self->{line_prev} = $self->{line};
7227          $self->{column_prev} = $self->{column};
7228          $self->{column}++;
7229          $self->{nc}
7230              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7231        } else {
7232          $self->{set_nc}->($self);
7233        }
7234      
7235            return  ($self->{ct}); # ATTLIST
7236            redo A;
7237          } elsif ($self->{nc} == -1) {
7238            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7239            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7240            
7241        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7242          $self->{line_prev} = $self->{line};
7243          $self->{column_prev} = $self->{column};
7244          $self->{column}++;
7245          $self->{nc}
7246              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7247        } else {
7248          $self->{set_nc}->($self);
7249        }
7250      
7251            return  ($self->{ct});
7252            redo A;
7253          } else {
7254            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7255            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7256            ## Reconsume.
7257            redo A;
7258          }
7259        } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7260          if ($is_space->{$self->{nc}}) {
7261            ## Stay in the state.
7262            
7263        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7264          $self->{line_prev} = $self->{line};
7265          $self->{column_prev} = $self->{column};
7266          $self->{column}++;
7267          $self->{nc}
7268              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7269        } else {
7270          $self->{set_nc}->($self);
7271        }
7272      
7273            redo A;
7274          } elsif ($self->{nc} == 0x0023) { # #
7275            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7276            
7277        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7278          $self->{line_prev} = $self->{line};
7279          $self->{column_prev} = $self->{column};
7280          $self->{column}++;
7281          $self->{nc}
7282              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7283        } else {
7284          $self->{set_nc}->($self);
7285        }
7286      
7287            redo A;
7288          } elsif ($self->{nc} == 0x0022) { # "
7289            $self->{ca}->{value} = '';
7290            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7291            
7292        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7293          $self->{line_prev} = $self->{line};
7294          $self->{column_prev} = $self->{column};
7295          $self->{column}++;
7296          $self->{nc}
7297              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7298        } else {
7299          $self->{set_nc}->($self);
7300        }
7301      
7302            redo A;
7303          } elsif ($self->{nc} == 0x0027) { # '
7304            $self->{ca}->{value} = '';
7305            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7306            
7307        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7308          $self->{line_prev} = $self->{line};
7309          $self->{column_prev} = $self->{column};
7310          $self->{column}++;
7311          $self->{nc}
7312              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7313        } else {
7314          $self->{set_nc}->($self);
7315        }
7316      
7317            redo A;
7318          } elsif ($self->{nc} == 0x003E) { # >
7319            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7320            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7321            
7322        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7323          $self->{line_prev} = $self->{line};
7324          $self->{column_prev} = $self->{column};
7325          $self->{column}++;
7326          $self->{nc}
7327              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7328        } else {
7329          $self->{set_nc}->($self);
7330        }
7331      
7332            return  ($self->{ct}); # ATTLIST
7333            redo A;
7334          } elsif ($self->{nc} == -1) {
7335            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7336            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7337            
7338        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7339          $self->{line_prev} = $self->{line};
7340          $self->{column_prev} = $self->{column};
7341          $self->{column}++;
7342          $self->{nc}
7343              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7344        } else {
7345          $self->{set_nc}->($self);
7346        }
7347      
7348            return  ($self->{ct});
7349            redo A;
7350          } else {
7351            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7352            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7353            ## Reconsume.
7354            redo A;
7355          }
7356        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7357          if ($is_space->{$self->{nc}}) {
7358            ## XML5: No parse error.
7359            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7360            $self->{state} = BOGUS_MD_STATE;
7361            ## Reconsume.
7362            redo A;
7363          } elsif ($self->{nc} == 0x0022) { # "
7364            ## XML5: Same as "anything else".
7365            $self->{ca}->{value} = '';
7366            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7367            
7368        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7369          $self->{line_prev} = $self->{line};
7370          $self->{column_prev} = $self->{column};
7371          $self->{column}++;
7372          $self->{nc}
7373              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7374        } else {
7375          $self->{set_nc}->($self);
7376        }
7377      
7378            redo A;
7379          } elsif ($self->{nc} == 0x0027) { # '
7380            ## XML5: Same as "anything else".
7381            $self->{ca}->{value} = '';
7382            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7383            
7384        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7385          $self->{line_prev} = $self->{line};
7386          $self->{column_prev} = $self->{column};
7387          $self->{column}++;
7388          $self->{nc}
7389              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7390        } else {
7391          $self->{set_nc}->($self);
7392        }
7393      
7394            redo A;
7395          } elsif ($self->{nc} == 0x003E) { # >
7396            ## XML5: Same as "anything else".
7397            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7398            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7399            
7400        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7401          $self->{line_prev} = $self->{line};
7402          $self->{column_prev} = $self->{column};
7403          $self->{column}++;
7404          $self->{nc}
7405              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7406        } else {
7407          $self->{set_nc}->($self);
7408        }
7409      
7410            return  ($self->{ct}); # ATTLIST
7411            redo A;
7412          } elsif ($self->{nc} == -1) {
7413            ## XML5: No parse error.
7414            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7415            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7416            
7417        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7418          $self->{line_prev} = $self->{line};
7419          $self->{column_prev} = $self->{column};
7420          $self->{column}++;
7421          $self->{nc}
7422              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7423        } else {
7424          $self->{set_nc}->($self);
7425        }
7426      
7427            return  ($self->{ct});
7428            redo A;
7429          } else {
7430            $self->{ca}->{default} = chr $self->{nc};
7431            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7432            
7433        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7434          $self->{line_prev} = $self->{line};
7435          $self->{column_prev} = $self->{column};
7436          $self->{column}++;
7437          $self->{nc}
7438              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7439        } else {
7440          $self->{set_nc}->($self);
7441        }
7442      
7443            redo A;
7444          }
7445        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7446          if ($is_space->{$self->{nc}}) {
7447            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7448            
7449        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7450          $self->{line_prev} = $self->{line};
7451          $self->{column_prev} = $self->{column};
7452          $self->{column}++;
7453          $self->{nc}
7454              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7455        } else {
7456          $self->{set_nc}->($self);
7457        }
7458      
7459            redo A;
7460          } elsif ($self->{nc} == 0x0022) { # "
7461            ## XML5: Same as "anything else".
7462            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7463            $self->{ca}->{value} = '';
7464            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7465            
7466        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7467          $self->{line_prev} = $self->{line};
7468          $self->{column_prev} = $self->{column};
7469          $self->{column}++;
7470          $self->{nc}
7471              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7472        } else {
7473          $self->{set_nc}->($self);
7474        }
7475      
7476            redo A;
7477          } elsif ($self->{nc} == 0x0027) { # '
7478            ## XML5: Same as "anything else".
7479            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7480            $self->{ca}->{value} = '';
7481            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7482            
7483        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7484          $self->{line_prev} = $self->{line};
7485          $self->{column_prev} = $self->{column};
7486          $self->{column}++;
7487          $self->{nc}
7488              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7489        } else {
7490          $self->{set_nc}->($self);
7491        }
7492      
7493            redo A;
7494          } elsif ($self->{nc} == 0x003E) { # >
7495            ## XML5: Same as "anything else".
7496            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7497            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7498            
7499        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7500          $self->{line_prev} = $self->{line};
7501          $self->{column_prev} = $self->{column};
7502          $self->{column}++;
7503          $self->{nc}
7504              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7505        } else {
7506          $self->{set_nc}->($self);
7507        }
7508      
7509            return  ($self->{ct}); # ATTLIST
7510            redo A;
7511          } elsif ($self->{nc} == -1) {
7512            ## XML5: No parse error.
7513            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7514            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7515            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7516            
7517        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7518          $self->{line_prev} = $self->{line};
7519          $self->{column_prev} = $self->{column};
7520          $self->{column}++;
7521          $self->{nc}
7522              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7523        } else {
7524          $self->{set_nc}->($self);
7525        }
7526      
7527            return  ($self->{ct});
7528            redo A;
7529          } else {
7530            $self->{ca}->{default} .= chr $self->{nc};
7531            ## Stay in the state.
7532            
7533        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7534          $self->{line_prev} = $self->{line};
7535          $self->{column_prev} = $self->{column};
7536          $self->{column}++;
7537          $self->{nc}
7538              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7539        } else {
7540          $self->{set_nc}->($self);
7541        }
7542      
7543            redo A;
7544          }
7545        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7546          if ($is_space->{$self->{nc}}) {
7547            ## Stay in the state.
7548            
7549        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7550          $self->{line_prev} = $self->{line};
7551          $self->{column_prev} = $self->{column};
7552          $self->{column}++;
7553          $self->{nc}
7554              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7555        } else {
7556          $self->{set_nc}->($self);
7557        }
7558      
7559            redo A;
7560          } elsif ($self->{nc} == 0x0022) { # "
7561            $self->{ca}->{value} = '';
7562            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7563            
7564        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7565          $self->{line_prev} = $self->{line};
7566          $self->{column_prev} = $self->{column};
7567          $self->{column}++;
7568          $self->{nc}
7569              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7570        } else {
7571          $self->{set_nc}->($self);
7572        }
7573      
7574            redo A;
7575          } elsif ($self->{nc} == 0x0027) { # '
7576            $self->{ca}->{value} = '';
7577            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7578            
7579        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7580          $self->{line_prev} = $self->{line};
7581          $self->{column_prev} = $self->{column};
7582          $self->{column}++;
7583          $self->{nc}
7584              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7585        } else {
7586          $self->{set_nc}->($self);
7587        }
7588      
7589            redo A;
7590          } elsif ($self->{nc} == 0x003E) { # >
7591            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7592            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7593            
7594        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7595          $self->{line_prev} = $self->{line};
7596          $self->{column_prev} = $self->{column};
7597          $self->{column}++;
7598          $self->{nc}
7599              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7600        } else {
7601          $self->{set_nc}->($self);
7602        }
7603      
7604            return  ($self->{ct}); # ATTLIST
7605            redo A;
7606          } elsif ($self->{nc} == -1) {
7607            ## XML5: No parse error.
7608            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7609            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7610            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7611            
7612        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7613          $self->{line_prev} = $self->{line};
7614          $self->{column_prev} = $self->{column};
7615          $self->{column}++;
7616          $self->{nc}
7617              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7618        } else {
7619          $self->{set_nc}->($self);
7620        }
7621      
7622            return  ($self->{ct});
7623            redo A;
7624          } else {
7625            ## XML5: Not defined yet.
7626            if ($self->{ca}->{default} eq 'FIXED') {
7627              $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7628            } else {
7629              push @{$self->{ct}->{attrdefs}}, $self->{ca};
7630              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7631            }
7632            ## Reconsume.
7633            redo A;
7634          }
7635        } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7636          if ($is_space->{$self->{nc}} or
7637              $self->{nc} == -1 or
7638              $self->{nc} == 0x003E) { # >
7639            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7640            ## Reconsume.
7641            redo A;
7642          } else {
7643            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7644            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7645            ## Reconsume.
7646            redo A;
7647          }
7648        } elsif ($self->{state} == NDATA_STATE) {
7649          ## ASCII case-insensitive
7650          if ($self->{nc} == [
7651                undef,
7652                0x0044, # D
7653                0x0041, # A
7654                0x0054, # T
7655              ]->[length $self->{kwd}] or
7656              $self->{nc} == [
7657                undef,
7658                0x0064, # d
7659                0x0061, # a
7660                0x0074, # t
7661              ]->[length $self->{kwd}]) {
7662            
7663            ## Stay in the state.
7664            $self->{kwd} .= chr $self->{nc};
7665            
7666        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7667          $self->{line_prev} = $self->{line};
7668          $self->{column_prev} = $self->{column};
7669          $self->{column}++;
7670          $self->{nc}
7671              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7672        } else {
7673          $self->{set_nc}->($self);
7674        }
7675      
7676            redo A;
7677          } elsif ((length $self->{kwd}) == 4 and
7678                   ($self->{nc} == 0x0041 or # A
7679                    $self->{nc} == 0x0061)) { # a
7680            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7681              
7682              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7683                              text => 'NDATA',
7684                              line => $self->{line_prev},
7685                              column => $self->{column_prev} - 4);
7686            } else {
7687              
7688            }
7689            $self->{state} = AFTER_NDATA_STATE;
7690            
7691        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7692          $self->{line_prev} = $self->{line};
7693          $self->{column_prev} = $self->{column};
7694          $self->{column}++;
7695          $self->{nc}
7696              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7697        } else {
7698          $self->{set_nc}->($self);
7699        }
7700      
7701            redo A;
7702          } else {
7703            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7704                            line => $self->{line_prev},
7705                            column => $self->{column_prev} + 1
7706                                - length $self->{kwd});
7707            
7708            $self->{state} = BOGUS_MD_STATE;
7709            ## Reconsume.
7710            redo A;
7711          }
7712        } elsif ($self->{state} == AFTER_NDATA_STATE) {
7713          if ($is_space->{$self->{nc}}) {
7714            $self->{state} = BEFORE_NOTATION_NAME_STATE;
7715            
7716        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7717          $self->{line_prev} = $self->{line};
7718          $self->{column_prev} = $self->{column};
7719          $self->{column}++;
7720          $self->{nc}
7721              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7722        } else {
7723          $self->{set_nc}->($self);
7724        }
7725      
7726            redo A;
7727          } elsif ($self->{nc} == 0x003E) { # >
7728            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7729            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7730            
7731        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7732          $self->{line_prev} = $self->{line};
7733          $self->{column_prev} = $self->{column};
7734          $self->{column}++;
7735          $self->{nc}
7736              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7737        } else {
7738          $self->{set_nc}->($self);
7739        }
7740      
7741            return  ($self->{ct}); # ENTITY
7742            redo A;
7743          } elsif ($self->{nc} == -1) {
7744            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7745            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7746            
7747        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7748          $self->{line_prev} = $self->{line};
7749          $self->{column_prev} = $self->{column};
7750          $self->{column}++;
7751          $self->{nc}
7752              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7753        } else {
7754          $self->{set_nc}->($self);
7755        }
7756      
7757            return  ($self->{ct}); # ENTITY
7758            redo A;
7759          } else {
7760            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7761                            line => $self->{line_prev},
7762                            column => $self->{column_prev} + 1
7763                                - length $self->{kwd});
7764            $self->{state} = BOGUS_MD_STATE;
7765            ## Reconsume.
7766            redo A;
7767          }
7768        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7769          if ($is_space->{$self->{nc}}) {
7770            ## Stay in the state.
7771            
7772        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7773          $self->{line_prev} = $self->{line};
7774          $self->{column_prev} = $self->{column};
7775          $self->{column}++;
7776          $self->{nc}
7777              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7778        } else {
7779          $self->{set_nc}->($self);
7780        }
7781      
7782            redo A;
7783          } elsif ($self->{nc} == 0x003E) { # >
7784            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7785            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7786            
7787        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7788          $self->{line_prev} = $self->{line};
7789          $self->{column_prev} = $self->{column};
7790          $self->{column}++;
7791          $self->{nc}
7792              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7793        } else {
7794          $self->{set_nc}->($self);
7795        }
7796      
7797            return  ($self->{ct}); # ENTITY
7798            redo A;
7799          } elsif ($self->{nc} == -1) {
7800            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7801            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7802            
7803        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7804          $self->{line_prev} = $self->{line};
7805          $self->{column_prev} = $self->{column};
7806          $self->{column}++;
7807          $self->{nc}
7808              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7809        } else {
7810          $self->{set_nc}->($self);
7811        }
7812      
7813            return  ($self->{ct}); # ENTITY
7814            redo A;
7815          } else {
7816            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7817            $self->{state} = NOTATION_NAME_STATE;
7818            
7819        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7820          $self->{line_prev} = $self->{line};
7821          $self->{column_prev} = $self->{column};
7822          $self->{column}++;
7823          $self->{nc}
7824              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7825        } else {
7826          $self->{set_nc}->($self);
7827        }
7828      
7829            redo A;
7830          }
7831        } elsif ($self->{state} == NOTATION_NAME_STATE) {
7832          if ($is_space->{$self->{nc}}) {
7833            $self->{state} = AFTER_MD_DEF_STATE;
7834            
7835        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7836          $self->{line_prev} = $self->{line};
7837          $self->{column_prev} = $self->{column};
7838          $self->{column}++;
7839          $self->{nc}
7840              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7841        } else {
7842          $self->{set_nc}->($self);
7843        }
7844      
7845            redo A;
7846          } elsif ($self->{nc} == 0x003E) { # >
7847            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7848            
7849        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7850          $self->{line_prev} = $self->{line};
7851          $self->{column_prev} = $self->{column};
7852          $self->{column}++;
7853          $self->{nc}
7854              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7855        } else {
7856          $self->{set_nc}->($self);
7857        }
7858      
7859            return  ($self->{ct}); # ENTITY
7860            redo A;
7861          } elsif ($self->{nc} == -1) {
7862            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7863            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7864            
7865        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7866          $self->{line_prev} = $self->{line};
7867          $self->{column_prev} = $self->{column};
7868          $self->{column}++;
7869          $self->{nc}
7870              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7871        } else {
7872          $self->{set_nc}->($self);
7873        }
7874      
7875            return  ($self->{ct}); # ENTITY
7876            redo A;
7877          } else {
7878            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7879            ## Stay in the state.
7880            
7881        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7882          $self->{line_prev} = $self->{line};
7883          $self->{column_prev} = $self->{column};
7884          $self->{column}++;
7885          $self->{nc}
7886              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7887        } else {
7888          $self->{set_nc}->($self);
7889        }
7890      
7891            redo A;
7892          }
7893        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7894          if ($self->{nc} == 0x0022) { # "
7895            $self->{state} = AFTER_MD_DEF_STATE;
7896            
7897        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7898          $self->{line_prev} = $self->{line};
7899          $self->{column_prev} = $self->{column};
7900          $self->{column}++;
7901          $self->{nc}
7902              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7903        } else {
7904          $self->{set_nc}->($self);
7905        }
7906      
7907            redo A;
7908          } elsif ($self->{nc} == 0x0026) { # &
7909            $self->{prev_state} = $self->{state};
7910            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7911            $self->{entity_add} = 0x0022; # "
7912            
7913        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7914          $self->{line_prev} = $self->{line};
7915          $self->{column_prev} = $self->{column};
7916          $self->{column}++;
7917          $self->{nc}
7918              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7919        } else {
7920          $self->{set_nc}->($self);
7921        }
7922      
7923            redo A;
7924    ## TODO: %
7925          } elsif ($self->{nc} == -1) {
7926            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7927            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7928            ## Reconsume.
7929            return  ($self->{ct}); # ENTITY
7930            redo A;
7931          } else {
7932            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7933            
7934        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7935          $self->{line_prev} = $self->{line};
7936          $self->{column_prev} = $self->{column};
7937          $self->{column}++;
7938          $self->{nc}
7939              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7940        } else {
7941          $self->{set_nc}->($self);
7942        }
7943      
7944            redo A;
7945          }
7946        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7947          if ($self->{nc} == 0x0027) { # '
7948            $self->{state} = AFTER_MD_DEF_STATE;
7949            
7950        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7951          $self->{line_prev} = $self->{line};
7952          $self->{column_prev} = $self->{column};
7953          $self->{column}++;
7954          $self->{nc}
7955              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7956        } else {
7957          $self->{set_nc}->($self);
7958        }
7959      
7960            redo A;
7961          } elsif ($self->{nc} == 0x0026) { # &
7962            $self->{prev_state} = $self->{state};
7963            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7964            $self->{entity_add} = 0x0027; # '
7965            
7966        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7967          $self->{line_prev} = $self->{line};
7968          $self->{column_prev} = $self->{column};
7969          $self->{column}++;
7970          $self->{nc}
7971              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7972        } else {
7973          $self->{set_nc}->($self);
7974        }
7975      
7976            redo A;
7977    ## TODO: %
7978          } elsif ($self->{nc} == -1) {
7979            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7980            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7981            ## Reconsume.
7982            return  ($self->{ct}); # ENTITY
7983            redo A;
7984          } else {
7985            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7986            
7987        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7988          $self->{line_prev} = $self->{line};
7989          $self->{column_prev} = $self->{column};
7990          $self->{column}++;
7991          $self->{nc}
7992              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7993        } else {
7994          $self->{set_nc}->($self);
7995        }
7996      
7997            redo A;
7998          }
7999        } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
8000          if ($is_space->{$self->{nc}} or
8001              {
8002                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
8003                $self->{entity_add} => 1,
8004              }->{$self->{nc}}) {
8005            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8006                            line => $self->{line_prev},
8007                            column => $self->{column_prev}
8008                                + ($self->{nc} == -1 ? 1 : 0));
8009            ## Don't consume
8010            ## Return nothing.
8011            #
8012          } elsif ($self->{nc} == 0x0023) { # #
8013            $self->{ca} = $self->{ct};
8014            $self->{state} = ENTITY_HASH_STATE;
8015            $self->{kwd} = '#';
8016            
8017        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8018          $self->{line_prev} = $self->{line};
8019          $self->{column_prev} = $self->{column};
8020          $self->{column}++;
8021          $self->{nc}
8022              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8023        } else {
8024          $self->{set_nc}->($self);
8025        }
8026      
8027            redo A;
8028          } else {
8029            #
8030          }
8031    
8032          $self->{ct}->{value} .= '&';
8033          $self->{state} = $self->{prev_state};
8034          ## Reconsume.
8035          redo A;
8036        } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
8037          if ($is_space->{$self->{nc}}) {
8038            $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8039            
8040        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8041          $self->{line_prev} = $self->{line};
8042          $self->{column_prev} = $self->{column};
8043          $self->{column}++;
8044          $self->{nc}
8045              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8046        } else {
8047          $self->{set_nc}->($self);
8048        }
8049      
8050            redo A;
8051          } elsif ($self->{nc} == 0x0028) { # (
8052            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8053            $self->{ct}->{content} = ['('];
8054            $self->{group_depth} = 1;
8055            
8056        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8057          $self->{line_prev} = $self->{line};
8058          $self->{column_prev} = $self->{column};
8059          $self->{column}++;
8060          $self->{nc}
8061              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8062        } else {
8063          $self->{set_nc}->($self);
8064        }
8065      
8066            redo A;
8067          } elsif ($self->{nc} == 0x003E) { # >
8068            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8069            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8070            
8071        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8072          $self->{line_prev} = $self->{line};
8073          $self->{column_prev} = $self->{column};
8074          $self->{column}++;
8075          $self->{nc}
8076              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8077        } else {
8078          $self->{set_nc}->($self);
8079        }
8080      
8081            return  ($self->{ct}); # ELEMENT
8082            redo A;
8083          } elsif ($self->{nc} == -1) {
8084            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8085            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8086            
8087        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8088          $self->{line_prev} = $self->{line};
8089          $self->{column_prev} = $self->{column};
8090          $self->{column}++;
8091          $self->{nc}
8092              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8093        } else {
8094          $self->{set_nc}->($self);
8095        }
8096      
8097            return  ($self->{ct}); # ELEMENT
8098            redo A;
8099          } else {
8100            $self->{ct}->{content} = [chr $self->{nc}];
8101            $self->{state} = CONTENT_KEYWORD_STATE;
8102            
8103        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8104          $self->{line_prev} = $self->{line};
8105          $self->{column_prev} = $self->{column};
8106          $self->{column}++;
8107          $self->{nc}
8108              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8109        } else {
8110          $self->{set_nc}->($self);
8111        }
8112      
8113            redo A;
8114          }
8115        } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8116          if ($is_space->{$self->{nc}}) {
8117            $self->{state} = AFTER_MD_DEF_STATE;
8118            
8119        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8120          $self->{line_prev} = $self->{line};
8121          $self->{column_prev} = $self->{column};
8122          $self->{column}++;
8123          $self->{nc}
8124              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8125        } else {
8126          $self->{set_nc}->($self);
8127        }
8128      
8129            redo A;
8130          } elsif ($self->{nc} == 0x003E) { # >
8131            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8132            
8133        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8134          $self->{line_prev} = $self->{line};
8135          $self->{column_prev} = $self->{column};
8136          $self->{column}++;
8137          $self->{nc}
8138              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8139        } else {
8140          $self->{set_nc}->($self);
8141        }
8142      
8143            return  ($self->{ct}); # ELEMENT
8144            redo A;
8145          } elsif ($self->{nc} == -1) {
8146            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8147            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8148            
8149        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8150          $self->{line_prev} = $self->{line};
8151          $self->{column_prev} = $self->{column};
8152          $self->{column}++;
8153          $self->{nc}
8154              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8155        } else {
8156          $self->{set_nc}->($self);
8157        }
8158      
8159            return  ($self->{ct}); # ELEMENT
8160            redo A;
8161          } else {
8162            $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8163            ## Stay in the state.
8164            
8165        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8166          $self->{line_prev} = $self->{line};
8167          $self->{column_prev} = $self->{column};
8168          $self->{column}++;
8169          $self->{nc}
8170              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8171        } else {
8172          $self->{set_nc}->($self);
8173        }
8174      
8175            redo A;
8176          }
8177        } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8178          if ($is_space->{$self->{nc}}) {
8179            ## Stay in the state.
8180            
8181        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8182          $self->{line_prev} = $self->{line};
8183          $self->{column_prev} = $self->{column};
8184          $self->{column}++;
8185          $self->{nc}
8186              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8187        } else {
8188          $self->{set_nc}->($self);
8189        }
8190      
8191            redo A;
8192          } elsif ($self->{nc} == 0x0028) { # (
8193            $self->{group_depth}++;
8194            push @{$self->{ct}->{content}}, chr $self->{nc};
8195            ## Stay in the state.
8196            
8197        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8198          $self->{line_prev} = $self->{line};
8199          $self->{column_prev} = $self->{column};
8200          $self->{column}++;
8201          $self->{nc}
8202              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8203        } else {
8204          $self->{set_nc}->($self);
8205        }
8206      
8207            redo A;
8208          } elsif ($self->{nc} == 0x007C or # |
8209                   $self->{nc} == 0x002C) { # ,
8210            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8211            ## Stay in the state.
8212            
8213        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8214          $self->{line_prev} = $self->{line};
8215          $self->{column_prev} = $self->{column};
8216          $self->{column}++;
8217          $self->{nc}
8218              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8219        } else {
8220          $self->{set_nc}->($self);
8221        }
8222      
8223            redo A;
8224          } elsif ($self->{nc} == 0x0029) { # )
8225            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8226            push @{$self->{ct}->{content}}, chr $self->{nc};
8227            $self->{group_depth}--;
8228            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8229            
8230        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8231          $self->{line_prev} = $self->{line};
8232          $self->{column_prev} = $self->{column};
8233          $self->{column}++;
8234          $self->{nc}
8235              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8236        } else {
8237          $self->{set_nc}->($self);
8238        }
8239      
8240            redo A;
8241          } elsif ($self->{nc} == 0x003E) { # >
8242            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8243            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8244            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8245            
8246        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8247          $self->{line_prev} = $self->{line};
8248          $self->{column_prev} = $self->{column};
8249          $self->{column}++;
8250          $self->{nc}
8251              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8252        } else {
8253          $self->{set_nc}->($self);
8254        }
8255      
8256            return  ($self->{ct}); # ELEMENT
8257            redo A;
8258          } elsif ($self->{nc} == -1) {
8259            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8260            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8261            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8262            
8263        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8264          $self->{line_prev} = $self->{line};
8265          $self->{column_prev} = $self->{column};
8266          $self->{column}++;
8267          $self->{nc}
8268              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8269        } else {
8270          $self->{set_nc}->($self);
8271        }
8272      
8273            return  ($self->{ct}); # ELEMENT
8274            redo A;
8275          } else {
8276            push @{$self->{ct}->{content}}, chr $self->{nc};
8277            $self->{state} = CM_ELEMENT_NAME_STATE;
8278            
8279        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8280          $self->{line_prev} = $self->{line};
8281          $self->{column_prev} = $self->{column};
8282          $self->{column}++;
8283          $self->{nc}
8284              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8285        } else {
8286          $self->{set_nc}->($self);
8287        }
8288      
8289            redo A;
8290          }
8291        } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8292          if ($is_space->{$self->{nc}}) {
8293            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8294            
8295        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8296          $self->{line_prev} = $self->{line};
8297          $self->{column_prev} = $self->{column};
8298          $self->{column}++;
8299          $self->{nc}
8300              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8301        } else {
8302          $self->{set_nc}->($self);
8303        }
8304      
8305            redo A;
8306          } elsif ($self->{nc} == 0x002A or # *
8307                   $self->{nc} == 0x002B or # +
8308                   $self->{nc} == 0x003F) { # ?
8309            push @{$self->{ct}->{content}}, chr $self->{nc};
8310            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8311            
8312        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8313          $self->{line_prev} = $self->{line};
8314          $self->{column_prev} = $self->{column};
8315          $self->{column}++;
8316          $self->{nc}
8317              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8318        } else {
8319          $self->{set_nc}->($self);
8320        }
8321      
8322            redo A;
8323          } elsif ($self->{nc} == 0x007C or # |
8324                   $self->{nc} == 0x002C) { # ,
8325            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8326            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8327            
8328        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8329          $self->{line_prev} = $self->{line};
8330          $self->{column_prev} = $self->{column};
8331          $self->{column}++;
8332          $self->{nc}
8333              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8334        } else {
8335          $self->{set_nc}->($self);
8336        }
8337      
8338            redo A;
8339          } elsif ($self->{nc} == 0x0029) { # )
8340            $self->{group_depth}--;
8341            push @{$self->{ct}->{content}}, chr $self->{nc};
8342            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8343            
8344        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8345          $self->{line_prev} = $self->{line};
8346          $self->{column_prev} = $self->{column};
8347          $self->{column}++;
8348          $self->{nc}
8349              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8350        } else {
8351          $self->{set_nc}->($self);
8352        }
8353      
8354            redo A;
8355          } elsif ($self->{nc} == 0x003E) { # >
8356            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8357            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8358            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8359            
8360        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8361          $self->{line_prev} = $self->{line};
8362          $self->{column_prev} = $self->{column};
8363          $self->{column}++;
8364          $self->{nc}
8365              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8366        } else {
8367          $self->{set_nc}->($self);
8368        }
8369      
8370            return  ($self->{ct}); # ELEMENT
8371            redo A;
8372          } elsif ($self->{nc} == -1) {
8373            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8374            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8375            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8376            
8377        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8378          $self->{line_prev} = $self->{line};
8379          $self->{column_prev} = $self->{column};
8380          $self->{column}++;
8381          $self->{nc}
8382              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8383        } else {
8384          $self->{set_nc}->($self);
8385        }
8386      
8387            return  ($self->{ct}); # ELEMENT
8388            redo A;
8389          } else {
8390            $self->{ct}->{content}->[-1] .= chr $self->{nc};
8391            ## Stay in the state.
8392            
8393        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8394          $self->{line_prev} = $self->{line};
8395          $self->{column_prev} = $self->{column};
8396          $self->{column}++;
8397          $self->{nc}
8398              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8399        } else {
8400          $self->{set_nc}->($self);
8401        }
8402      
8403            redo A;
8404          }
8405        } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8406          if ($is_space->{$self->{nc}}) {
8407            ## Stay in the state.
8408            
8409        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8410          $self->{line_prev} = $self->{line};
8411          $self->{column_prev} = $self->{column};
8412          $self->{column}++;
8413          $self->{nc}
8414              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8415        } else {
8416          $self->{set_nc}->($self);
8417        }
8418      
8419            redo A;
8420          } elsif ($self->{nc} == 0x007C or # |
8421                   $self->{nc} == 0x002C) { # ,
8422            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8423            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8424            
8425        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8426          $self->{line_prev} = $self->{line};
8427          $self->{column_prev} = $self->{column};
8428          $self->{column}++;
8429          $self->{nc}
8430              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8431        } else {
8432          $self->{set_nc}->($self);
8433        }
8434      
8435            redo A;
8436          } elsif ($self->{nc} == 0x0029) { # )
8437            $self->{group_depth}--;
8438            push @{$self->{ct}->{content}}, chr $self->{nc};
8439            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8440            
8441        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8442          $self->{line_prev} = $self->{line};
8443          $self->{column_prev} = $self->{column};
8444          $self->{column}++;
8445          $self->{nc}
8446              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8447        } else {
8448          $self->{set_nc}->($self);
8449        }
8450      
8451            redo A;
8452          } elsif ($self->{nc} == 0x003E) { # >
8453            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8454            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8455            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8456            
8457        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8458          $self->{line_prev} = $self->{line};
8459          $self->{column_prev} = $self->{column};
8460          $self->{column}++;
8461          $self->{nc}
8462              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8463        } else {
8464          $self->{set_nc}->($self);
8465        }
8466      
8467            return  ($self->{ct}); # ELEMENT
8468            redo A;
8469          } elsif ($self->{nc} == -1) {
8470            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8471            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8472            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8473            
8474        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8475          $self->{line_prev} = $self->{line};
8476          $self->{column_prev} = $self->{column};
8477          $self->{column}++;
8478          $self->{nc}
8479              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8480        } else {
8481          $self->{set_nc}->($self);
8482        }
8483      
8484            return  ($self->{ct}); # ELEMENT
8485            redo A;
8486          } else {
8487            $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8488            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8489            $self->{state} = BOGUS_MD_STATE;
8490            
8491        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8492          $self->{line_prev} = $self->{line};
8493          $self->{column_prev} = $self->{column};
8494          $self->{column}++;
8495          $self->{nc}
8496              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8497        } else {
8498          $self->{set_nc}->($self);
8499        }
8500      
8501            redo A;
8502          }
8503        } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8504          if ($is_space->{$self->{nc}}) {
8505            if ($self->{group_depth}) {
8506              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8507            } else {
8508              $self->{state} = AFTER_MD_DEF_STATE;
8509            }
8510            
8511        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8512          $self->{line_prev} = $self->{line};
8513          $self->{column_prev} = $self->{column};
8514          $self->{column}++;
8515          $self->{nc}
8516              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8517        } else {
8518          $self->{set_nc}->($self);
8519        }
8520      
8521            redo A;
8522          } elsif ($self->{nc} == 0x002A or # *
8523                   $self->{nc} == 0x002B or # +
8524                   $self->{nc} == 0x003F) { # ?
8525            push @{$self->{ct}->{content}}, chr $self->{nc};
8526            if ($self->{group_depth}) {
8527              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8528            } else {
8529              $self->{state} = AFTER_MD_DEF_STATE;
8530            }
8531            
8532        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8533          $self->{line_prev} = $self->{line};
8534          $self->{column_prev} = $self->{column};
8535          $self->{column}++;
8536          $self->{nc}
8537              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8538        } else {
8539          $self->{set_nc}->($self);
8540        }
8541      
8542            redo A;
8543          } elsif ($self->{nc} == 0x0029) { # )
8544            if ($self->{group_depth}) {
8545              $self->{group_depth}--;
8546              push @{$self->{ct}->{content}}, chr $self->{nc};
8547              ## Stay in the state.
8548              
8549        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8550          $self->{line_prev} = $self->{line};
8551          $self->{column_prev} = $self->{column};
8552          $self->{column}++;
8553          $self->{nc}
8554              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8555        } else {
8556          $self->{set_nc}->($self);
8557        }
8558      
8559              redo A;
8560            } else {
8561              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8562              $self->{state} = BOGUS_MD_STATE;
8563              ## Reconsume.
8564              redo A;
8565            }
8566          } elsif ($self->{nc} == 0x003E) { # >
8567            if ($self->{group_depth}) {
8568              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8569              push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8570            }
8571            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8572            
8573        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8574          $self->{line_prev} = $self->{line};
8575          $self->{column_prev} = $self->{column};
8576          $self->{column}++;
8577          $self->{nc}
8578              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8579        } else {
8580          $self->{set_nc}->($self);
8581        }
8582      
8583            return  ($self->{ct}); # ELEMENT
8584            redo A;
8585          } elsif ($self->{nc} == -1) {
8586            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8587            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8588            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8589            
8590        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8591          $self->{line_prev} = $self->{line};
8592          $self->{column_prev} = $self->{column};
8593          $self->{column}++;
8594          $self->{nc}
8595              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8596        } else {
8597          $self->{set_nc}->($self);
8598        }
8599      
8600            return  ($self->{ct}); # ELEMENT
8601            redo A;
8602          } else {
8603            if ($self->{group_depth}) {
8604              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8605            } else {
8606              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8607              $self->{state} = BOGUS_MD_STATE;
8608            }
8609            ## Reconsume.
8610            redo A;
8611          }
8612        } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8613          if ($is_space->{$self->{nc}}) {
8614            ## Stay in the state.
8615            
8616        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8617          $self->{line_prev} = $self->{line};
8618          $self->{column_prev} = $self->{column};
8619          $self->{column}++;
8620          $self->{nc}
8621              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8622        } else {
8623          $self->{set_nc}->($self);
8624        }
8625      
8626            redo A;
8627          } elsif ($self->{nc} == 0x003E) { # >
8628            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8629            
8630        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8631          $self->{line_prev} = $self->{line};
8632          $self->{column_prev} = $self->{column};
8633          $self->{column}++;
8634          $self->{nc}
8635              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8636        } else {
8637          $self->{set_nc}->($self);
8638        }
8639      
8640            return  ($self->{ct}); # ENTITY/ELEMENT
8641            redo A;
8642          } elsif ($self->{nc} == -1) {
8643            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8644            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8645            
8646        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8647          $self->{line_prev} = $self->{line};
8648          $self->{column_prev} = $self->{column};
8649          $self->{column}++;
8650          $self->{nc}
8651              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8652        } else {
8653          $self->{set_nc}->($self);
8654        }
8655      
8656            return  ($self->{ct}); # ENTITY/ELEMENT
8657            redo A;
8658          } else {
8659            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8660            $self->{state} = BOGUS_MD_STATE;
8661            ## Reconsume.
8662            redo A;
8663          }
8664        } elsif ($self->{state} == BOGUS_MD_STATE) {
8665          if ($self->{nc} == 0x003E) { # >
8666            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8667            
8668        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8669          $self->{line_prev} = $self->{line};
8670          $self->{column_prev} = $self->{column};
8671          $self->{column}++;
8672          $self->{nc}
8673              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8674        } else {
8675          $self->{set_nc}->($self);
8676        }
8677      
8678            return  ($self->{ct}); # ATTLIST/ELEMENT/ENTITY/NOTATION
8679            redo A;
8680          } elsif ($self->{nc} == -1) {
8681            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8682            ## Reconsume.
8683            return  ($self->{ct}); # ATTLIST/ELEMENT/ENTITY/NOTATION
8684            redo A;
8685          } else {
8686            ## Stay in the state.
8687            
8688        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8689          $self->{line_prev} = $self->{line};
8690          $self->{column_prev} = $self->{column};
8691          $self->{column}++;
8692          $self->{nc}
8693              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8694        } else {
8695          $self->{set_nc}->($self);
8696        }
8697      
8698            redo A;
8699          }
8700      } else {      } else {
8701        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
8702      }      }
# Line 4252  sub _get_next_token ($) { Line 8707  sub _get_next_token ($) {
8707    
8708  1;  1;
8709  ## $Date$  ## $Date$
8710                                    

Legend:
Removed from v.1.5  
changed lines
  Added in v.1.30

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24