/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.5 by wakaba, Tue Oct 14 14:38:59 2008 UTC revision 1.25 by wakaba, Sun Oct 19 15:17:01 2008 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24    );    );
25        
26    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 33  BEGIN {
33        CHARACTER_TOKEN        CHARACTER_TOKEN
34        PI_TOKEN        PI_TOKEN
35        ABORT_TOKEN        ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42      )],      )],
43    );    );
44  }  }
45    
46    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48  ## Token types  ## Token types
49    
50  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
52  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
53  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
54  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
55  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
56  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
57  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65    ## XML5: XML5 has "empty tag token".  In this implementation, it is
66    ## represented as a start tag token with $self->{self_closing} flag
67    ## set to true.
68    
69    ## XML5: XML5 has "short end tag token".  In this implementation, it
70    ## is represented as an end tag token with $token->{tag_name} set
71    ## to an empty string.
72    
73  package Whatpm::HTML;  package Whatpm::HTML;
74    
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 142  sub HEXREF_HEX_STATE () { 48 }
142  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
143  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145    ## XML-only states
146    sub PI_STATE () { 51 }
147    sub PI_TARGET_STATE () { 52 }
148    sub PI_TARGET_AFTER_STATE () { 53 }
149    sub PI_DATA_STATE () { 54 }
150    sub PI_AFTER_STATE () { 55 }
151    sub PI_DATA_AFTER_STATE () { 56 }
152    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155    sub DOCTYPE_TAG_STATE () { 60 }
156    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157    sub MD_ATTLIST_STATE () { 62 }
158    sub MD_E_STATE () { 63 }
159    sub MD_ELEMENT_STATE () { 64 }
160    sub MD_ENTITY_STATE () { 65 }
161    sub MD_NOTATION_STATE () { 66 }
162    sub DOCTYPE_MD_STATE () { 67 }
163    sub BEFORE_MD_NAME_STATE () { 68 }
164    sub MD_NAME_STATE () { 69 }
165    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171    sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172    sub ALLOWED_TOKEN_STATE () { 77 }
173    sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174    sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175    sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179    sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180    sub BEFORE_NDATA_STATE () { 85 }
181    sub NDATA_STATE () { 86 }
182    sub AFTER_NDATA_STATE () { 87 }
183    sub BEFORE_NOTATION_NAME_STATE () { 88 }
184    sub NOTATION_NAME_STATE () { 89 }
185    sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186    sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187    sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188    sub AFTER_ELEMENT_NAME_STATE () { 93 }
189    sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190    sub CONTENT_KEYWORD_STATE () { 95 }
191    sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192    sub CM_ELEMENT_NAME_STATE () { 97 }
193    sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194    sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195    sub AFTER_MD_DEF_STATE () { 100 }
196    sub BOGUS_MD_STATE () { 101 }
197    
198  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
199  ## list and descriptions)  ## list and descriptions)
200    
# Line 178  sub _initialize_tokenizer ($) { Line 259  sub _initialize_tokenizer ($) {
259    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
260    
261    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
262    $self->{s_kwd} = ''; # state keyword    $self->{s_kwd} = ''; # Data state keyword
263      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
265    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
266    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 208  sub _initialize_tokenizer ($) { Line 290  sub _initialize_tokenizer ($) {
290    
291  ## A token has:  ## A token has:
292  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
295  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296    ##   ->{target} (PI_TOKEN)
297  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
298  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
299  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 218  sub _initialize_tokenizer ($) { Line 301  sub _initialize_tokenizer ($) {
301  ##        ->{name}  ##        ->{name}
302  ##        ->{value}  ##        ->{value}
303  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
304  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
305    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308    ##   ->{has_internal_subset} == 1 or 0 (DOCTYPE_TOKEN)
309    
310  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
311  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
312  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 238  my $is_space = { Line 326  my $is_space = {
326    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
327    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
328    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
329    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
330    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
331    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
332  };  };
# Line 498  sub _get_next_token ($) { Line 586  sub _get_next_token ($) {
586        return  ($token);        return  ($token);
587        redo A;        redo A;
588      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
589          ## XML5: "tag state".
590    
591        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
593                        
# Line 516  sub _get_next_token ($) { Line 606  sub _get_next_token ($) {
606            redo A;            redo A;
607          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
608                        
609            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
610            #            #
611          } else {          } else {
612                        
613              $self->{s_kwd} = '';
614            #            #
615          }          }
616    
617          ## reconsume          ## reconsume
618          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
         $self->{s_kwd} = '';  
619          return  ({type => CHARACTER_TOKEN, data => '<',          return  ({type => CHARACTER_TOKEN, data => '<',
620                    line => $self->{line_prev},                    line => $self->{line_prev},
621                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 629  sub _get_next_token ($) { Line 719  sub _get_next_token ($) {
719    
720            redo A;            redo A;
721          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
722                        if ($self->{is_xml}) {
723            $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',              
724                            line => $self->{line_prev},              $self->{state} = PI_STATE;
725                            column => $self->{column_prev});              
726            $self->{state} = BOGUS_COMMENT_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727            $self->{ct} = {type => COMMENT_TOKEN, data => '',        $self->{line_prev} = $self->{line};
728                                      line => $self->{line_prev},        $self->{column_prev} = $self->{column};
729                                      column => $self->{column_prev},        $self->{column}++;
730                                     };        $self->{nc}
731            ## $self->{nc} is intentionally left as is            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732            redo A;      } else {
733          } else {        $self->{set_nc}->($self);
734        }
735      
736                redo A;
737              } else {
738                
739                $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740                                line => $self->{line_prev},
741                                column => $self->{column_prev});
742                $self->{state} = BOGUS_COMMENT_STATE;
743                $self->{ct} = {type => COMMENT_TOKEN, data => '',
744                               line => $self->{line_prev},
745                               column => $self->{column_prev},
746                              };
747                ## $self->{nc} is intentionally left as is
748                redo A;
749              }
750            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751                        
752            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753                            line => $self->{line_prev},                            line => $self->{line_prev},
# Line 655  sub _get_next_token ($) { Line 762  sub _get_next_token ($) {
762                     });                     });
763    
764            redo A;            redo A;
765            } else {
766              ## XML5: "<:" is a parse error.
767              
768              $self->{ct} = {type => START_TAG_TOKEN,
769                                        tag_name => chr ($self->{nc}),
770                                        line => $self->{line_prev},
771                                        column => $self->{column_prev}};
772              $self->{state} = TAG_NAME_STATE;
773              
774        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775          $self->{line_prev} = $self->{line};
776          $self->{column_prev} = $self->{column};
777          $self->{column}++;
778          $self->{nc}
779              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780        } else {
781          $self->{set_nc}->($self);
782        }
783      
784              redo A;
785          }          }
786        } else {        } else {
787          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 663  sub _get_next_token ($) { Line 790  sub _get_next_token ($) {
790        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
791        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792    
793          ## XML5: "end tag state".
794    
795        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
798            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799            $self->{s_kwd} = '';            $self->{kwd} = '';
800            ## Reconsume.            ## Reconsume.
801            redo A;            redo A;
802          } else {          } else {
# Line 724  sub _get_next_token ($) { Line 853  sub _get_next_token ($) {
853        
854          redo A;          redo A;
855        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
856          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
858                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
859          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
860          $self->{s_kwd} = '';          $self->{s_kwd} = '';
861                    if ($self->{is_xml}) {
862              
863              ## XML5: No parse error.
864              
865              ## NOTE: This parser raises a parse error, since it supports
866              ## XML1, not XML5.
867    
868              ## NOTE: A short end tag token.
869              my $ct = {type => END_TAG_TOKEN,
870                        tag_name => '',
871                        line => $self->{line_prev},
872                        column => $self->{column_prev} - 1,
873                       };
874              
875      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
877        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 741  sub _get_next_token ($) { Line 882  sub _get_next_token ($) {
882        $self->{set_nc}->($self);        $self->{set_nc}->($self);
883      }      }
884        
885              return  ($ct);
886            } else {
887              
888              
889        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890          $self->{line_prev} = $self->{line};
891          $self->{column_prev} = $self->{column};
892          $self->{column}++;
893          $self->{nc}
894              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895        } else {
896          $self->{set_nc}->($self);
897        }
898      
899            }
900          redo A;          redo A;
901        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
902                    
# Line 754  sub _get_next_token ($) { Line 910  sub _get_next_token ($) {
910                   });                   });
911    
912          redo A;          redo A;
913        } else {        } elsif (not $self->{is_xml} or
914                   $is_space->{$self->{nc}}) {
915                    
916          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917                            line => $self->{line_prev}, # "<" of "</"
918                            column => $self->{column_prev} - 1);
919          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
920          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
921                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 769  sub _get_next_token ($) { Line 928  sub _get_next_token ($) {
928          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
929          ## "bogus comment state" entry.          ## "bogus comment state" entry.
930          redo A;          redo A;
931          } else {
932            ## XML5: "</:" is a parse error.
933            
934            $self->{ct} = {type => END_TAG_TOKEN,
935                           tag_name => chr ($self->{nc}),
936                           line => $l, column => $c};
937            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938            
939        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940          $self->{line_prev} = $self->{line};
941          $self->{column_prev} = $self->{column};
942          $self->{column}++;
943          $self->{nc}
944              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945        } else {
946          $self->{set_nc}->($self);
947        }
948      
949            redo A;
950        }        }
951      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953        if (length $ch) {        if (length $ch) {
954          my $CH = $ch;          my $CH = $ch;
955          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 779  sub _get_next_token ($) { Line 957  sub _get_next_token ($) {
957          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
958                        
959            ## Stay in the state.            ## Stay in the state.
960            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
961                        
962      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 798  sub _get_next_token ($) { Line 976  sub _get_next_token ($) {
976            $self->{s_kwd} = '';            $self->{s_kwd} = '';
977            ## Reconsume.            ## Reconsume.
978            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
979                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
980                      line => $self->{line_prev},                      line => $self->{line_prev},
981                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
982                     });                     });
983            redo A;            redo A;
984          }          }
# Line 816  sub _get_next_token ($) { Line 994  sub _get_next_token ($) {
994            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
995            $self->{s_kwd} = '';            $self->{s_kwd} = '';
996            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
997                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
998                      line => $self->{line_prev},                      line => $self->{line_prev},
999                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
1000                     });                     });
1001            redo A;            redo A;
1002          } else {          } else {
# Line 827  sub _get_next_token ($) { Line 1005  sub _get_next_token ($) {
1005                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
1006                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
1007                   line => $self->{line_prev},                   line => $self->{line_prev},
1008                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
1009            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
1010            ## Reconsume.            ## Reconsume.
1011            redo A;            redo A;
# Line 959  sub _get_next_token ($) { Line 1137  sub _get_next_token ($) {
1137          redo A;          redo A;
1138        }        }
1139      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140          ## XML5: "Tag attribute name before state".
1141    
1142        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1143                    
1144          ## Stay in the state          ## Stay in the state
# Line 1071  sub _get_next_token ($) { Line 1251  sub _get_next_token ($) {
1251               0x003D => 1, # =               0x003D => 1, # =
1252              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1253                        
1254              ## XML5: Not a parse error.
1255            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1256          } else {          } else {
1257                        
1258              ## XML5: ":" raises a parse error and is ignored.
1259          }          }
1260          $self->{ca}          $self->{ca}
1261              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 1094  sub _get_next_token ($) { Line 1276  sub _get_next_token ($) {
1276          redo A;          redo A;
1277        }        }
1278      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1279          ## XML5: "Tag attribute name state".
1280    
1281        my $before_leave = sub {        my $before_leave = sub {
1282          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
1283              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 1104  sub _get_next_token ($) { Line 1288  sub _get_next_token ($) {
1288                        
1289            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
1290              = $self->{ca};              = $self->{ca};
1291              $self->{ca}->{index} = ++$self->{ct}->{last_index};
1292          }          }
1293        }; # $before_leave        }; # $before_leave
1294    
# Line 1140  sub _get_next_token ($) { Line 1325  sub _get_next_token ($) {
1325        
1326          redo A;          redo A;
1327        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1328            if ($self->{is_xml}) {
1329              
1330              ## XML5: Not a parse error.
1331              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1332            } else {
1333              
1334            }
1335    
1336          $before_leave->();          $before_leave->();
1337          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338                        
# Line 1189  sub _get_next_token ($) { Line 1382  sub _get_next_token ($) {
1382        
1383          redo A;          redo A;
1384        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1385            if ($self->{is_xml}) {
1386              
1387              ## XML5: Not a parse error.
1388              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1389            } else {
1390              
1391            }
1392                    
1393          $before_leave->();          $before_leave->();
1394          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
# Line 1233  sub _get_next_token ($) { Line 1433  sub _get_next_token ($) {
1433          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1434              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1435                        
1436              ## XML5: Not a parse error.
1437            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1438          } else {          } else {
1439                        
# Line 1253  sub _get_next_token ($) { Line 1454  sub _get_next_token ($) {
1454          redo A;          redo A;
1455        }        }
1456      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1457          ## XML5: "Tag attribute name after state".
1458          
1459        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1460                    
1461          ## Stay in the state          ## Stay in the state
# Line 1284  sub _get_next_token ($) { Line 1487  sub _get_next_token ($) {
1487        
1488          redo A;          redo A;
1489        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1490            if ($self->{is_xml}) {
1491              
1492              ## XML5: Not a parse error.
1493              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494            } else {
1495              
1496            }
1497    
1498          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499                        
1500            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1337  sub _get_next_token ($) { Line 1548  sub _get_next_token ($) {
1548        
1549          redo A;          redo A;
1550        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1551            if ($self->{is_xml}) {
1552              
1553              ## XML5: Not a parse error.
1554              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1555            } else {
1556              
1557            }
1558                    
1559          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1560                    
# Line 1376  sub _get_next_token ($) { Line 1594  sub _get_next_token ($) {
1594    
1595          redo A;          redo A;
1596        } else {        } else {
1597            if ($self->{is_xml}) {
1598              
1599              ## XML5: Not a parse error.
1600              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1601            } else {
1602              
1603            }
1604    
1605          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1606              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1607                        
1608              ## XML5: Not a parse error.
1609            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1610          } else {          } else {
1611                        
# Line 1402  sub _get_next_token ($) { Line 1629  sub _get_next_token ($) {
1629          redo A;                  redo A;        
1630        }        }
1631      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1632          ## XML5: "Tag attribute value before state".
1633    
1634        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1635                    
1636          ## Stay in the state          ## Stay in the state
# Line 1513  sub _get_next_token ($) { Line 1742  sub _get_next_token ($) {
1742        } else {        } else {
1743          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1744                        
1745              ## XML5: Not a parse error.
1746            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1747            } elsif ($self->{is_xml}) {
1748              
1749              ## XML5: No parse error.
1750              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1751          } else {          } else {
1752                        
1753          }          }
# Line 1533  sub _get_next_token ($) { Line 1767  sub _get_next_token ($) {
1767          redo A;          redo A;
1768        }        }
1769      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1770          ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1771          ## ATTLIST attribute value double quoted state".
1772          
1773        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1774                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1775          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1776              ## XML5: "DOCTYPE ATTLIST name after state".
1777              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1778              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1779            } else {
1780              
1781              ## XML5: "Tag attribute name before state".
1782              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1783            }
1784                    
1785      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1786        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1550  sub _get_next_token ($) { Line 1795  sub _get_next_token ($) {
1795          redo A;          redo A;
1796        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1797                    
1798            ## XML5: Not defined yet.
1799    
1800          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1801          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1802          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1569  sub _get_next_token ($) { Line 1816  sub _get_next_token ($) {
1816      }      }
1817        
1818          redo A;          redo A;
1819          } elsif ($self->{is_xml} and
1820                   $is_space->{$self->{nc}}) {
1821            
1822            $self->{ca}->{value} .= ' ';
1823            ## Stay in the state.
1824            
1825        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1826          $self->{line_prev} = $self->{line};
1827          $self->{column_prev} = $self->{column};
1828          $self->{column}++;
1829          $self->{nc}
1830              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1831        } else {
1832          $self->{set_nc}->($self);
1833        }
1834      
1835            redo A;
1836        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1837          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1838          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1839                        
1840            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1841    
1842              $self->{state} = DATA_STATE;
1843              $self->{s_kwd} = '';
1844              ## reconsume
1845              return  ($self->{ct}); # start tag
1846              redo A;
1847          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1848            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1849            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1583  sub _get_next_token ($) { Line 1853  sub _get_next_token ($) {
1853              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1854                            
1855            }            }
1856    
1857              $self->{state} = DATA_STATE;
1858              $self->{s_kwd} = '';
1859              ## reconsume
1860              return  ($self->{ct}); # end tag
1861              redo A;
1862            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1863              ## XML5: No parse error above; not defined yet.
1864              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1865              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1866              ## Reconsume.
1867              return  ($self->{ct}); # ATTLIST
1868              redo A;
1869          } else {          } else {
1870            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1871          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
1872        } else {        } else {
1873                    ## XML5 [ATTLIST]: Not defined yet.
1874            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1875              
1876              ## XML5: Not a parse error.
1877              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1878            } else {
1879              
1880            }
1881          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1882          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1883                                q["&],                                qq["&<\x09\x0C\x20],
1884                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1885    
1886          ## Stay in the state          ## Stay in the state
# Line 1615  sub _get_next_token ($) { Line 1898  sub _get_next_token ($) {
1898          redo A;          redo A;
1899        }        }
1900      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1901          ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1902          ## ATTLIST attribute value single quoted state".
1903    
1904        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1905                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1906          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1907              ## XML5: "DOCTYPE ATTLIST name after state".
1908              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1909              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1910            } else {
1911              
1912              ## XML5: "Before attribute name state" (sic).
1913              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1914            }
1915                    
1916      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1917        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1632  sub _get_next_token ($) { Line 1926  sub _get_next_token ($) {
1926          redo A;          redo A;
1927        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1928                    
1929            ## XML5: Not defined yet.
1930    
1931          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1932          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1933          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1651  sub _get_next_token ($) { Line 1947  sub _get_next_token ($) {
1947      }      }
1948        
1949          redo A;          redo A;
1950          } elsif ($self->{is_xml} and
1951                   $is_space->{$self->{nc}}) {
1952            
1953            $self->{ca}->{value} .= ' ';
1954            ## Stay in the state.
1955            
1956        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1957          $self->{line_prev} = $self->{line};
1958          $self->{column_prev} = $self->{column};
1959          $self->{column}++;
1960          $self->{nc}
1961              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1962        } else {
1963          $self->{set_nc}->($self);
1964        }
1965      
1966            redo A;
1967        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1968          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1969          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1970                        
1971            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1972    
1973              $self->{state} = DATA_STATE;
1974              $self->{s_kwd} = '';
1975              ## reconsume
1976              return  ($self->{ct}); # start tag
1977              redo A;
1978          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1979            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1980            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1665  sub _get_next_token ($) { Line 1984  sub _get_next_token ($) {
1984              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1985                            
1986            }            }
1987    
1988              $self->{state} = DATA_STATE;
1989              $self->{s_kwd} = '';
1990              ## reconsume
1991              return  ($self->{ct}); # end tag
1992              redo A;
1993            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1994              ## XML5: No parse error above; not defined yet.
1995              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1996              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1997              ## Reconsume.
1998              return  ($self->{ct}); # ATTLIST
1999              redo A;
2000          } else {          } else {
2001            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2002          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
2003        } else {        } else {
2004                    ## XML5 [ATTLIST]: Not defined yet.
2005            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2006              
2007              ## XML5: Not a parse error.
2008              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2009            } else {
2010              
2011            }
2012          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2013          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2014                                q['&],                                qq['&<\x09\x0C\x20],
2015                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2016    
2017          ## Stay in the state          ## Stay in the state
# Line 1697  sub _get_next_token ($) { Line 2029  sub _get_next_token ($) {
2029          redo A;          redo A;
2030        }        }
2031      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2032          ## XML5: "Tag attribute value unquoted state".
2033    
2034        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2035                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2036          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;            
2037              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2038              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2039            } else {
2040              
2041              ## XML5: "Tag attribute name before state".
2042              $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2043            }
2044                    
2045      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2046        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1714  sub _get_next_token ($) { Line 2055  sub _get_next_token ($) {
2055          redo A;          redo A;
2056        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
2057                    
2058    
2059            ## XML5: Not defined yet.
2060    
2061          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
2062          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
2063          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1737  sub _get_next_token ($) { Line 2081  sub _get_next_token ($) {
2081          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2082                        
2083            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2084    
2085              $self->{state} = DATA_STATE;
2086              $self->{s_kwd} = '';
2087              
2088        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2089          $self->{line_prev} = $self->{line};
2090          $self->{column_prev} = $self->{column};
2091          $self->{column}++;
2092          $self->{nc}
2093              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2094        } else {
2095          $self->{set_nc}->($self);
2096        }
2097      
2098              return  ($self->{ct}); # start tag
2099              redo A;
2100          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2101            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2102            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1746  sub _get_next_token ($) { Line 2106  sub _get_next_token ($) {
2106              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2107                            
2108            }            }
2109          } else {  
2110            die "$0: $self->{ct}->{type}: Unknown token type";            $self->{state} = DATA_STATE;
2111          }            $self->{s_kwd} = '';
2112          $self->{state} = DATA_STATE;            
         $self->{s_kwd} = '';  
           
2113      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2114        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
2115        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 1762  sub _get_next_token ($) { Line 2120  sub _get_next_token ($) {
2120        $self->{set_nc}->($self);        $self->{set_nc}->($self);
2121      }      }
2122        
2123              return  ($self->{ct}); # end tag
2124          return  ($self->{ct}); # start tag or end tag            redo A;
2125            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2126          redo A;            push @{$self->{ct}->{attrdefs}}, $self->{ca};
2127              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2128              
2129        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2130          $self->{line_prev} = $self->{line};
2131          $self->{column_prev} = $self->{column};
2132          $self->{column}++;
2133          $self->{nc}
2134              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2135        } else {
2136          $self->{set_nc}->($self);
2137        }
2138      
2139              return  ($self->{ct}); # ATTLIST
2140              redo A;
2141            } else {
2142              die "$0: $self->{ct}->{type}: Unknown token type";
2143            }
2144        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');  
2145          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2146                        
2147              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2148            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2149    
2150              $self->{state} = DATA_STATE;
2151              $self->{s_kwd} = '';
2152              ## reconsume
2153              return  ($self->{ct}); # start tag
2154              redo A;
2155          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2156              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2157            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2158            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
2159                            
# Line 1780  sub _get_next_token ($) { Line 2162  sub _get_next_token ($) {
2162              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2163                            
2164            }            }
2165    
2166              $self->{state} = DATA_STATE;
2167              $self->{s_kwd} = '';
2168              ## reconsume
2169              return  ($self->{ct}); # end tag
2170              redo A;
2171            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2172              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2173              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2174              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2175              ## Reconsume.
2176              return  ($self->{ct}); # ATTLIST
2177              redo A;
2178          } else {          } else {
2179            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2180          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
2181        } else {        } else {
2182          if ({          if ({
2183               0x0022 => 1, # "               0x0022 => 1, # "
# Line 1797  sub _get_next_token ($) { Line 2185  sub _get_next_token ($) {
2185               0x003D => 1, # =               0x003D => 1, # =
2186              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2187                        
2188              ## XML5: Not a parse error.
2189            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2190          } else {          } else {
2191                        
2192          }          }
2193          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2194          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2195                                q["'=& >],                                qq["'=& \x09\x0C>],
2196                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2197    
2198          ## Stay in the state          ## Stay in the state
# Line 1913  sub _get_next_token ($) { Line 2302  sub _get_next_token ($) {
2302          redo A;          redo A;
2303        }        }
2304      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2305          ## XML5: "Empty tag state".
2306    
2307        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2308          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
2309                        
# Line 1964  sub _get_next_token ($) { Line 2355  sub _get_next_token ($) {
2355          } else {          } else {
2356            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2357          }          }
2358            ## XML5: "Tag attribute name before state".
2359          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2360          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2361          ## Reconsume.          ## Reconsume.
# Line 1978  sub _get_next_token ($) { Line 2370  sub _get_next_token ($) {
2370          redo A;          redo A;
2371        }        }
2372      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2373        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2374    
2375        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
2376        ## consumes characters one-by-one basis.        ## consumes characters one-by-one basis.
2377                
2378        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2379                    if ($self->{in_subset}) {
2380          $self->{state} = DATA_STATE;            
2381          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2382            } else {
2383              
2384              $self->{state} = DATA_STATE;
2385              $self->{s_kwd} = '';
2386            }
2387                    
2388      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2389        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2002  sub _get_next_token ($) { Line 2399  sub _get_next_token ($) {
2399          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
2400          redo A;          redo A;
2401        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2402                    if ($self->{in_subset}) {
2403          $self->{state} = DATA_STATE;            
2404          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2405            } else {
2406              
2407              $self->{state} = DATA_STATE;
2408              $self->{s_kwd} = '';
2409            }
2410          ## reconsume          ## reconsume
2411    
2412          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2031  sub _get_next_token ($) { Line 2433  sub _get_next_token ($) {
2433          redo A;          redo A;
2434        }        }
2435      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2436        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
2437                
2438        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2439                    
# Line 2053  sub _get_next_token ($) { Line 2455  sub _get_next_token ($) {
2455          ## ASCII case-insensitive.          ## ASCII case-insensitive.
2456                    
2457          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
2458          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2459                    
2460      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2461        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2072  sub _get_next_token ($) { Line 2474  sub _get_next_token ($) {
2474                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2475                                                    
2476          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
2477          $self->{s_kwd} = '[';          $self->{kwd} = '[';
2478                    
2479      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2480        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2106  sub _get_next_token ($) { Line 2508  sub _get_next_token ($) {
2508                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2509                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
2510                                   };                                   };
2511          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2512                    
2513      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2514        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2142  sub _get_next_token ($) { Line 2544  sub _get_next_token ($) {
2544              0x0054, # T              0x0054, # T
2545              0x0059, # Y              0x0059, # Y
2546              0x0050, # P              0x0050, # P
2547            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2548            $self->{nc} == [            $self->{nc} == [
2549              undef,              undef,
2550              0x006F, # o              0x006F, # o
# Line 2150  sub _get_next_token ($) { Line 2552  sub _get_next_token ($) {
2552              0x0074, # t              0x0074, # t
2553              0x0079, # y              0x0079, # y
2554              0x0070, # p              0x0070, # p
2555            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2556                    
2557          ## Stay in the state.          ## Stay in the state.
2558          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2559                    
2560      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2561        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2166  sub _get_next_token ($) { Line 2568  sub _get_next_token ($) {
2568      }      }
2569        
2570          redo A;          redo A;
2571        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
2572                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2573                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2574                    if ($self->{is_xml} and
2575                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2576              
2577              ## XML5: case-sensitive.
2578              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2579                              text => 'DOCTYPE',
2580                              line => $self->{line_prev},
2581                              column => $self->{column_prev} - 5);
2582            } else {
2583              
2584            }
2585          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
2586          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
2587                                    quirks => 1,                                    quirks => 1,
# Line 2192  sub _get_next_token ($) { Line 2604  sub _get_next_token ($) {
2604                                    
2605          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2606                          line => $self->{line_prev},                          line => $self->{line_prev},
2607                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2608          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2609          ## Reconsume.          ## Reconsume.
2610          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2611                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2612                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2613                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2614                                   };                                   };
2615          redo A;          redo A;
2616        }        }
# Line 2209  sub _get_next_token ($) { Line 2621  sub _get_next_token ($) {
2621              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
2622              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
2623              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
2624            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
2625                    
2626          ## Stay in the state.          ## Stay in the state.
2627          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2628                    
2629      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2630        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2225  sub _get_next_token ($) { Line 2637  sub _get_next_token ($) {
2637      }      }
2638        
2639          redo A;          redo A;
2640        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
2641                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2642                    if ($self->{is_xml} and
2643                not $self->{tainted} and
2644                @{$self->{open_elements} or []} == 0) {
2645              
2646              $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2647                              line => $self->{line_prev},
2648                              column => $self->{column_prev} - 7);
2649              $self->{tainted} = 1;
2650            } else {
2651              
2652            }
2653    
2654          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
2655                                    data => '',                                    data => '',
2656                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 2249  sub _get_next_token ($) { Line 2672  sub _get_next_token ($) {
2672                    
2673          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2674                          line => $self->{line_prev},                          line => $self->{line_prev},
2675                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2676          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2677          ## Reconsume.          ## Reconsume.
2678          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2679                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2680                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2681                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2682                                   };                                   };
2683          redo A;          redo A;
2684        }        }
# Line 2276  sub _get_next_token ($) { Line 2699  sub _get_next_token ($) {
2699        
2700          redo A;          redo A;
2701        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2702          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2703          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2704          $self->{s_kwd} = '';            
2705              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2706            } else {
2707              
2708              $self->{state} = DATA_STATE;
2709              $self->{s_kwd} = '';
2710            }
2711                    
2712      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2713        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2296  sub _get_next_token ($) { Line 2724  sub _get_next_token ($) {
2724    
2725          redo A;          redo A;
2726        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2727          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2728          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2729          $self->{s_kwd} = '';            
2730              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2731            } else {
2732              
2733              $self->{state} = DATA_STATE;
2734              $self->{s_kwd} = '';
2735            }
2736          ## reconsume          ## reconsume
2737    
2738          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2340  sub _get_next_token ($) { Line 2773  sub _get_next_token ($) {
2773        
2774          redo A;          redo A;
2775        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2776          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2777          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2778          $self->{s_kwd} = '';            
2779              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2780            } else {
2781              
2782              $self->{state} = DATA_STATE;
2783              $self->{s_kwd} = '';
2784            }
2785                    
2786      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2787        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2360  sub _get_next_token ($) { Line 2798  sub _get_next_token ($) {
2798    
2799          redo A;          redo A;
2800        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2801          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2802          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2803          $self->{s_kwd} = '';            
2804              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2805            } else {
2806              
2807              $self->{state} = DATA_STATE;
2808              $self->{s_kwd} = '';
2809            }
2810          ## reconsume          ## reconsume
2811    
2812          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2388  sub _get_next_token ($) { Line 2831  sub _get_next_token ($) {
2831          redo A;          redo A;
2832        }        }
2833      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
2834          ## XML5: "Comment state" and "DOCTYPE comment state".
2835    
2836        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2837                    
2838          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
# Line 2404  sub _get_next_token ($) { Line 2849  sub _get_next_token ($) {
2849        
2850          redo A;          redo A;
2851        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2852          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2853          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2854          $self->{s_kwd} = '';            
2855              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2856            } else {
2857              
2858              $self->{state} = DATA_STATE;
2859              $self->{s_kwd} = '';
2860            }
2861          ## reconsume          ## reconsume
2862    
2863          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2435  sub _get_next_token ($) { Line 2885  sub _get_next_token ($) {
2885          redo A;          redo A;
2886        }        }
2887      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2888          ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2889    
2890        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2891                    
2892          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 2451  sub _get_next_token ($) { Line 2903  sub _get_next_token ($) {
2903        
2904          redo A;          redo A;
2905        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2906          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2907          $self->{s_kwd} = '';          if ($self->{in_subset}) {
2908          $self->{state} = DATA_STATE;            
2909          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2910            } else {
2911              
2912              $self->{state} = DATA_STATE;
2913              $self->{s_kwd} = '';
2914            }
2915          ## reconsume          ## reconsume
2916    
2917          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2479  sub _get_next_token ($) { Line 2935  sub _get_next_token ($) {
2935          redo A;          redo A;
2936        }        }
2937      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
2938          ## XML5: "Comment end state" and "DOCTYPE comment end state".
2939    
2940        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2941                    if ($self->{in_subset}) {
2942          $self->{state} = DATA_STATE;            
2943          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2944            } else {
2945              
2946              $self->{state} = DATA_STATE;
2947              $self->{s_kwd} = '';
2948            }
2949                    
2950      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2951        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2500  sub _get_next_token ($) { Line 2963  sub _get_next_token ($) {
2963          redo A;          redo A;
2964        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2965                    
2966            ## XML5: Not a parse error.
2967          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2968                          line => $self->{line_prev},                          line => $self->{line_prev},
2969                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2518  sub _get_next_token ($) { Line 2982  sub _get_next_token ($) {
2982        
2983          redo A;          redo A;
2984        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2985          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2986          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2987          $self->{s_kwd} = '';            
2988              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2989            } else {
2990              
2991              $self->{state} = DATA_STATE;
2992              $self->{s_kwd} = '';
2993            }
2994          ## reconsume          ## reconsume
2995    
2996          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2529  sub _get_next_token ($) { Line 2998  sub _get_next_token ($) {
2998          redo A;          redo A;
2999        } else {        } else {
3000                    
3001            ## XML5: Not a parse error.
3002          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
3003                          line => $self->{line_prev},                          line => $self->{line_prev},
3004                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2565  sub _get_next_token ($) { Line 3035  sub _get_next_token ($) {
3035          redo A;          redo A;
3036        } else {        } else {
3037                    
3038            ## XML5: Unless EOF, switch to the bogus comment state.
3039          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3040          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3041          ## reconsume          ## reconsume
3042          redo A;          redo A;
3043        }        }
3044      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3045          ## XML5: "DOCTYPE root name before state".
3046    
3047        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3048                    
3049          ## Stay in the state          ## Stay in the state
# Line 2588  sub _get_next_token ($) { Line 3061  sub _get_next_token ($) {
3061          redo A;          redo A;
3062        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3063                    
3064            ## XML5: No parse error.
3065          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3066          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3067          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 2616  sub _get_next_token ($) { Line 3090  sub _get_next_token ($) {
3090          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3091    
3092          redo A;          redo A;
3093          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3094            
3095            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3096            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3097            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3098            $self->{in_subset} = 1;
3099            
3100        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3101          $self->{line_prev} = $self->{line};
3102          $self->{column_prev} = $self->{column};
3103          $self->{column}++;
3104          $self->{nc}
3105              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3106        } else {
3107          $self->{set_nc}->($self);
3108        }
3109      
3110            return  ($self->{ct}); # DOCTYPE
3111            redo A;
3112        } else {        } else {
3113                    
3114          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 2635  sub _get_next_token ($) { Line 3128  sub _get_next_token ($) {
3128          redo A;          redo A;
3129        }        }
3130      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3131  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
3132    
3133          ## ISSUE: Redundant "First," in the spec.
3134    
3135        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3136                    
3137          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 2681  sub _get_next_token ($) { Line 3177  sub _get_next_token ($) {
3177          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3178    
3179          redo A;          redo A;
3180          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3181            
3182            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3183            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3184            $self->{in_subset} = 1;
3185            
3186        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3187          $self->{line_prev} = $self->{line};
3188          $self->{column_prev} = $self->{column};
3189          $self->{column}++;
3190          $self->{nc}
3191              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3192        } else {
3193          $self->{set_nc}->($self);
3194        }
3195      
3196            return  ($self->{ct}); # DOCTYPE
3197            redo A;
3198        } else {        } else {
3199                    
3200          $self->{ct}->{name}          $self->{ct}->{name}
# Line 2700  sub _get_next_token ($) { Line 3214  sub _get_next_token ($) {
3214          redo A;          redo A;
3215        }        }
3216      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3217          ## XML5: Corresponding to XML5's "DOCTYPE root name after
3218          ## state", but implemented differently.
3219    
3220        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3221                    
3222          ## Stay in the state          ## Stay in the state
# Line 2716  sub _get_next_token ($) { Line 3233  sub _get_next_token ($) {
3233        
3234          redo A;          redo A;
3235        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3236            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3237              
3238              $self->{state} = DATA_STATE;
3239              $self->{s_kwd} = '';
3240            } else {
3241              
3242              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3243              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3244            }
3245                    
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3246                    
3247      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3248        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2730  sub _get_next_token ($) { Line 3254  sub _get_next_token ($) {
3254        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3255      }      }
3256        
3257            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3258          redo A;          redo A;
3259        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3260            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3261              
3262              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3263              $self->{state} = DATA_STATE;
3264              $self->{s_kwd} = '';
3265              $self->{ct}->{quirks} = 1;
3266            } else {
3267              
3268              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3269              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3270            }
3271                    
3272          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          ## Reconsume.
3273          $self->{state} = DATA_STATE;          return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{s_kwd} = '';  
         ## reconsume  
   
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3274          redo A;          redo A;
3275        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
3276                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
3277            
3278          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
3279          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3280                    
3281      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3282        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2763  sub _get_next_token ($) { Line 3291  sub _get_next_token ($) {
3291          redo A;          redo A;
3292        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
3293                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
3294            
3295          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
3296          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3297                    
3298      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3299        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2777  sub _get_next_token ($) { Line 3306  sub _get_next_token ($) {
3306      }      }
3307        
3308          redo A;          redo A;
3309        } else {        } elsif ($self->{nc} == 0x0022 and # "
3310                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3311                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3312                    
3313          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');          $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3314          $self->{ct}->{quirks} = 1;          $self->{ct}->{value} = ''; # ENTITY
3315            
3316        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3317          $self->{line_prev} = $self->{line};
3318          $self->{column_prev} = $self->{column};
3319          $self->{column}++;
3320          $self->{nc}
3321              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3322        } else {
3323          $self->{set_nc}->($self);
3324        }
3325      
3326            redo A;
3327          } elsif ($self->{nc} == 0x0027 and # '
3328                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3329                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3330            
3331            $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3332            $self->{ct}->{value} = ''; # ENTITY
3333            
3334        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3335          $self->{line_prev} = $self->{line};
3336          $self->{column_prev} = $self->{column};
3337          $self->{column}++;
3338          $self->{nc}
3339              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3340        } else {
3341          $self->{set_nc}->($self);
3342        }
3343      
3344            redo A;
3345          } elsif ($self->{is_xml} and
3346                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3347                   $self->{nc} == 0x005B) { # [
3348            
3349            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3350            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3351            $self->{in_subset} = 1;
3352            
3353        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3354          $self->{line_prev} = $self->{line};
3355          $self->{column_prev} = $self->{column};
3356          $self->{column}++;
3357          $self->{nc}
3358              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3359        } else {
3360          $self->{set_nc}->($self);
3361        }
3362      
3363            return  ($self->{ct}); # DOCTYPE
3364            redo A;
3365          } else {
3366            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3367    
3368            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3369              
3370              $self->{ct}->{quirks} = 1;
3371              $self->{state} = BOGUS_DOCTYPE_STATE;
3372            } else {
3373              
3374              $self->{state} = BOGUS_MD_STATE;
3375            }
3376    
         $self->{state} = BOGUS_DOCTYPE_STATE;  
3377                    
3378      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3379        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2804  sub _get_next_token ($) { Line 3395  sub _get_next_token ($) {
3395              0x0042, # B              0x0042, # B
3396              0x004C, # L              0x004C, # L
3397              0x0049, # I              0x0049, # I
3398            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3399            $self->{nc} == [            $self->{nc} == [
3400              undef,              undef,
3401              0x0075, # u              0x0075, # u
3402              0x0062, # b              0x0062, # b
3403              0x006C, # l              0x006C, # l
3404              0x0069, # i              0x0069, # i
3405            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3406                    
3407          ## Stay in the state.          ## Stay in the state.
3408          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3409                    
3410      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3411        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2827  sub _get_next_token ($) { Line 3418  sub _get_next_token ($) {
3418      }      }
3419        
3420          redo A;          redo A;
3421        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3422                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
3423                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
3424                    if ($self->{is_xml} and
3425                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3426              
3427              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3428                              text => 'PUBLIC',
3429                              line => $self->{line_prev},
3430                              column => $self->{column_prev} - 4);
3431            } else {
3432              
3433            }
3434          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3435                    
3436      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2845  sub _get_next_token ($) { Line 3445  sub _get_next_token ($) {
3445        
3446          redo A;          redo A;
3447        } else {        } else {
3448                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3449                          line => $self->{line_prev},                          line => $self->{line_prev},
3450                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3451          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3452              
3453          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3454              $self->{state} = BOGUS_DOCTYPE_STATE;
3455            } else {
3456              
3457              $self->{state} = BOGUS_MD_STATE;
3458            }
3459          ## Reconsume.          ## Reconsume.
3460          redo A;          redo A;
3461        }        }
# Line 2863  sub _get_next_token ($) { Line 3467  sub _get_next_token ($) {
3467              0x0053, # S              0x0053, # S
3468              0x0054, # T              0x0054, # T
3469              0x0045, # E              0x0045, # E
3470            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3471            $self->{nc} == [            $self->{nc} == [
3472              undef,              undef,
3473              0x0079, # y              0x0079, # y
3474              0x0073, # s              0x0073, # s
3475              0x0074, # t              0x0074, # t
3476              0x0065, # e              0x0065, # e
3477            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3478                    
3479          ## Stay in the state.          ## Stay in the state.
3480          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3481                    
3482      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3483        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2886  sub _get_next_token ($) { Line 3490  sub _get_next_token ($) {
3490      }      }
3491        
3492          redo A;          redo A;
3493        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3494                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
3495                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
3496                    if ($self->{is_xml} and
3497                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3498              
3499              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3500                              text => 'SYSTEM',
3501                              line => $self->{line_prev},
3502                              column => $self->{column_prev} - 4);
3503            } else {
3504              
3505            }
3506          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3507                    
3508      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2904  sub _get_next_token ($) { Line 3517  sub _get_next_token ($) {
3517        
3518          redo A;          redo A;
3519        } else {        } else {
3520                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3521                          line => $self->{line_prev},                          line => $self->{line_prev},
3522                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3523          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3524              
3525          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3526              $self->{state} = BOGUS_DOCTYPE_STATE;
3527            } else {
3528              
3529              $self->{state} = BOGUS_MD_STATE;
3530            }
3531          ## Reconsume.          ## Reconsume.
3532          redo A;          redo A;
3533        }        }
# Line 2963  sub _get_next_token ($) { Line 3580  sub _get_next_token ($) {
3580        
3581          redo A;          redo A;
3582        } elsif ($self->{nc} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
           
3583          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3584            
3585          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3586          $self->{s_kwd} = '';            
3587              $self->{state} = DATA_STATE;
3588              $self->{s_kwd} = '';
3589              $self->{ct}->{quirks} = 1;
3590            } else {
3591              
3592              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3593            }
3594            
3595                    
3596      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3597        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2979  sub _get_next_token ($) { Line 3603  sub _get_next_token ($) {
3603        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3604      }      }
3605        
3606            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3607          redo A;          redo A;
3608        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3609            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3610              
3611              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3612              $self->{state} = DATA_STATE;
3613              $self->{s_kwd} = '';
3614              $self->{ct}->{quirks} = 1;
3615            } else {
3616              
3617              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3618              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3619            }
3620                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3621          ## reconsume          ## reconsume
   
         $self->{ct}->{quirks} = 1;  
3622          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3623          redo A;          redo A;
3624        } else {        } elsif ($self->{is_xml} and
3625                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3626                   $self->{nc} == 0x005B) { # [
3627                    
3628            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3629            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3630            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3631            $self->{in_subset} = 1;
3632            
3633        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3634          $self->{line_prev} = $self->{line};
3635          $self->{column_prev} = $self->{column};
3636          $self->{column}++;
3637          $self->{nc}
3638              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3639        } else {
3640          $self->{set_nc}->($self);
3641        }
3642      
3643            return  ($self->{ct}); # DOCTYPE
3644            redo A;
3645          } else {
3646          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
         $self->{ct}->{quirks} = 1;  
3647    
3648          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3649              
3650              $self->{ct}->{quirks} = 1;
3651              $self->{state} = BOGUS_DOCTYPE_STATE;
3652            } else {
3653              
3654              $self->{state} = BOGUS_MD_STATE;
3655            }
3656    
3657                    
3658      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3659        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3032  sub _get_next_token ($) { Line 3684  sub _get_next_token ($) {
3684        
3685          redo A;          redo A;
3686        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3687          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3688    
3689          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3690          $self->{s_kwd} = '';            
3691              $self->{state} = DATA_STATE;
3692              $self->{s_kwd} = '';
3693              $self->{ct}->{quirks} = 1;
3694            } else {
3695              
3696              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3697            }
3698    
3699                    
3700      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3701        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3048  sub _get_next_token ($) { Line 3707  sub _get_next_token ($) {
3707        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3708      }      }
3709        
3710            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3711          redo A;          redo A;
3712        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3713          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3714    
3715          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3716          $self->{s_kwd} = '';            
3717          ## reconsume            $self->{state} = DATA_STATE;
3718              $self->{s_kwd} = '';
3719          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
3720            } else {
3721              
3722              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3723            }
3724            
3725            ## Reconsume.
3726          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3727          redo A;          redo A;
3728        } else {        } else {
3729                    
3730          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3731          $self->{read_until}->($self->{ct}->{pubid}, q[">],          $self->{read_until}->($self->{ct}->{pubid}, q[">],
3732                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3733    
# Line 3103  sub _get_next_token ($) { Line 3762  sub _get_next_token ($) {
3762        
3763          redo A;          redo A;
3764        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3765          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3766    
3767          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3768          $self->{s_kwd} = '';            
3769              $self->{state} = DATA_STATE;
3770              $self->{s_kwd} = '';
3771              $self->{ct}->{quirks} = 1;
3772            } else {
3773              
3774              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3775            }
3776    
3777                    
3778      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3779        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3119  sub _get_next_token ($) { Line 3785  sub _get_next_token ($) {
3785        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3786      }      }
3787        
3788            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3789          redo A;          redo A;
3790        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3791          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3792    
3793          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3794          $self->{s_kwd} = '';            
3795              $self->{state} = DATA_STATE;
3796              $self->{s_kwd} = '';
3797              $self->{ct}->{quirks} = 1;
3798            } else {
3799              
3800              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3801            }
3802          
3803          ## reconsume          ## reconsume
3804            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3805          redo A;          redo A;
3806        } else {        } else {
3807                    
3808          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3809          $self->{read_until}->($self->{ct}->{pubid}, q['>],          $self->{read_until}->($self->{ct}->{pubid}, q['>],
3810                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3811    
# Line 3175  sub _get_next_token ($) { Line 3841  sub _get_next_token ($) {
3841          redo A;          redo A;
3842        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
3843                    
3844          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3845          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3846                    
3847      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3191  sub _get_next_token ($) { Line 3857  sub _get_next_token ($) {
3857          redo A;          redo A;
3858        } elsif ($self->{nc} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
3859                    
3860          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3861          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3862                    
3863      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3206  sub _get_next_token ($) { Line 3872  sub _get_next_token ($) {
3872        
3873          redo A;          redo A;
3874        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3875            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3876              if ($self->{is_xml}) {
3877                
3878                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3879              } else {
3880                
3881              }
3882              $self->{state} = DATA_STATE;
3883              $self->{s_kwd} = '';
3884            } else {
3885              if ($self->{ct}->{type} == NOTATION_TOKEN) {
3886                
3887              } else {
3888                
3889                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');            
3890              }
3891              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3892            }
3893                    
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3894                    
3895      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3896        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3220  sub _get_next_token ($) { Line 3902  sub _get_next_token ($) {
3902        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3903      }      }
3904        
3905            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3906          redo A;          redo A;
3907        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3908            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3909              
3910              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3911              
3912              $self->{state} = DATA_STATE;
3913              $self->{s_kwd} = '';
3914              $self->{ct}->{quirks} = 1;
3915            } else {
3916              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3917              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3918            }
3919                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3920          ## reconsume          ## reconsume
3921            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3922          $self->{ct}->{quirks} = 1;          redo A;
3923          } elsif ($self->{is_xml} and
3924                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3925                   $self->{nc} == 0x005B) { # [
3926            
3927            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3928            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3929            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3930            $self->{in_subset} = 1;
3931            
3932        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3933          $self->{line_prev} = $self->{line};
3934          $self->{column_prev} = $self->{column};
3935          $self->{column}++;
3936          $self->{nc}
3937              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3938        } else {
3939          $self->{set_nc}->($self);
3940        }
3941      
3942          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3943          redo A;          redo A;
3944        } else {        } else {
           
3945          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
         $self->{ct}->{quirks} = 1;  
3946    
3947          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3948              
3949              $self->{ct}->{quirks} = 1;
3950              $self->{state} = BOGUS_DOCTYPE_STATE;
3951            } else {
3952              
3953              $self->{state} = BOGUS_MD_STATE;
3954            }
3955    
3956                    
3957      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3958        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3304  sub _get_next_token ($) { Line 4015  sub _get_next_token ($) {
4015        
4016          redo A;          redo A;
4017        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
4018          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
4019                    
4020      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4021        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3320  sub _get_next_token ($) { Line 4028  sub _get_next_token ($) {
4028      }      }
4029        
4030    
4031          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4032          return  ($self->{ct}); # DOCTYPE            
4033              $self->{state} = DATA_STATE;
4034              $self->{s_kwd} = '';
4035              $self->{ct}->{quirks} = 1;
4036            } else {
4037              
4038              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4039            }
4040    
4041            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4042          redo A;          redo A;
4043        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4044            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4045              
4046              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4047              $self->{state} = DATA_STATE;
4048              $self->{s_kwd} = '';
4049              $self->{ct}->{quirks} = 1;
4050            } else {
4051              
4052              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4053              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4054            }
4055                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
4056          ## reconsume          ## reconsume
4057            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4058            redo A;
4059          } elsif ($self->{is_xml} and
4060                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4061                   $self->{nc} == 0x005B) { # [
4062            
4063            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4064    
4065          $self->{ct}->{quirks} = 1;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4066            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4067            $self->{in_subset} = 1;
4068            
4069        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4070          $self->{line_prev} = $self->{line};
4071          $self->{column_prev} = $self->{column};
4072          $self->{column}++;
4073          $self->{nc}
4074              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4075        } else {
4076          $self->{set_nc}->($self);
4077        }
4078      
4079          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
4080          redo A;          redo A;
4081        } else {        } else {
           
4082          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
         $self->{ct}->{quirks} = 1;  
4083    
4084          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4085                        
4086              $self->{ct}->{quirks} = 1;
4087              $self->{state} = BOGUS_DOCTYPE_STATE;
4088            } else {
4089              
4090              $self->{state} = BOGUS_MD_STATE;
4091            }
4092    
4093                    
4094      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4095        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3371  sub _get_next_token ($) { Line 4119  sub _get_next_token ($) {
4119      }      }
4120        
4121          redo A;          redo A;
4122        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
           
4123          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4124    
4125          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4126          $self->{s_kwd} = '';            
4127              $self->{state} = DATA_STATE;
4128              $self->{s_kwd} = '';
4129              $self->{ct}->{quirks} = 1;
4130            } else {
4131              
4132              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4133            }
4134            
4135                    
4136      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4137        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3388  sub _get_next_token ($) { Line 4143  sub _get_next_token ($) {
4143        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4144      }      }
4145        
4146            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4147          redo A;          redo A;
4148        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4149          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4150    
4151          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4152          $self->{s_kwd} = '';            
4153              $self->{state} = DATA_STATE;
4154              $self->{s_kwd} = '';
4155              $self->{ct}->{quirks} = 1;
4156            } else {
4157              
4158              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4159            }
4160            
4161          ## reconsume          ## reconsume
4162            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4163          redo A;          redo A;
4164        } else {        } else {
4165                    
4166          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4167          $self->{read_until}->($self->{ct}->{sysid}, q[">],          $self->{read_until}->($self->{ct}->{sysid}, q[">],
4168                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4169    
# Line 3442  sub _get_next_token ($) { Line 4197  sub _get_next_token ($) {
4197      }      }
4198        
4199          redo A;          redo A;
4200        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4201                    
4202          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4203    
# Line 3465  sub _get_next_token ($) { Line 4220  sub _get_next_token ($) {
4220    
4221          redo A;          redo A;
4222        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4223          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4224    
4225          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4226          $self->{s_kwd} = '';            
4227          ## reconsume            $self->{state} = DATA_STATE;
4228              $self->{s_kwd} = '';
4229          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
4230          return  ($self->{ct}); # DOCTYPE          } else {
4231              
4232              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4233            }
4234    
4235            ## reconsume
4236            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4237          redo A;          redo A;
4238        } else {        } else {
4239                    
4240          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4241          $self->{read_until}->($self->{ct}->{sysid}, q['>],          $self->{read_until}->($self->{ct}->{sysid}, q['>],
4242                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4243    
# Line 3499  sub _get_next_token ($) { Line 4257  sub _get_next_token ($) {
4257        }        }
4258      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4259        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4260                    if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4261          ## Stay in the state            
4262              $self->{state} = BEFORE_NDATA_STATE;
4263            } else {
4264              
4265              ## Stay in the state
4266            }
4267                    
4268      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4269        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3514  sub _get_next_token ($) { Line 4277  sub _get_next_token ($) {
4277        
4278          redo A;          redo A;
4279        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4280            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4281              
4282              $self->{state} = DATA_STATE;
4283              $self->{s_kwd} = '';
4284            } else {
4285              
4286              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4287            }
4288    
4289                    
4290          $self->{state} = DATA_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4291          $self->{s_kwd} = '';        $self->{line_prev} = $self->{line};
4292          $self->{column_prev} = $self->{column};
4293          $self->{column}++;
4294          $self->{nc}
4295              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4296        } else {
4297          $self->{set_nc}->($self);
4298        }
4299      
4300            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4301            redo A;
4302          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4303                   ($self->{nc} == 0x004E or # N
4304                    $self->{nc} == 0x006E)) { # n
4305            
4306            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4307            $self->{state} = NDATA_STATE;
4308            $self->{kwd} = chr $self->{nc};
4309                    
4310      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4311        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3528  sub _get_next_token ($) { Line 4317  sub _get_next_token ($) {
4317        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4318      }      }
4319        
4320            redo A;
4321          } elsif ($self->{nc} == -1) {
4322            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4323              
4324              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4325              $self->{state} = DATA_STATE;
4326              $self->{s_kwd} = '';
4327              $self->{ct}->{quirks} = 1;
4328            } else {
4329              
4330              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4331              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4332            }
4333    
4334            ## reconsume
4335            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4336            redo A;
4337          } elsif ($self->{is_xml} and
4338                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4339                   $self->{nc} == 0x005B) { # [
4340            
4341            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4342            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4343            $self->{in_subset} = 1;
4344            
4345        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4346          $self->{line_prev} = $self->{line};
4347          $self->{column_prev} = $self->{column};
4348          $self->{column}++;
4349          $self->{nc}
4350              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4351        } else {
4352          $self->{set_nc}->($self);
4353        }
4354      
4355          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4356            redo A;
4357          } else {
4358            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4359    
4360            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4361              
4362              #$self->{ct}->{quirks} = 1;
4363              $self->{state} = BOGUS_DOCTYPE_STATE;
4364            } else {
4365              
4366              $self->{state} = BOGUS_MD_STATE;
4367            }
4368    
4369            
4370        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4371          $self->{line_prev} = $self->{line};
4372          $self->{column_prev} = $self->{column};
4373          $self->{column}++;
4374          $self->{nc}
4375              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4376        } else {
4377          $self->{set_nc}->($self);
4378        }
4379      
4380            redo A;
4381          }
4382        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4383          if ($is_space->{$self->{nc}}) {
4384            
4385            ## Stay in the state.
4386            
4387        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4388          $self->{line_prev} = $self->{line};
4389          $self->{column_prev} = $self->{column};
4390          $self->{column}++;
4391          $self->{nc}
4392              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4393        } else {
4394          $self->{set_nc}->($self);
4395        }
4396      
4397            redo A;
4398          } elsif ($self->{nc} == 0x003E) { # >
4399            
4400            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4401            
4402        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4403          $self->{line_prev} = $self->{line};
4404          $self->{column_prev} = $self->{column};
4405          $self->{column}++;
4406          $self->{nc}
4407              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4408        } else {
4409          $self->{set_nc}->($self);
4410        }
4411      
4412            return  ($self->{ct}); # ENTITY
4413            redo A;
4414          } elsif ($self->{nc} == 0x004E or # N
4415                   $self->{nc} == 0x006E) { # n
4416            
4417            $self->{state} = NDATA_STATE;
4418            $self->{kwd} = chr $self->{nc};
4419            
4420        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4421          $self->{line_prev} = $self->{line};
4422          $self->{column_prev} = $self->{column};
4423          $self->{column}++;
4424          $self->{nc}
4425              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4426        } else {
4427          $self->{set_nc}->($self);
4428        }
4429      
4430          redo A;          redo A;
4431        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4432                    
4433          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4434          $self->{state} = DATA_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
         $self->{s_kwd} = '';  
4435          ## reconsume          ## reconsume
4436            return  ($self->{ct}); # ENTITY
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4437          redo A;          redo A;
4438        } else {        } else {
4439                    
4440          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4441          #$self->{ct}->{quirks} = 1;          $self->{state} = BOGUS_MD_STATE;
   
         $self->{state} = BOGUS_DOCTYPE_STATE;  
4442                    
4443      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4444        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3582  sub _get_next_token ($) { Line 4472  sub _get_next_token ($) {
4472          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4473    
4474          redo A;          redo A;
4475          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4476            
4477            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4478            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4479            $self->{in_subset} = 1;
4480            
4481        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4482          $self->{line_prev} = $self->{line};
4483          $self->{column_prev} = $self->{column};
4484          $self->{column}++;
4485          $self->{nc}
4486              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4487        } else {
4488          $self->{set_nc}->($self);
4489        }
4490      
4491            return  ($self->{ct}); # DOCTYPE
4492            redo A;
4493        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4494                    
4495          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 3594  sub _get_next_token ($) { Line 4502  sub _get_next_token ($) {
4502        } else {        } else {
4503                    
4504          my $s = '';          my $s = '';
4505          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
4506    
4507          ## Stay in the state          ## Stay in the state
4508                    
# Line 3614  sub _get_next_token ($) { Line 4522  sub _get_next_token ($) {
4522        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
4523        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4524        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
4525    
4526          ## XML5: "CDATA state".
4527                
4528        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4529                    
# Line 3631  sub _get_next_token ($) { Line 4541  sub _get_next_token ($) {
4541        
4542          redo A;          redo A;
4543        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4544            if ($self->{is_xml}) {
4545              
4546              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4547            } else {
4548              
4549            }
4550    
4551          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4552          $self->{s_kwd} = '';          $self->{s_kwd} = '';
4553                    ## Reconsume.
     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {  
       $self->{line_prev} = $self->{line};  
       $self->{column_prev} = $self->{column};  
       $self->{column}++;  
       $self->{nc}  
           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);  
     } else {  
       $self->{set_nc}->($self);  
     }  
     
4554          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
4555                        
4556            return  ($self->{ct}); # character            return  ($self->{ct}); # character
# Line 3676  sub _get_next_token ($) { Line 4583  sub _get_next_token ($) {
4583    
4584        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
4585      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4586          ## XML5: "CDATA bracket state".
4587    
4588        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4589                    
4590          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 3693  sub _get_next_token ($) { Line 4602  sub _get_next_token ($) {
4602          redo A;          redo A;
4603        } else {        } else {
4604                    
4605            ## XML5: If EOF, "]" is not appended and changed to the data state.
4606          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
4607          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4608          ## Reconsume.          ## Reconsume.
4609          redo A;          redo A;
4610        }        }
4611      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4612          ## XML5: "CDATA end state".
4613    
4614        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4615          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4616          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 3741  sub _get_next_token ($) { Line 4653  sub _get_next_token ($) {
4653                    
4654          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
4655          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
4656          ## Reconsume.          ## Reconsume. ## XML5: Emit.
4657          redo A;          redo A;
4658        }        }
4659      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 3750  sub _get_next_token ($) { Line 4662  sub _get_next_token ($) {
4662              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4663              $self->{entity_add} => 1,              $self->{entity_add} => 1,
4664            }->{$self->{nc}}) {            }->{$self->{nc}}) {
4665                    if ($self->{is_xml}) {
4666              
4667              $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4668                              line => $self->{line_prev},
4669                              column => $self->{column_prev}
4670                                  + ($self->{nc} == -1 ? 1 : 0));
4671            } else {
4672              
4673              ## No error
4674            }
4675          ## Don't consume          ## Don't consume
         ## No error  
4676          ## Return nothing.          ## Return nothing.
4677          #          #
4678        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
4679                    
4680          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
4681          $self->{s_kwd} = '#';          $self->{kwd} = '#';
4682                    
4683      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4684        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3771  sub _get_next_token ($) { Line 4691  sub _get_next_token ($) {
4691      }      }
4692        
4693          redo A;          redo A;
4694        } elsif ((0x0041 <= $self->{nc} and        } elsif ($self->{is_xml} or
4695                   (0x0041 <= $self->{nc} and
4696                  $self->{nc} <= 0x005A) or # A..Z                  $self->{nc} <= 0x005A) or # A..Z
4697                 (0x0061 <= $self->{nc} and                 (0x0061 <= $self->{nc} and
4698                  $self->{nc} <= 0x007A)) { # a..z                  $self->{nc} <= 0x007A)) { # a..z
4699                    
4700          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
4701          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
4702          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
4703          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
4704          $self->{entity__match} = 0;          $self->{entity__match} = 0;
4705                    
4706      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3825  sub _get_next_token ($) { Line 4746  sub _get_next_token ($) {
4746          redo A;          redo A;
4747        }        }
4748      } elsif ($self->{state} == ENTITY_HASH_STATE) {      } elsif ($self->{state} == ENTITY_HASH_STATE) {
4749        if ($self->{nc} == 0x0078 or # x        if ($self->{nc} == 0x0078) { # x
           $self->{nc} == 0x0058) { # X  
4750                    
4751          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
4752          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4753            
4754        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4755          $self->{line_prev} = $self->{line};
4756          $self->{column_prev} = $self->{column};
4757          $self->{column}++;
4758          $self->{nc}
4759              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4760        } else {
4761          $self->{set_nc}->($self);
4762        }
4763      
4764            redo A;
4765          } elsif ($self->{nc} == 0x0058) { # X
4766            
4767            if ($self->{is_xml}) {
4768              $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4769            }
4770            $self->{state} = HEXREF_X_STATE;
4771            $self->{kwd} .= chr $self->{nc};
4772                    
4773      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4774        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3846  sub _get_next_token ($) { Line 4785  sub _get_next_token ($) {
4785                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
4786                    
4787          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
4788          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
4789                    
4790      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4791        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3892  sub _get_next_token ($) { Line 4831  sub _get_next_token ($) {
4831        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
4832            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
4833                    
4834          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
4835          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4836                    
4837          ## Stay in the state.          ## Stay in the state.
4838                    
# Line 3929  sub _get_next_token ($) { Line 4868  sub _get_next_token ($) {
4868          #          #
4869        }        }
4870    
4871        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4872        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4873        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4874        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
4875              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4876              ($self->{is_xml} and $code == 0x0000)) {
4877                    
4878          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4879                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 3952  sub _get_next_token ($) { Line 4893  sub _get_next_token ($) {
4893          $self->{s_kwd} = '';          $self->{s_kwd} = '';
4894          ## Reconsume.          ## Reconsume.
4895          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4896                      has_reference => 1,
4897                    line => $l, column => $c,                    line => $l, column => $c,
4898                   });                   });
4899          redo A;          redo A;
# Line 3971  sub _get_next_token ($) { Line 4913  sub _get_next_token ($) {
4913          # 0..9, A..F, a..f          # 0..9, A..F, a..f
4914                    
4915          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
4916          $self->{s_kwd} = 0;          $self->{kwd} = 0;
4917          ## Reconsume.          ## Reconsume.
4918          redo A;          redo A;
4919        } else {        } else {
# Line 3989  sub _get_next_token ($) { Line 4931  sub _get_next_token ($) {
4931            $self->{s_kwd} = '';            $self->{s_kwd} = '';
4932            ## Reconsume.            ## Reconsume.
4933            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4934                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
4935                      line => $self->{line_prev},                      line => $self->{line_prev},
4936                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
4937                     });                     });
4938            redo A;            redo A;
4939          } else {          } else {
4940                        
4941            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
4942            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4943            $self->{s_kwd} = '';            $self->{s_kwd} = '';
4944            ## Reconsume.            ## Reconsume.
# Line 4007  sub _get_next_token ($) { Line 4949  sub _get_next_token ($) {
4949        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4950          # 0..9          # 0..9
4951                    
4952          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4953          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4954          ## Stay in the state.          ## Stay in the state.
4955                    
4956      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4025  sub _get_next_token ($) { Line 4967  sub _get_next_token ($) {
4967        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
4968                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
4969                    
4970          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4971          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
4972          ## Stay in the state.          ## Stay in the state.
4973                    
4974      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4043  sub _get_next_token ($) { Line 4985  sub _get_next_token ($) {
4985        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
4986                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
4987                    
4988          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4989          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
4990          ## Stay in the state.          ## Stay in the state.
4991                    
4992      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4081  sub _get_next_token ($) { Line 5023  sub _get_next_token ($) {
5023          #          #
5024        }        }
5025    
5026        my $code = $self->{s_kwd};        my $code = $self->{kwd};
5027        my $l = $self->{line_prev};        my $l = $self->{line_prev};
5028        my $c = $self->{column_prev};        my $c = $self->{column_prev};
5029        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
5030              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5031              ($self->{is_xml} and $code == 0x0000)) {
5032                    
5033          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5034                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 4104  sub _get_next_token ($) { Line 5048  sub _get_next_token ($) {
5048          $self->{s_kwd} = '';          $self->{s_kwd} = '';
5049          ## Reconsume.          ## Reconsume.
5050          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
5051                      has_reference => 1,
5052                    line => $l, column => $c,                    line => $l, column => $c,
5053                   });                   });
5054          redo A;          redo A;
# Line 4117  sub _get_next_token ($) { Line 5062  sub _get_next_token ($) {
5062          redo A;          redo A;
5063        }        }
5064      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
5065        if (length $self->{s_kwd} < 30 and        if ((0x0041 <= $self->{nc} and # a
5066            ## NOTE: Some number greater than the maximum length of entity name             $self->{nc} <= 0x005A) or # x
5067            ((0x0041 <= $self->{nc} and # a            (0x0061 <= $self->{nc} and # a
5068              $self->{nc} <= 0x005A) or # x             $self->{nc} <= 0x007A) or # z
5069             (0x0061 <= $self->{nc} and # a            (0x0030 <= $self->{nc} and # 0
5070              $self->{nc} <= 0x007A) or # z             $self->{nc} <= 0x0039) or # 9
5071             (0x0030 <= $self->{nc} and # 0            $self->{nc} == 0x003B or # ;
5072              $self->{nc} <= 0x0039) or # 9            ($self->{is_xml} and
5073             $self->{nc} == 0x003B)) { # ;             not ($is_space->{$self->{nc}} or
5074                    {
5075                      0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5076                      $self->{entity_add} => 1,
5077                    }->{$self->{nc}}))) {
5078          our $EntityChar;          our $EntityChar;
5079          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
5080          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}} or
5081                $self->{ge}->{$self->{kwd}}) {
5082            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
5083                            if (defined $self->{ge}->{$self->{kwd}}) {
5084              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};                if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5085                    
5086                    $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5087                  } else {
5088                    if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5089                      
5090                      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5091                                      value => $self->{kwd});
5092                    } else {
5093                      
5094                    }
5095                    $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5096                  }
5097                } else {
5098                  if ($self->{is_xml}) {
5099                    
5100                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5101                                    value => $self->{kwd},
5102                                    level => {
5103                                              'amp;' => $self->{level}->{warn},
5104                                              'quot;' => $self->{level}->{warn},
5105                                              'lt;' => $self->{level}->{warn},
5106                                              'gt;' => $self->{level}->{warn},
5107                                              'apos;' => $self->{level}->{warn},
5108                                             }->{$self->{kwd}} ||
5109                                             $self->{level}->{must});
5110                  } else {
5111                    
5112                  }
5113                  $self->{entity__value} = $EntityChar->{$self->{kwd}};
5114                }
5115              $self->{entity__match} = 1;              $self->{entity__match} = 1;
5116                            
5117      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4147  sub _get_next_token ($) { Line 5127  sub _get_next_token ($) {
5127              #              #
5128            } else {            } else {
5129                            
5130              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
5131              $self->{entity__match} = -1;              $self->{entity__match} = -1;
5132              ## Stay in the state.              ## Stay in the state.
5133                            
# Line 4195  sub _get_next_token ($) { Line 5175  sub _get_next_token ($) {
5175          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
5176              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
5177                        
5178            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
5179            #            #
5180          } else {          } else {
5181                        
# Line 4207  sub _get_next_token ($) { Line 5187  sub _get_next_token ($) {
5187                    
5188          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5189                          line => $self->{line_prev},                          line => $self->{line_prev},
5190                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
5191          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
5192          #          #
5193        }        }
5194        
# Line 4229  sub _get_next_token ($) { Line 5209  sub _get_next_token ($) {
5209          ## Reconsume.          ## Reconsume.
5210          return  ({type => CHARACTER_TOKEN,          return  ({type => CHARACTER_TOKEN,
5211                    data => $data,                    data => $data,
5212                      has_reference => $has_ref,
5213                    line => $self->{line_prev},                    line => $self->{line_prev},
5214                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
5215                   });                   });
5216          redo A;          redo A;
5217        } else {        } else {
# Line 4242  sub _get_next_token ($) { Line 5223  sub _get_next_token ($) {
5223          ## Reconsume.          ## Reconsume.
5224          redo A;          redo A;
5225        }        }
5226    
5227        ## XML-only states
5228    
5229        } elsif ($self->{state} == PI_STATE) {
5230          ## XML5: "Pi state" and "DOCTYPE pi state".
5231    
5232          if ($is_space->{$self->{nc}} or
5233              $self->{nc} == 0x003F or # ?
5234              $self->{nc} == -1) {
5235            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5236            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
5237            ## "DOCTYPE pi state": Parse error, switch to the "data
5238            ## state".
5239            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5240                            line => $self->{line_prev},
5241                            column => $self->{column_prev}
5242                                - 1 * ($self->{nc} != -1));
5243            $self->{state} = BOGUS_COMMENT_STATE;
5244            ## Reconsume.
5245            $self->{ct} = {type => COMMENT_TOKEN,
5246                           data => '?',
5247                           line => $self->{line_prev},
5248                           column => $self->{column_prev}
5249                               - 1 * ($self->{nc} != -1),
5250                          };
5251            redo A;
5252          } else {
5253            ## XML5: "DOCTYPE pi state": Stay in the state.
5254            $self->{ct} = {type => PI_TOKEN,
5255                           target => chr $self->{nc},
5256                           data => '',
5257                           line => $self->{line_prev},
5258                           column => $self->{column_prev} - 1,
5259                          };
5260            $self->{state} = PI_TARGET_STATE;
5261            
5262        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5263          $self->{line_prev} = $self->{line};
5264          $self->{column_prev} = $self->{column};
5265          $self->{column}++;
5266          $self->{nc}
5267              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5268        } else {
5269          $self->{set_nc}->($self);
5270        }
5271      
5272            redo A;
5273          }
5274        } elsif ($self->{state} == PI_TARGET_STATE) {
5275          if ($is_space->{$self->{nc}}) {
5276            $self->{state} = PI_TARGET_AFTER_STATE;
5277            
5278        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5279          $self->{line_prev} = $self->{line};
5280          $self->{column_prev} = $self->{column};
5281          $self->{column}++;
5282          $self->{nc}
5283              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5284        } else {
5285          $self->{set_nc}->($self);
5286        }
5287      
5288            redo A;
5289          } elsif ($self->{nc} == -1) {
5290            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5291            if ($self->{in_subset}) {
5292              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5293            } else {
5294              $self->{state} = DATA_STATE;
5295              $self->{s_kwd} = '';
5296            }
5297            ## Reconsume.
5298            return  ($self->{ct}); # pi
5299            redo A;
5300          } elsif ($self->{nc} == 0x003F) { # ?
5301            $self->{state} = PI_AFTER_STATE;
5302            
5303        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5304          $self->{line_prev} = $self->{line};
5305          $self->{column_prev} = $self->{column};
5306          $self->{column}++;
5307          $self->{nc}
5308              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5309        } else {
5310          $self->{set_nc}->($self);
5311        }
5312      
5313            redo A;
5314          } else {
5315            ## XML5: typo ("tag name" -> "target")
5316            $self->{ct}->{target} .= chr $self->{nc}; # pi
5317            
5318        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5319          $self->{line_prev} = $self->{line};
5320          $self->{column_prev} = $self->{column};
5321          $self->{column}++;
5322          $self->{nc}
5323              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5324        } else {
5325          $self->{set_nc}->($self);
5326        }
5327      
5328            redo A;
5329          }
5330        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5331          if ($is_space->{$self->{nc}}) {
5332            ## Stay in the state.
5333            
5334        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5335          $self->{line_prev} = $self->{line};
5336          $self->{column_prev} = $self->{column};
5337          $self->{column}++;
5338          $self->{nc}
5339              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5340        } else {
5341          $self->{set_nc}->($self);
5342        }
5343      
5344            redo A;
5345          } else {
5346            $self->{state} = PI_DATA_STATE;
5347            ## Reprocess.
5348            redo A;
5349          }
5350        } elsif ($self->{state} == PI_DATA_STATE) {
5351          if ($self->{nc} == 0x003F) { # ?
5352            $self->{state} = PI_DATA_AFTER_STATE;
5353            
5354        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5355          $self->{line_prev} = $self->{line};
5356          $self->{column_prev} = $self->{column};
5357          $self->{column}++;
5358          $self->{nc}
5359              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5360        } else {
5361          $self->{set_nc}->($self);
5362        }
5363      
5364            redo A;
5365          } elsif ($self->{nc} == -1) {
5366            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5367            if ($self->{in_subset}) {
5368              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5369            } else {
5370              $self->{state} = DATA_STATE;
5371              $self->{s_kwd} = '';
5372            }
5373            ## Reprocess.
5374            return  ($self->{ct}); # pi
5375            redo A;
5376          } else {
5377            $self->{ct}->{data} .= chr $self->{nc}; # pi
5378            $self->{read_until}->($self->{ct}->{data}, q[?],
5379                                  length $self->{ct}->{data});
5380            ## Stay in the state.
5381            
5382        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5383          $self->{line_prev} = $self->{line};
5384          $self->{column_prev} = $self->{column};
5385          $self->{column}++;
5386          $self->{nc}
5387              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5388        } else {
5389          $self->{set_nc}->($self);
5390        }
5391      
5392            ## The character just read is handled by this same state.
5393            redo A;
5394          }
5395        } elsif ($self->{state} == PI_AFTER_STATE) {
5396          ## XML5: Part of "Pi after state".
5397    
5398          if ($self->{nc} == 0x003E) { # >
5399            if ($self->{in_subset}) {
5400              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5401            } else {
5402              $self->{state} = DATA_STATE;
5403              $self->{s_kwd} = '';
5404            }
5405            
5406        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5407          $self->{line_prev} = $self->{line};
5408          $self->{column_prev} = $self->{column};
5409          $self->{column}++;
5410          $self->{nc}
5411              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5412        } else {
5413          $self->{set_nc}->($self);
5414        }
5415      
5416            return  ($self->{ct}); # pi
5417            redo A;
5418          } elsif ($self->{nc} == 0x003F) { # ?
5419            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5420                            line => $self->{line_prev},
5421                            column => $self->{column_prev}); ## XML5: no error
5422            $self->{ct}->{data} .= '?';
5423            $self->{state} = PI_DATA_AFTER_STATE;
5424            
5425        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5426          $self->{line_prev} = $self->{line};
5427          $self->{column_prev} = $self->{column};
5428          $self->{column}++;
5429          $self->{nc}
5430              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5431        } else {
5432          $self->{set_nc}->($self);
5433        }
5434      
5435            redo A;
5436          } else {
5437            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5438                            line => $self->{line_prev},
5439                            column => $self->{column_prev}
5440                                + 1 * ($self->{nc} == -1)); ## XML5: no error
5441            $self->{ct}->{data} .= '?'; ## XML5: not appended
5442            $self->{state} = PI_DATA_STATE;
5443            ## Reprocess.
5444            redo A;
5445          }
5446        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5447          ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5448    
5449          if ($self->{nc} == 0x003E) { # >
5450            if ($self->{in_subset}) {
5451              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5452            } else {
5453              $self->{state} = DATA_STATE;
5454              $self->{s_kwd} = '';
5455            }
5456            
5457        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5458          $self->{line_prev} = $self->{line};
5459          $self->{column_prev} = $self->{column};
5460          $self->{column}++;
5461          $self->{nc}
5462              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5463        } else {
5464          $self->{set_nc}->($self);
5465        }
5466      
5467            return  ($self->{ct}); # pi
5468            redo A;
5469          } elsif ($self->{nc} == 0x003F) { # ?
5470            $self->{ct}->{data} .= '?';
5471            ## Stay in the state.
5472            
5473        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5474          $self->{line_prev} = $self->{line};
5475          $self->{column_prev} = $self->{column};
5476          $self->{column}++;
5477          $self->{nc}
5478              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5479        } else {
5480          $self->{set_nc}->($self);
5481        }
5482      
5483            redo A;
5484          } else {
5485            $self->{ct}->{data} .= '?'; ## XML5: not appended
5486            $self->{state} = PI_DATA_STATE;
5487            ## Reprocess.
5488            redo A;
5489          }
5490    
5491        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5492          if ($self->{nc} == 0x003C) { # <
5493            $self->{state} = DOCTYPE_TAG_STATE;
5494            
5495        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5496          $self->{line_prev} = $self->{line};
5497          $self->{column_prev} = $self->{column};
5498          $self->{column}++;
5499          $self->{nc}
5500              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5501        } else {
5502          $self->{set_nc}->($self);
5503        }
5504      
5505            redo A;
5506          } elsif ($self->{nc} == 0x0025) { # %
5507            ## XML5: Not defined yet.
5508    
5509            ## TODO:
5510    
5511            if (not $self->{stop_processing} and
5512                not $self->{document}->xml_standalone) {
5513              $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5514                              level => $self->{level}->{info});
5515              $self->{stop_processing} = 1;
5516            }
5517    
5518            
5519        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5520          $self->{line_prev} = $self->{line};
5521          $self->{column_prev} = $self->{column};
5522          $self->{column}++;
5523          $self->{nc}
5524              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5525        } else {
5526          $self->{set_nc}->($self);
5527        }
5528      
5529            redo A;
5530          } elsif ($self->{nc} == 0x005D) { # ]
5531            delete $self->{in_subset};
5532            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5533            
5534        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5535          $self->{line_prev} = $self->{line};
5536          $self->{column_prev} = $self->{column};
5537          $self->{column}++;
5538          $self->{nc}
5539              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5540        } else {
5541          $self->{set_nc}->($self);
5542        }
5543      
5544            redo A;
5545          } elsif ($is_space->{$self->{nc}}) {
5546            ## Stay in the state.
5547            
5548        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5549          $self->{line_prev} = $self->{line};
5550          $self->{column_prev} = $self->{column};
5551          $self->{column}++;
5552          $self->{nc}
5553              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5554        } else {
5555          $self->{set_nc}->($self);
5556        }
5557      
5558            redo A;
5559          } elsif ($self->{nc} == -1) {
5560            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5561            delete $self->{in_subset};
5562            $self->{state} = DATA_STATE;
5563            $self->{s_kwd} = '';
5564            ## Reconsume.
5565            return  ({type => END_OF_DOCTYPE_TOKEN});
5566            redo A;
5567          } else {
5568            unless ($self->{internal_subset_tainted}) {
5569              ## XML5: No parse error.
5570              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5571              $self->{internal_subset_tainted} = 1;
5572            }
5573            ## Stay in the state.
5574            
5575        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5576          $self->{line_prev} = $self->{line};
5577          $self->{column_prev} = $self->{column};
5578          $self->{column}++;
5579          $self->{nc}
5580              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5581        } else {
5582          $self->{set_nc}->($self);
5583        }
5584      
5585            redo A;
5586          }
5587        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5588          if ($self->{nc} == 0x003E) { # >
5589            $self->{state} = DATA_STATE;
5590            $self->{s_kwd} = '';
5591            
5592        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5593          $self->{line_prev} = $self->{line};
5594          $self->{column_prev} = $self->{column};
5595          $self->{column}++;
5596          $self->{nc}
5597              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5598        } else {
5599          $self->{set_nc}->($self);
5600        }
5601      
5602            return  ({type => END_OF_DOCTYPE_TOKEN});
5603            redo A;
5604          } elsif ($self->{nc} == -1) {
5605            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5606            $self->{state} = DATA_STATE;
5607            $self->{s_kwd} = '';
5608            ## Reconsume.
5609            return  ({type => END_OF_DOCTYPE_TOKEN});
5610            redo A;
5611          } else {
5612            ## XML5: No parse error and stay in the state.
5613            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5614    
5615            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5616            
5617        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5618          $self->{line_prev} = $self->{line};
5619          $self->{column_prev} = $self->{column};
5620          $self->{column}++;
5621          $self->{nc}
5622              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5623        } else {
5624          $self->{set_nc}->($self);
5625        }
5626      
5627            redo A;
5628          }
5629        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5630          if ($self->{nc} == 0x003E) { # >
5631            $self->{state} = DATA_STATE;
5632            $self->{s_kwd} = '';
5633            
5634        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5635          $self->{line_prev} = $self->{line};
5636          $self->{column_prev} = $self->{column};
5637          $self->{column}++;
5638          $self->{nc}
5639              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5640        } else {
5641          $self->{set_nc}->($self);
5642        }
5643      
5644            return  ({type => END_OF_DOCTYPE_TOKEN});
5645            redo A;
5646          } elsif ($self->{nc} == -1) {
5647            $self->{state} = DATA_STATE;
5648            $self->{s_kwd} = '';
5649            ## Reconsume.
5650            return  ({type => END_OF_DOCTYPE_TOKEN});
5651            redo A;
5652          } else {
5653            ## Stay in the state.
5654            
5655        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5656          $self->{line_prev} = $self->{line};
5657          $self->{column_prev} = $self->{column};
5658          $self->{column}++;
5659          $self->{nc}
5660              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5661        } else {
5662          $self->{set_nc}->($self);
5663        }
5664      
5665            redo A;
5666          }
5667        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5668          if ($self->{nc} == 0x0021) { # !
5669            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5670            
5671        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5672          $self->{line_prev} = $self->{line};
5673          $self->{column_prev} = $self->{column};
5674          $self->{column}++;
5675          $self->{nc}
5676              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5677        } else {
5678          $self->{set_nc}->($self);
5679        }
5680      
5681            redo A;
5682          } elsif ($self->{nc} == 0x003F) { # ?
5683            $self->{state} = PI_STATE;
5684            
5685        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5686          $self->{line_prev} = $self->{line};
5687          $self->{column_prev} = $self->{column};
5688          $self->{column}++;
5689          $self->{nc}
5690              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5691        } else {
5692          $self->{set_nc}->($self);
5693        }
5694      
5695            redo A;
5696          } elsif ($self->{nc} == -1) {
5697            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5698            $self->{state} = DATA_STATE;
5699            $self->{s_kwd} = '';
5700            ## Reconsume.
5701            redo A;
5702          } else {
5703            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5704                            line => $self->{line_prev},
5705                            column => $self->{column_prev});
5706            $self->{state} = BOGUS_COMMENT_STATE;
5707            $self->{ct} = {type => COMMENT_TOKEN,
5708                           data => '',
5709                          }; ## NOTE: Will be discarded.
5710            
5711        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5712          $self->{line_prev} = $self->{line};
5713          $self->{column_prev} = $self->{column};
5714          $self->{column}++;
5715          $self->{nc}
5716              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5717        } else {
5718          $self->{set_nc}->($self);
5719        }
5720      
5721            redo A;
5722          }
5723        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5724          ## XML5: "DOCTYPE markup declaration state".
5725          
5726          if ($self->{nc} == 0x002D) { # -
5727            $self->{state} = MD_HYPHEN_STATE;
5728            
5729        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5730          $self->{line_prev} = $self->{line};
5731          $self->{column_prev} = $self->{column};
5732          $self->{column}++;
5733          $self->{nc}
5734              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5735        } else {
5736          $self->{set_nc}->($self);
5737        }
5738      
5739            redo A;
5740          } elsif ($self->{nc} == 0x0045 or # E
5741                   $self->{nc} == 0x0065) { # e
5742            $self->{state} = MD_E_STATE;
5743            $self->{kwd} = chr $self->{nc};
5744            
5745        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5746          $self->{line_prev} = $self->{line};
5747          $self->{column_prev} = $self->{column};
5748          $self->{column}++;
5749          $self->{nc}
5750              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5751        } else {
5752          $self->{set_nc}->($self);
5753        }
5754      
5755            redo A;
5756          } elsif ($self->{nc} == 0x0041 or # A
5757                   $self->{nc} == 0x0061) { # a
5758            $self->{state} = MD_ATTLIST_STATE;
5759            $self->{kwd} = chr $self->{nc};
5760            
5761        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5762          $self->{line_prev} = $self->{line};
5763          $self->{column_prev} = $self->{column};
5764          $self->{column}++;
5765          $self->{nc}
5766              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5767        } else {
5768          $self->{set_nc}->($self);
5769        }
5770      
5771            redo A;
5772          } elsif ($self->{nc} == 0x004E or # N
5773                   $self->{nc} == 0x006E) { # n
5774            $self->{state} = MD_NOTATION_STATE;
5775            $self->{kwd} = chr $self->{nc};
5776            
5777        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5778          $self->{line_prev} = $self->{line};
5779          $self->{column_prev} = $self->{column};
5780          $self->{column}++;
5781          $self->{nc}
5782              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5783        } else {
5784          $self->{set_nc}->($self);
5785        }
5786      
5787            redo A;
5788          } else {
5789            #
5790          }
5791          
5792          ## XML5: No parse error.
5793          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5794                          line => $self->{line_prev},
5795                          column => $self->{column_prev} - 1);
5796          ## Reconsume.
5797          $self->{state} = BOGUS_COMMENT_STATE;
5798          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5799          redo A;
5800        } elsif ($self->{state} == MD_E_STATE) {
5801          if ($self->{nc} == 0x004E or # N
5802              $self->{nc} == 0x006E) { # n
5803            $self->{state} = MD_ENTITY_STATE;
5804            $self->{kwd} .= chr $self->{nc};
5805            
5806        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5807          $self->{line_prev} = $self->{line};
5808          $self->{column_prev} = $self->{column};
5809          $self->{column}++;
5810          $self->{nc}
5811              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5812        } else {
5813          $self->{set_nc}->($self);
5814        }
5815      
5816            redo A;
5817          } elsif ($self->{nc} == 0x004C or # L
5818                   $self->{nc} == 0x006C) { # l
5819            ## XML5: <!ELEMENT> not supported.
5820            $self->{state} = MD_ELEMENT_STATE;
5821            $self->{kwd} .= chr $self->{nc};
5822            
5823        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5824          $self->{line_prev} = $self->{line};
5825          $self->{column_prev} = $self->{column};
5826          $self->{column}++;
5827          $self->{nc}
5828              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5829        } else {
5830          $self->{set_nc}->($self);
5831        }
5832      
5833            redo A;
5834          } else {
5835            ## XML5: No parse error.
5836            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5837                            line => $self->{line_prev},
5838                            column => $self->{column_prev} - 2
5839                                + 1 * ($self->{nc} == -1));
5840            ## Reconsume.
5841            $self->{state} = BOGUS_COMMENT_STATE;
5842            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5843            redo A;
5844          }
5845        } elsif ($self->{state} == MD_ENTITY_STATE) {
5846          if ($self->{nc} == [
5847                undef,
5848                undef,
5849                0x0054, # T
5850                0x0049, # I
5851                0x0054, # T
5852              ]->[length $self->{kwd}] or
5853              $self->{nc} == [
5854                undef,
5855                undef,
5856                0x0074, # t
5857                0x0069, # i
5858                0x0074, # t
5859              ]->[length $self->{kwd}]) {
5860            ## Stay in the state.
5861            $self->{kwd} .= chr $self->{nc};
5862            
5863        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5864          $self->{line_prev} = $self->{line};
5865          $self->{column_prev} = $self->{column};
5866          $self->{column}++;
5867          $self->{nc}
5868              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5869        } else {
5870          $self->{set_nc}->($self);
5871        }
5872      
5873            redo A;
5874          } elsif ((length $self->{kwd}) == 5 and
5875                   ($self->{nc} == 0x0059 or # Y
5876                    $self->{nc} == 0x0079)) { # y
5877            if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5878              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5879                              text => 'ENTITY',
5880                              line => $self->{line_prev},
5881                              column => $self->{column_prev} - 4);
5882            }
5883            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5884                           line => $self->{line_prev},
5885                           column => $self->{column_prev} - 6};
5886            $self->{state} = DOCTYPE_MD_STATE;
5887            
5888        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5889          $self->{line_prev} = $self->{line};
5890          $self->{column_prev} = $self->{column};
5891          $self->{column}++;
5892          $self->{nc}
5893              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5894        } else {
5895          $self->{set_nc}->($self);
5896        }
5897      
5898            redo A;
5899          } else {
5900            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5901                            line => $self->{line_prev},
5902                            column => $self->{column_prev} - 1
5903                                - (length $self->{kwd})
5904                                + 1 * ($self->{nc} == -1));
5905            $self->{state} = BOGUS_COMMENT_STATE;
5906            ## Reconsume.
5907            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5908            redo A;
5909          }
5910        } elsif ($self->{state} == MD_ELEMENT_STATE) {
5911          if ($self->{nc} == [
5912               undef,
5913               undef,
5914               0x0045, # E
5915               0x004D, # M
5916               0x0045, # E
5917               0x004E, # N
5918              ]->[length $self->{kwd}] or
5919              $self->{nc} == [
5920               undef,
5921               undef,
5922               0x0065, # e
5923               0x006D, # m
5924               0x0065, # e
5925               0x006E, # n
5926              ]->[length $self->{kwd}]) {
5927            ## Stay in the state.
5928            $self->{kwd} .= chr $self->{nc};
5929            
5930        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5931          $self->{line_prev} = $self->{line};
5932          $self->{column_prev} = $self->{column};
5933          $self->{column}++;
5934          $self->{nc}
5935              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5936        } else {
5937          $self->{set_nc}->($self);
5938        }
5939      
5940            redo A;
5941          } elsif ((length $self->{kwd}) == 6 and
5942                   ($self->{nc} == 0x0054 or # T
5943                    $self->{nc} == 0x0074)) { # t
5944            if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5945              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5946                              text => 'ELEMENT',
5947                              line => $self->{line_prev},
5948                              column => $self->{column_prev} - 5);
5949            }
5950            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5951                           line => $self->{line_prev},
5952                           column => $self->{column_prev} - 7};
5953            $self->{state} = DOCTYPE_MD_STATE;
5954            
5955        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5956          $self->{line_prev} = $self->{line};
5957          $self->{column_prev} = $self->{column};
5958          $self->{column}++;
5959          $self->{nc}
5960              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5961        } else {
5962          $self->{set_nc}->($self);
5963        }
5964      
5965            redo A;
5966          } else {
5967            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5968                            line => $self->{line_prev},
5969                            column => $self->{column_prev} - 1
5970                                - (length $self->{kwd})
5971                                + 1 * ($self->{nc} == -1));
5972            $self->{state} = BOGUS_COMMENT_STATE;
5973            ## Reconsume.
5974            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5975            redo A;
5976          }
5977        } elsif ($self->{state} == MD_ATTLIST_STATE) {
5978          if ($self->{nc} == [
5979               undef,
5980               0x0054, # T
5981               0x0054, # T
5982               0x004C, # L
5983               0x0049, # I
5984               0x0053, # S
5985              ]->[length $self->{kwd}] or
5986              $self->{nc} == [
5987               undef,
5988               0x0074, # t
5989               0x0074, # t
5990               0x006C, # l
5991               0x0069, # i
5992               0x0073, # s
5993              ]->[length $self->{kwd}]) {
5994            ## Stay in the state.
5995            $self->{kwd} .= chr $self->{nc};
5996            
5997        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5998          $self->{line_prev} = $self->{line};
5999          $self->{column_prev} = $self->{column};
6000          $self->{column}++;
6001          $self->{nc}
6002              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6003        } else {
6004          $self->{set_nc}->($self);
6005        }
6006      
6007            redo A;
6008          } elsif ((length $self->{kwd}) == 6 and
6009                   ($self->{nc} == 0x0054 or # T
6010                    $self->{nc} == 0x0074)) { # t
6011            if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6012              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6013                              text => 'ATTLIST',
6014                              line => $self->{line_prev},
6015                              column => $self->{column_prev} - 5);
6016            }
6017            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6018                           attrdefs => [],
6019                           line => $self->{line_prev},
6020                           column => $self->{column_prev} - 7};
6021            $self->{state} = DOCTYPE_MD_STATE;
6022            
6023        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6024          $self->{line_prev} = $self->{line};
6025          $self->{column_prev} = $self->{column};
6026          $self->{column}++;
6027          $self->{nc}
6028              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6029        } else {
6030          $self->{set_nc}->($self);
6031        }
6032      
6033            redo A;
6034          } else {
6035            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6036                            line => $self->{line_prev},
6037                            column => $self->{column_prev} - 1
6038                                 - (length $self->{kwd})
6039                                 + 1 * ($self->{nc} == -1));
6040            $self->{state} = BOGUS_COMMENT_STATE;
6041            ## Reconsume.
6042            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6043            redo A;
6044          }
6045        } elsif ($self->{state} == MD_NOTATION_STATE) {
6046          if ($self->{nc} == [
6047               undef,
6048               0x004F, # O
6049               0x0054, # T
6050               0x0041, # A
6051               0x0054, # T
6052               0x0049, # I
6053               0x004F, # O
6054              ]->[length $self->{kwd}] or
6055              $self->{nc} == [
6056               undef,
6057               0x006F, # o
6058               0x0074, # t
6059               0x0061, # a
6060               0x0074, # t
6061               0x0069, # i
6062               0x006F, # o
6063              ]->[length $self->{kwd}]) {
6064            ## Stay in the state.
6065            $self->{kwd} .= chr $self->{nc};
6066            
6067        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6068          $self->{line_prev} = $self->{line};
6069          $self->{column_prev} = $self->{column};
6070          $self->{column}++;
6071          $self->{nc}
6072              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6073        } else {
6074          $self->{set_nc}->($self);
6075        }
6076      
6077            redo A;
6078          } elsif ((length $self->{kwd}) == 7 and
6079                   ($self->{nc} == 0x004E or # N
6080                    $self->{nc} == 0x006E)) { # n
6081            if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6082              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6083                              text => 'NOTATION',
6084                              line => $self->{line_prev},
6085                              column => $self->{column_prev} - 6);
6086            }
6087            $self->{ct} = {type => NOTATION_TOKEN, name => '',
6088                           line => $self->{line_prev},
6089                           column => $self->{column_prev} - 8};
6090            $self->{state} = DOCTYPE_MD_STATE;
6091            
6092        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6093          $self->{line_prev} = $self->{line};
6094          $self->{column_prev} = $self->{column};
6095          $self->{column}++;
6096          $self->{nc}
6097              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6098        } else {
6099          $self->{set_nc}->($self);
6100        }
6101      
6102            redo A;
6103          } else {
6104            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6105                            line => $self->{line_prev},
6106                            column => $self->{column_prev} - 1
6107                                - (length $self->{kwd})
6108                                + 1 * ($self->{nc} == -1));
6109            $self->{state} = BOGUS_COMMENT_STATE;
6110            ## Reconsume.
6111            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6112            redo A;
6113          }
6114        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6115          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6116          ## "DOCTYPE NOTATION state".
6117    
6118          if ($is_space->{$self->{nc}}) {
6119            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6120            $self->{state} = BEFORE_MD_NAME_STATE;
6121            
6122        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6123          $self->{line_prev} = $self->{line};
6124          $self->{column_prev} = $self->{column};
6125          $self->{column}++;
6126          $self->{nc}
6127              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6128        } else {
6129          $self->{set_nc}->($self);
6130        }
6131      
6132            redo A;
6133          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6134                   $self->{nc} == 0x0025) { # %
6135            ## XML5: Switch to the "DOCTYPE bogus comment state".
6136            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6137            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6138            
6139        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6140          $self->{line_prev} = $self->{line};
6141          $self->{column_prev} = $self->{column};
6142          $self->{column}++;
6143          $self->{nc}
6144              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6145        } else {
6146          $self->{set_nc}->($self);
6147        }
6148      
6149            redo A;
6150          } elsif ($self->{nc} == -1) {
6151            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6152            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6153            ## Reconsume.
6154            redo A;
6155          } elsif ($self->{nc} == 0x003E) { # >
6156            ## XML5: Switch to the "DOCTYPE bogus comment state".
6157            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6158            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6159            
6160        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6161          $self->{line_prev} = $self->{line};
6162          $self->{column_prev} = $self->{column};
6163          $self->{column}++;
6164          $self->{nc}
6165              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6166        } else {
6167          $self->{set_nc}->($self);
6168        }
6169      
6170            redo A;
6171          } else {
6172            ## XML5: Switch to the "DOCTYPE bogus comment state".
6173            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6174            $self->{state} = BEFORE_MD_NAME_STATE;
6175            redo A;
6176          }
6177        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6178          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6179          ## before state", "DOCTYPE ATTLIST name before state".
6180    
6181          if ($is_space->{$self->{nc}}) {
6182            ## Stay in the state.
6183            
6184        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6185          $self->{line_prev} = $self->{line};
6186          $self->{column_prev} = $self->{column};
6187          $self->{column}++;
6188          $self->{nc}
6189              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6190        } else {
6191          $self->{set_nc}->($self);
6192        }
6193      
6194            redo A;
6195          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6196                   $self->{nc} == 0x0025) { # %
6197            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6198            
6199        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6200          $self->{line_prev} = $self->{line};
6201          $self->{column_prev} = $self->{column};
6202          $self->{column}++;
6203          $self->{nc}
6204              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6205        } else {
6206          $self->{set_nc}->($self);
6207        }
6208      
6209            redo A;
6210          } elsif ($self->{nc} == 0x003E) { # >
6211            ## XML5: Same as "Anything else".
6212            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6213            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6214            
6215        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6216          $self->{line_prev} = $self->{line};
6217          $self->{column_prev} = $self->{column};
6218          $self->{column}++;
6219          $self->{nc}
6220              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6221        } else {
6222          $self->{set_nc}->($self);
6223        }
6224      
6225            redo A;
6226          } elsif ($self->{nc} == -1) {
6227            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6228            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6229            ## Reconsume.
6230            redo A;
6231          } else {
6232            ## XML5: [ATTLIST] Not defined yet.
6233            $self->{ct}->{name} .= chr $self->{nc};
6234            $self->{state} = MD_NAME_STATE;
6235            
6236        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6237          $self->{line_prev} = $self->{line};
6238          $self->{column_prev} = $self->{column};
6239          $self->{column}++;
6240          $self->{nc}
6241              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6242        } else {
6243          $self->{set_nc}->($self);
6244        }
6245      
6246            redo A;
6247          }
6248        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6249          if ($is_space->{$self->{nc}}) {
6250            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6251            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6252            $self->{state} = BEFORE_MD_NAME_STATE;
6253            
6254        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6255          $self->{line_prev} = $self->{line};
6256          $self->{column_prev} = $self->{column};
6257          $self->{column}++;
6258          $self->{nc}
6259              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6260        } else {
6261          $self->{set_nc}->($self);
6262        }
6263      
6264            redo A;
6265          } elsif ($self->{nc} == 0x003E) { # >
6266            ## XML5: Same as "Anything else".
6267            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6268            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6269            
6270        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6271          $self->{line_prev} = $self->{line};
6272          $self->{column_prev} = $self->{column};
6273          $self->{column}++;
6274          $self->{nc}
6275              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6276        } else {
6277          $self->{set_nc}->($self);
6278        }
6279      
6280            redo A;
6281          } elsif ($self->{nc} == -1) {
6282            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6283            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6284            ## Reconsume.
6285            redo A;
6286          } else {
6287            ## XML5: No parse error.
6288            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6289            $self->{state} = BOGUS_COMMENT_STATE;
6290            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6291            ## Reconsume.
6292            redo A;
6293          }
6294        } elsif ($self->{state} == MD_NAME_STATE) {
6295          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6296          
6297          if ($is_space->{$self->{nc}}) {
6298            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6299              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6300            } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6301              $self->{state} = AFTER_ELEMENT_NAME_STATE;
6302            } else { # ENTITY/NOTATION
6303              $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6304            }
6305            
6306        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6307          $self->{line_prev} = $self->{line};
6308          $self->{column_prev} = $self->{column};
6309          $self->{column}++;
6310          $self->{nc}
6311              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6312        } else {
6313          $self->{set_nc}->($self);
6314        }
6315      
6316            redo A;
6317          } elsif ($self->{nc} == 0x003E) { # >
6318            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6319              #
6320            } else {
6321              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6322            }
6323            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6324            
6325        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6326          $self->{line_prev} = $self->{line};
6327          $self->{column_prev} = $self->{column};
6328          $self->{column}++;
6329          $self->{nc}
6330              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6331        } else {
6332          $self->{set_nc}->($self);
6333        }
6334      
6335            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6336            redo A;
6337          } elsif ($self->{nc} == -1) {
6338            ## XML5: [ATTLIST] No parse error.
6339            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6340            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6341            ## Reconsume.
6342            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6343            redo A;
6344          } else {
6345            ## XML5: [ATTLIST] Not defined yet.
6346            $self->{ct}->{name} .= chr $self->{nc};
6347            ## Stay in the state.
6348            
6349        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6350          $self->{line_prev} = $self->{line};
6351          $self->{column_prev} = $self->{column};
6352          $self->{column}++;
6353          $self->{nc}
6354              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6355        } else {
6356          $self->{set_nc}->($self);
6357        }
6358      
6359            redo A;
6360          }
6361        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6362          if ($is_space->{$self->{nc}}) {
6363            ## Stay in the state.
6364            
6365        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6366          $self->{line_prev} = $self->{line};
6367          $self->{column_prev} = $self->{column};
6368          $self->{column}++;
6369          $self->{nc}
6370              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6371        } else {
6372          $self->{set_nc}->($self);
6373        }
6374      
6375            redo A;
6376          } elsif ($self->{nc} == 0x003E) { # >
6377            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6378            
6379        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6380          $self->{line_prev} = $self->{line};
6381          $self->{column_prev} = $self->{column};
6382          $self->{column}++;
6383          $self->{nc}
6384              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6385        } else {
6386          $self->{set_nc}->($self);
6387        }
6388      
6389            return  ($self->{ct}); # ATTLIST
6390            redo A;
6391          } elsif ($self->{nc} == -1) {
6392            ## XML5: No parse error.
6393            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6394            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6395            return  ($self->{ct});
6396            redo A;
6397          } else {
6398            ## XML5: Not defined yet.
6399            $self->{ca} = {name => chr ($self->{nc}), # attrdef
6400                           tokens => [],
6401                           line => $self->{line}, column => $self->{column}};
6402            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6403            
6404        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6405          $self->{line_prev} = $self->{line};
6406          $self->{column_prev} = $self->{column};
6407          $self->{column}++;
6408          $self->{nc}
6409              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6410        } else {
6411          $self->{set_nc}->($self);
6412        }
6413      
6414            redo A;
6415          }
6416        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6417          if ($is_space->{$self->{nc}}) {
6418            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6419            
6420        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6421          $self->{line_prev} = $self->{line};
6422          $self->{column_prev} = $self->{column};
6423          $self->{column}++;
6424          $self->{nc}
6425              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6426        } else {
6427          $self->{set_nc}->($self);
6428        }
6429      
6430            redo A;
6431          } elsif ($self->{nc} == 0x003E) { # >
6432            ## XML5: Same as "anything else".
6433            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6434            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6435            
6436        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6437          $self->{line_prev} = $self->{line};
6438          $self->{column_prev} = $self->{column};
6439          $self->{column}++;
6440          $self->{nc}
6441              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6442        } else {
6443          $self->{set_nc}->($self);
6444        }
6445      
6446            return  ($self->{ct}); # ATTLIST
6447            redo A;
6448          } elsif ($self->{nc} == 0x0028) { # (
6449            ## XML5: Same as "anything else".
6450            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6451            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6452            
6453        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6454          $self->{line_prev} = $self->{line};
6455          $self->{column_prev} = $self->{column};
6456          $self->{column}++;
6457          $self->{nc}
6458              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6459        } else {
6460          $self->{set_nc}->($self);
6461        }
6462      
6463            redo A;
6464          } elsif ($self->{nc} == -1) {
6465            ## XML5: No parse error.
6466            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6467            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6468            
6469        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6470          $self->{line_prev} = $self->{line};
6471          $self->{column_prev} = $self->{column};
6472          $self->{column}++;
6473          $self->{nc}
6474              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6475        } else {
6476          $self->{set_nc}->($self);
6477        }
6478      
6479            return  ($self->{ct}); # ATTLIST
6480            redo A;
6481          } else {
6482            ## XML5: Not defined yet.
6483            $self->{ca}->{name} .= chr $self->{nc};
6484            ## Stay in the state.
6485            
6486        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6487          $self->{line_prev} = $self->{line};
6488          $self->{column_prev} = $self->{column};
6489          $self->{column}++;
6490          $self->{nc}
6491              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6492        } else {
6493          $self->{set_nc}->($self);
6494        }
6495      
6496            redo A;
6497          }
6498        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6499          if ($is_space->{$self->{nc}}) {
6500            ## Stay in the state.
6501            
6502        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6503          $self->{line_prev} = $self->{line};
6504          $self->{column_prev} = $self->{column};
6505          $self->{column}++;
6506          $self->{nc}
6507              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6508        } else {
6509          $self->{set_nc}->($self);
6510        }
6511      
6512            redo A;
6513          } elsif ($self->{nc} == 0x003E) { # >
6514            ## XML5: Same as "anything else".
6515            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6516            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6517            
6518        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6519          $self->{line_prev} = $self->{line};
6520          $self->{column_prev} = $self->{column};
6521          $self->{column}++;
6522          $self->{nc}
6523              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6524        } else {
6525          $self->{set_nc}->($self);
6526        }
6527      
6528            return  ($self->{ct}); # ATTLIST
6529            redo A;
6530          } elsif ($self->{nc} == 0x0028) { # (
6531            ## XML5: Same as "anything else".
6532            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6533            
6534        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6535          $self->{line_prev} = $self->{line};
6536          $self->{column_prev} = $self->{column};
6537          $self->{column}++;
6538          $self->{nc}
6539              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6540        } else {
6541          $self->{set_nc}->($self);
6542        }
6543      
6544            redo A;
6545          } elsif ($self->{nc} == -1) {
6546            ## XML5: No parse error.
6547            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6548            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6549            
6550        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6551          $self->{line_prev} = $self->{line};
6552          $self->{column_prev} = $self->{column};
6553          $self->{column}++;
6554          $self->{nc}
6555              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6556        } else {
6557          $self->{set_nc}->($self);
6558        }
6559      
6560            return  ($self->{ct});
6561            redo A;
6562          } else {
6563            ## XML5: Not defined yet.
6564            $self->{ca}->{type} = chr $self->{nc};
6565            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6566            
6567        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6568          $self->{line_prev} = $self->{line};
6569          $self->{column_prev} = $self->{column};
6570          $self->{column}++;
6571          $self->{nc}
6572              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6573        } else {
6574          $self->{set_nc}->($self);
6575        }
6576      
6577            redo A;
6578          }
6579        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6580          if ($is_space->{$self->{nc}}) {
6581            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6582            
6583        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6584          $self->{line_prev} = $self->{line};
6585          $self->{column_prev} = $self->{column};
6586          $self->{column}++;
6587          $self->{nc}
6588              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6589        } else {
6590          $self->{set_nc}->($self);
6591        }
6592      
6593            redo A;
6594          } elsif ($self->{nc} == 0x0023) { # #
6595            ## XML5: Same as "anything else".
6596            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6597            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6598            
6599        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6600          $self->{line_prev} = $self->{line};
6601          $self->{column_prev} = $self->{column};
6602          $self->{column}++;
6603          $self->{nc}
6604              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6605        } else {
6606          $self->{set_nc}->($self);
6607        }
6608      
6609            redo A;
6610          } elsif ($self->{nc} == 0x0022) { # "
6611            ## XML5: Same as "anything else".
6612            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6613            $self->{ca}->{value} = '';
6614            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6615            
6616        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6617          $self->{line_prev} = $self->{line};
6618          $self->{column_prev} = $self->{column};
6619          $self->{column}++;
6620          $self->{nc}
6621              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6622        } else {
6623          $self->{set_nc}->($self);
6624        }
6625      
6626            redo A;
6627          } elsif ($self->{nc} == 0x0027) { # '
6628            ## XML5: Same as "anything else".
6629            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6630            $self->{ca}->{value} = '';
6631            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6632            
6633        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6634          $self->{line_prev} = $self->{line};
6635          $self->{column_prev} = $self->{column};
6636          $self->{column}++;
6637          $self->{nc}
6638              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6639        } else {
6640          $self->{set_nc}->($self);
6641        }
6642      
6643            redo A;
6644          } elsif ($self->{nc} == 0x003E) { # >
6645            ## XML5: Same as "anything else".
6646            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6647            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6648            
6649        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6650          $self->{line_prev} = $self->{line};
6651          $self->{column_prev} = $self->{column};
6652          $self->{column}++;
6653          $self->{nc}
6654              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6655        } else {
6656          $self->{set_nc}->($self);
6657        }
6658      
6659            return  ($self->{ct}); # ATTLIST
6660            redo A;
6661          } elsif ($self->{nc} == 0x0028) { # (
6662            ## XML5: Same as "anything else".
6663            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6664            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6665            
6666        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6667          $self->{line_prev} = $self->{line};
6668          $self->{column_prev} = $self->{column};
6669          $self->{column}++;
6670          $self->{nc}
6671              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6672        } else {
6673          $self->{set_nc}->($self);
6674        }
6675      
6676            redo A;
6677          } elsif ($self->{nc} == -1) {
6678            ## XML5: No parse error.
6679            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6680            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6681            
6682        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6683          $self->{line_prev} = $self->{line};
6684          $self->{column_prev} = $self->{column};
6685          $self->{column}++;
6686          $self->{nc}
6687              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6688        } else {
6689          $self->{set_nc}->($self);
6690        }
6691      
6692            return  ($self->{ct});
6693            redo A;
6694          } else {
6695            ## XML5: Not defined yet.
6696            $self->{ca}->{type} .= chr $self->{nc};
6697            ## Stay in the state.
6698            
6699        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6700          $self->{line_prev} = $self->{line};
6701          $self->{column_prev} = $self->{column};
6702          $self->{column}++;
6703          $self->{nc}
6704              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6705        } else {
6706          $self->{set_nc}->($self);
6707        }
6708      
6709            redo A;
6710          }
6711        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6712          if ($is_space->{$self->{nc}}) {
6713            ## Stay in the state.
6714            
6715        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6716          $self->{line_prev} = $self->{line};
6717          $self->{column_prev} = $self->{column};
6718          $self->{column}++;
6719          $self->{nc}
6720              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6721        } else {
6722          $self->{set_nc}->($self);
6723        }
6724      
6725            redo A;
6726          } elsif ($self->{nc} == 0x0028) { # (
6727            ## XML5: Same as "anything else".
6728            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6729            
6730        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6731          $self->{line_prev} = $self->{line};
6732          $self->{column_prev} = $self->{column};
6733          $self->{column}++;
6734          $self->{nc}
6735              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6736        } else {
6737          $self->{set_nc}->($self);
6738        }
6739      
6740            redo A;
6741          } elsif ($self->{nc} == 0x0023) { # #
6742            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6743            
6744        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6745          $self->{line_prev} = $self->{line};
6746          $self->{column_prev} = $self->{column};
6747          $self->{column}++;
6748          $self->{nc}
6749              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6750        } else {
6751          $self->{set_nc}->($self);
6752        }
6753      
6754            redo A;
6755          } elsif ($self->{nc} == 0x0022) { # "
6756            ## XML5: Same as "anything else".
6757            $self->{ca}->{value} = '';
6758            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6759            
6760        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6761          $self->{line_prev} = $self->{line};
6762          $self->{column_prev} = $self->{column};
6763          $self->{column}++;
6764          $self->{nc}
6765              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6766        } else {
6767          $self->{set_nc}->($self);
6768        }
6769      
6770            redo A;
6771          } elsif ($self->{nc} == 0x0027) { # '
6772            ## XML5: Same as "anything else".
6773            $self->{ca}->{value} = '';
6774            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6775            
6776        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6777          $self->{line_prev} = $self->{line};
6778          $self->{column_prev} = $self->{column};
6779          $self->{column}++;
6780          $self->{nc}
6781              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6782        } else {
6783          $self->{set_nc}->($self);
6784        }
6785      
6786            redo A;
6787          } elsif ($self->{nc} == 0x003E) { # >
6788            ## XML5: Same as "anything else".
6789            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6790            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6791            
6792        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6793          $self->{line_prev} = $self->{line};
6794          $self->{column_prev} = $self->{column};
6795          $self->{column}++;
6796          $self->{nc}
6797              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6798        } else {
6799          $self->{set_nc}->($self);
6800        }
6801      
6802            return  ($self->{ct}); # ATTLIST
6803            redo A;
6804          } elsif ($self->{nc} == -1) {
6805            ## XML5: No parse error.
6806            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6807            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6808            
6809        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6810          $self->{line_prev} = $self->{line};
6811          $self->{column_prev} = $self->{column};
6812          $self->{column}++;
6813          $self->{nc}
6814              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6815        } else {
6816          $self->{set_nc}->($self);
6817        }
6818      
6819            return  ($self->{ct});
6820            redo A;
6821          } else {
6822            ## XML5: Switch to the "DOCTYPE bogus comment state".
6823            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6824            $self->{ca}->{value} = '';
6825            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6826            ## Reconsume.
6827            redo A;
6828          }
6829        } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6830          if ($is_space->{$self->{nc}}) {
6831            ## Stay in the state.
6832            
6833        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6834          $self->{line_prev} = $self->{line};
6835          $self->{column_prev} = $self->{column};
6836          $self->{column}++;
6837          $self->{nc}
6838              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6839        } else {
6840          $self->{set_nc}->($self);
6841        }
6842      
6843            redo A;
6844          } elsif ($self->{nc} == 0x007C) { # |
6845            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6846            ## Stay in the state.
6847            
6848        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6849          $self->{line_prev} = $self->{line};
6850          $self->{column_prev} = $self->{column};
6851          $self->{column}++;
6852          $self->{nc}
6853              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6854        } else {
6855          $self->{set_nc}->($self);
6856        }
6857      
6858            redo A;
6859          } elsif ($self->{nc} == 0x0029) { # )
6860            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6861            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6862            
6863        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6864          $self->{line_prev} = $self->{line};
6865          $self->{column_prev} = $self->{column};
6866          $self->{column}++;
6867          $self->{nc}
6868              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6869        } else {
6870          $self->{set_nc}->($self);
6871        }
6872      
6873            redo A;
6874          } elsif ($self->{nc} == 0x003E) { # >
6875            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6876            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6877            
6878        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6879          $self->{line_prev} = $self->{line};
6880          $self->{column_prev} = $self->{column};
6881          $self->{column}++;
6882          $self->{nc}
6883              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6884        } else {
6885          $self->{set_nc}->($self);
6886        }
6887      
6888            return  ($self->{ct}); # ATTLIST
6889            redo A;
6890          } elsif ($self->{nc} == -1) {
6891            ## XML5: No parse error.
6892            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6893            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6894            
6895        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6896          $self->{line_prev} = $self->{line};
6897          $self->{column_prev} = $self->{column};
6898          $self->{column}++;
6899          $self->{nc}
6900              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6901        } else {
6902          $self->{set_nc}->($self);
6903        }
6904      
6905            return  ($self->{ct});
6906            redo A;
6907          } else {
6908            push @{$self->{ca}->{tokens}}, chr $self->{nc};
6909            $self->{state} = ALLOWED_TOKEN_STATE;
6910            
6911        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6912          $self->{line_prev} = $self->{line};
6913          $self->{column_prev} = $self->{column};
6914          $self->{column}++;
6915          $self->{nc}
6916              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6917        } else {
6918          $self->{set_nc}->($self);
6919        }
6920      
6921            redo A;
6922          }
6923        } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6924          if ($is_space->{$self->{nc}}) {
6925            $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6926            
6927        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6928          $self->{line_prev} = $self->{line};
6929          $self->{column_prev} = $self->{column};
6930          $self->{column}++;
6931          $self->{nc}
6932              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6933        } else {
6934          $self->{set_nc}->($self);
6935        }
6936      
6937            redo A;
6938          } elsif ($self->{nc} == 0x007C) { # |
6939            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6940            
6941        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6942          $self->{line_prev} = $self->{line};
6943          $self->{column_prev} = $self->{column};
6944          $self->{column}++;
6945          $self->{nc}
6946              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6947        } else {
6948          $self->{set_nc}->($self);
6949        }
6950      
6951            redo A;
6952          } elsif ($self->{nc} == 0x0029) { # )
6953            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6954            
6955        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6956          $self->{line_prev} = $self->{line};
6957          $self->{column_prev} = $self->{column};
6958          $self->{column}++;
6959          $self->{nc}
6960              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6961        } else {
6962          $self->{set_nc}->($self);
6963        }
6964      
6965            redo A;
6966          } elsif ($self->{nc} == 0x003E) { # >
6967            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6968            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6969            
6970        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6971          $self->{line_prev} = $self->{line};
6972          $self->{column_prev} = $self->{column};
6973          $self->{column}++;
6974          $self->{nc}
6975              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6976        } else {
6977          $self->{set_nc}->($self);
6978        }
6979      
6980            return  ($self->{ct}); # ATTLIST
6981            redo A;
6982          } elsif ($self->{nc} == -1) {
6983            ## XML5: No parse error.
6984            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6985            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6986            
6987        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6988          $self->{line_prev} = $self->{line};
6989          $self->{column_prev} = $self->{column};
6990          $self->{column}++;
6991          $self->{nc}
6992              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6993        } else {
6994          $self->{set_nc}->($self);
6995        }
6996      
6997            return  ($self->{ct});
6998            redo A;
6999          } else {
7000            $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7001            ## Stay in the state.
7002            
7003        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7004          $self->{line_prev} = $self->{line};
7005          $self->{column_prev} = $self->{column};
7006          $self->{column}++;
7007          $self->{nc}
7008              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7009        } else {
7010          $self->{set_nc}->($self);
7011        }
7012      
7013            redo A;
7014          }
7015        } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7016          if ($is_space->{$self->{nc}}) {
7017            ## Stay in the state.
7018            
7019        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7020          $self->{line_prev} = $self->{line};
7021          $self->{column_prev} = $self->{column};
7022          $self->{column}++;
7023          $self->{nc}
7024              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7025        } else {
7026          $self->{set_nc}->($self);
7027        }
7028      
7029            redo A;
7030          } elsif ($self->{nc} == 0x007C) { # |
7031            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7032            
7033        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7034          $self->{line_prev} = $self->{line};
7035          $self->{column_prev} = $self->{column};
7036          $self->{column}++;
7037          $self->{nc}
7038              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7039        } else {
7040          $self->{set_nc}->($self);
7041        }
7042      
7043            redo A;
7044          } elsif ($self->{nc} == 0x0029) { # )
7045            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7046            
7047        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7048          $self->{line_prev} = $self->{line};
7049          $self->{column_prev} = $self->{column};
7050          $self->{column}++;
7051          $self->{nc}
7052              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7053        } else {
7054          $self->{set_nc}->($self);
7055        }
7056      
7057            redo A;
7058          } elsif ($self->{nc} == 0x003E) { # >
7059            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7060            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7061            
7062        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7063          $self->{line_prev} = $self->{line};
7064          $self->{column_prev} = $self->{column};
7065          $self->{column}++;
7066          $self->{nc}
7067              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7068        } else {
7069          $self->{set_nc}->($self);
7070        }
7071      
7072            return  ($self->{ct}); # ATTLIST
7073            redo A;
7074          } elsif ($self->{nc} == -1) {
7075            ## XML5: No parse error.
7076            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7077            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7078            
7079        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7080          $self->{line_prev} = $self->{line};
7081          $self->{column_prev} = $self->{column};
7082          $self->{column}++;
7083          $self->{nc}
7084              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7085        } else {
7086          $self->{set_nc}->($self);
7087        }
7088      
7089            return  ($self->{ct});
7090            redo A;
7091          } else {
7092            $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7093                            line => $self->{line_prev},
7094                            column => $self->{column_prev});
7095            $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7096            $self->{state} = ALLOWED_TOKEN_STATE;
7097            
7098        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7099          $self->{line_prev} = $self->{line};
7100          $self->{column_prev} = $self->{column};
7101          $self->{column}++;
7102          $self->{nc}
7103              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7104        } else {
7105          $self->{set_nc}->($self);
7106        }
7107      
7108            redo A;
7109          }
7110        } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7111          if ($is_space->{$self->{nc}}) {
7112            $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7113            
7114        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7115          $self->{line_prev} = $self->{line};
7116          $self->{column_prev} = $self->{column};
7117          $self->{column}++;
7118          $self->{nc}
7119              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7120        } else {
7121          $self->{set_nc}->($self);
7122        }
7123      
7124            redo A;
7125          } elsif ($self->{nc} == 0x0023) { # #
7126            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7127            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7128            
7129        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7130          $self->{line_prev} = $self->{line};
7131          $self->{column_prev} = $self->{column};
7132          $self->{column}++;
7133          $self->{nc}
7134              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7135        } else {
7136          $self->{set_nc}->($self);
7137        }
7138      
7139            redo A;
7140          } elsif ($self->{nc} == 0x0022) { # "
7141            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7142            $self->{ca}->{value} = '';
7143            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7144            
7145        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7146          $self->{line_prev} = $self->{line};
7147          $self->{column_prev} = $self->{column};
7148          $self->{column}++;
7149          $self->{nc}
7150              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7151        } else {
7152          $self->{set_nc}->($self);
7153        }
7154      
7155            redo A;
7156          } elsif ($self->{nc} == 0x0027) { # '
7157            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7158            $self->{ca}->{value} = '';
7159            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7160            
7161        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7162          $self->{line_prev} = $self->{line};
7163          $self->{column_prev} = $self->{column};
7164          $self->{column}++;
7165          $self->{nc}
7166              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7167        } else {
7168          $self->{set_nc}->($self);
7169        }
7170      
7171            redo A;
7172          } elsif ($self->{nc} == 0x003E) { # >
7173            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7174            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7175            
7176        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7177          $self->{line_prev} = $self->{line};
7178          $self->{column_prev} = $self->{column};
7179          $self->{column}++;
7180          $self->{nc}
7181              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7182        } else {
7183          $self->{set_nc}->($self);
7184        }
7185      
7186            return  ($self->{ct}); # ATTLIST
7187            redo A;
7188          } elsif ($self->{nc} == -1) {
7189            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7190            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7191            
7192        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7193          $self->{line_prev} = $self->{line};
7194          $self->{column_prev} = $self->{column};
7195          $self->{column}++;
7196          $self->{nc}
7197              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7198        } else {
7199          $self->{set_nc}->($self);
7200        }
7201      
7202            return  ($self->{ct});
7203            redo A;
7204          } else {
7205            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7206            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7207            ## Reconsume.
7208            redo A;
7209          }
7210        } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7211          if ($is_space->{$self->{nc}}) {
7212            ## Stay in the state.
7213            
7214        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7215          $self->{line_prev} = $self->{line};
7216          $self->{column_prev} = $self->{column};
7217          $self->{column}++;
7218          $self->{nc}
7219              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7220        } else {
7221          $self->{set_nc}->($self);
7222        }
7223      
7224            redo A;
7225          } elsif ($self->{nc} == 0x0023) { # #
7226            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7227            
7228        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7229          $self->{line_prev} = $self->{line};
7230          $self->{column_prev} = $self->{column};
7231          $self->{column}++;
7232          $self->{nc}
7233              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7234        } else {
7235          $self->{set_nc}->($self);
7236        }
7237      
7238            redo A;
7239          } elsif ($self->{nc} == 0x0022) { # "
7240            $self->{ca}->{value} = '';
7241            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7242            
7243        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7244          $self->{line_prev} = $self->{line};
7245          $self->{column_prev} = $self->{column};
7246          $self->{column}++;
7247          $self->{nc}
7248              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7249        } else {
7250          $self->{set_nc}->($self);
7251        }
7252      
7253            redo A;
7254          } elsif ($self->{nc} == 0x0027) { # '
7255            $self->{ca}->{value} = '';
7256            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7257            
7258        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7259          $self->{line_prev} = $self->{line};
7260          $self->{column_prev} = $self->{column};
7261          $self->{column}++;
7262          $self->{nc}
7263              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7264        } else {
7265          $self->{set_nc}->($self);
7266        }
7267      
7268            redo A;
7269          } elsif ($self->{nc} == 0x003E) { # >
7270            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7271            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7272            
7273        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7274          $self->{line_prev} = $self->{line};
7275          $self->{column_prev} = $self->{column};
7276          $self->{column}++;
7277          $self->{nc}
7278              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7279        } else {
7280          $self->{set_nc}->($self);
7281        }
7282      
7283            return  ($self->{ct}); # ATTLIST
7284            redo A;
7285          } elsif ($self->{nc} == -1) {
7286            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7287            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7288            
7289        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7290          $self->{line_prev} = $self->{line};
7291          $self->{column_prev} = $self->{column};
7292          $self->{column}++;
7293          $self->{nc}
7294              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7295        } else {
7296          $self->{set_nc}->($self);
7297        }
7298      
7299            return  ($self->{ct});
7300            redo A;
7301          } else {
7302            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7303            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7304            ## Reconsume.
7305            redo A;
7306          }
7307        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7308          if ($is_space->{$self->{nc}}) {
7309            ## XML5: No parse error.
7310            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7311            $self->{state} = BOGUS_MD_STATE;
7312            ## Reconsume.
7313            redo A;
7314          } elsif ($self->{nc} == 0x0022) { # "
7315            ## XML5: Same as "anything else".
7316            $self->{ca}->{value} = '';
7317            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7318            
7319        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7320          $self->{line_prev} = $self->{line};
7321          $self->{column_prev} = $self->{column};
7322          $self->{column}++;
7323          $self->{nc}
7324              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7325        } else {
7326          $self->{set_nc}->($self);
7327        }
7328      
7329            redo A;
7330          } elsif ($self->{nc} == 0x0027) { # '
7331            ## XML5: Same as "anything else".
7332            $self->{ca}->{value} = '';
7333            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7334            
7335        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7336          $self->{line_prev} = $self->{line};
7337          $self->{column_prev} = $self->{column};
7338          $self->{column}++;
7339          $self->{nc}
7340              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7341        } else {
7342          $self->{set_nc}->($self);
7343        }
7344      
7345            redo A;
7346          } elsif ($self->{nc} == 0x003E) { # >
7347            ## XML5: Same as "anything else".
7348            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7349            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7350            
7351        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7352          $self->{line_prev} = $self->{line};
7353          $self->{column_prev} = $self->{column};
7354          $self->{column}++;
7355          $self->{nc}
7356              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7357        } else {
7358          $self->{set_nc}->($self);
7359        }
7360      
7361            return  ($self->{ct}); # ATTLIST
7362            redo A;
7363          } elsif ($self->{nc} == -1) {
7364            ## XML5: No parse error.
7365            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7366            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7367            
7368        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7369          $self->{line_prev} = $self->{line};
7370          $self->{column_prev} = $self->{column};
7371          $self->{column}++;
7372          $self->{nc}
7373              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7374        } else {
7375          $self->{set_nc}->($self);
7376        }
7377      
7378            return  ($self->{ct});
7379            redo A;
7380          } else {
7381            $self->{ca}->{default} = chr $self->{nc};
7382            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7383            
7384        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7385          $self->{line_prev} = $self->{line};
7386          $self->{column_prev} = $self->{column};
7387          $self->{column}++;
7388          $self->{nc}
7389              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7390        } else {
7391          $self->{set_nc}->($self);
7392        }
7393      
7394            redo A;
7395          }
7396        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7397          if ($is_space->{$self->{nc}}) {
7398            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7399            
7400        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7401          $self->{line_prev} = $self->{line};
7402          $self->{column_prev} = $self->{column};
7403          $self->{column}++;
7404          $self->{nc}
7405              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7406        } else {
7407          $self->{set_nc}->($self);
7408        }
7409      
7410            redo A;
7411          } elsif ($self->{nc} == 0x0022) { # "
7412            ## XML5: Same as "anything else".
7413            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7414            $self->{ca}->{value} = '';
7415            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7416            
7417        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7418          $self->{line_prev} = $self->{line};
7419          $self->{column_prev} = $self->{column};
7420          $self->{column}++;
7421          $self->{nc}
7422              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7423        } else {
7424          $self->{set_nc}->($self);
7425        }
7426      
7427            redo A;
7428          } elsif ($self->{nc} == 0x0027) { # '
7429            ## XML5: Same as "anything else".
7430            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7431            $self->{ca}->{value} = '';
7432            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7433            
7434        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7435          $self->{line_prev} = $self->{line};
7436          $self->{column_prev} = $self->{column};
7437          $self->{column}++;
7438          $self->{nc}
7439              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7440        } else {
7441          $self->{set_nc}->($self);
7442        }
7443      
7444            redo A;
7445          } elsif ($self->{nc} == 0x003E) { # >
7446            ## XML5: Same as "anything else".
7447            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7448            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7449            
7450        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7451          $self->{line_prev} = $self->{line};
7452          $self->{column_prev} = $self->{column};
7453          $self->{column}++;
7454          $self->{nc}
7455              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7456        } else {
7457          $self->{set_nc}->($self);
7458        }
7459      
7460            return  ($self->{ct}); # ATTLIST
7461            redo A;
7462          } elsif ($self->{nc} == -1) {
7463            ## XML5: No parse error.
7464            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7465            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7466            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7467            
7468        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7469          $self->{line_prev} = $self->{line};
7470          $self->{column_prev} = $self->{column};
7471          $self->{column}++;
7472          $self->{nc}
7473              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7474        } else {
7475          $self->{set_nc}->($self);
7476        }
7477      
7478            return  ($self->{ct});
7479            redo A;
7480          } else {
7481            $self->{ca}->{default} .= chr $self->{nc};
7482            ## Stay in the state.
7483            
7484        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7485          $self->{line_prev} = $self->{line};
7486          $self->{column_prev} = $self->{column};
7487          $self->{column}++;
7488          $self->{nc}
7489              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7490        } else {
7491          $self->{set_nc}->($self);
7492        }
7493      
7494            redo A;
7495          }
7496        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7497          if ($is_space->{$self->{nc}}) {
7498            ## Stay in the state.
7499            
7500        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7501          $self->{line_prev} = $self->{line};
7502          $self->{column_prev} = $self->{column};
7503          $self->{column}++;
7504          $self->{nc}
7505              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7506        } else {
7507          $self->{set_nc}->($self);
7508        }
7509      
7510            redo A;
7511          } elsif ($self->{nc} == 0x0022) { # "
7512            $self->{ca}->{value} = '';
7513            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7514            
7515        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7516          $self->{line_prev} = $self->{line};
7517          $self->{column_prev} = $self->{column};
7518          $self->{column}++;
7519          $self->{nc}
7520              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7521        } else {
7522          $self->{set_nc}->($self);
7523        }
7524      
7525            redo A;
7526          } elsif ($self->{nc} == 0x0027) { # '
7527            $self->{ca}->{value} = '';
7528            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7529            
7530        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7531          $self->{line_prev} = $self->{line};
7532          $self->{column_prev} = $self->{column};
7533          $self->{column}++;
7534          $self->{nc}
7535              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7536        } else {
7537          $self->{set_nc}->($self);
7538        }
7539      
7540            redo A;
7541          } elsif ($self->{nc} == 0x003E) { # >
7542            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7543            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7544            
7545        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7546          $self->{line_prev} = $self->{line};
7547          $self->{column_prev} = $self->{column};
7548          $self->{column}++;
7549          $self->{nc}
7550              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7551        } else {
7552          $self->{set_nc}->($self);
7553        }
7554      
7555            return  ($self->{ct}); # ATTLIST
7556            redo A;
7557          } elsif ($self->{nc} == -1) {
7558            ## XML5: No parse error.
7559            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7560            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7561            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7562            
7563        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7564          $self->{line_prev} = $self->{line};
7565          $self->{column_prev} = $self->{column};
7566          $self->{column}++;
7567          $self->{nc}
7568              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7569        } else {
7570          $self->{set_nc}->($self);
7571        }
7572      
7573            return  ($self->{ct});
7574            redo A;
7575          } else {
7576            ## XML5: Not defined yet.
7577            if ($self->{ca}->{default} eq 'FIXED') {
7578              $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7579            } else {
7580              push @{$self->{ct}->{attrdefs}}, $self->{ca};
7581              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7582            }
7583            ## Reconsume.
7584            redo A;
7585          }
7586        } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7587          if ($is_space->{$self->{nc}} or
7588              $self->{nc} == -1 or
7589              $self->{nc} == 0x003E) { # >
7590            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7591            ## Reconsume.
7592            redo A;
7593          } else {
7594            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7595            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7596            ## Reconsume.
7597            redo A;
7598          }
7599        } elsif ($self->{state} == NDATA_STATE) {
7600          ## ASCII case-insensitive
7601          if ($self->{nc} == [
7602                undef,
7603                0x0044, # D
7604                0x0041, # A
7605                0x0054, # T
7606              ]->[length $self->{kwd}] or
7607              $self->{nc} == [
7608                undef,
7609                0x0064, # d
7610                0x0061, # a
7611                0x0074, # t
7612              ]->[length $self->{kwd}]) {
7613            
7614            ## Stay in the state.
7615            $self->{kwd} .= chr $self->{nc};
7616            
7617        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7618          $self->{line_prev} = $self->{line};
7619          $self->{column_prev} = $self->{column};
7620          $self->{column}++;
7621          $self->{nc}
7622              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7623        } else {
7624          $self->{set_nc}->($self);
7625        }
7626      
7627            redo A;
7628          } elsif ((length $self->{kwd}) == 4 and
7629                   ($self->{nc} == 0x0041 or # A
7630                    $self->{nc} == 0x0061)) { # a
7631            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7632              
7633              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7634                              text => 'NDATA',
7635                              line => $self->{line_prev},
7636                              column => $self->{column_prev} - 4);
7637            } else {
7638              
7639            }
7640            $self->{state} = AFTER_NDATA_STATE;
7641            
7642        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7643          $self->{line_prev} = $self->{line};
7644          $self->{column_prev} = $self->{column};
7645          $self->{column}++;
7646          $self->{nc}
7647              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7648        } else {
7649          $self->{set_nc}->($self);
7650        }
7651      
7652            redo A;
7653          } else {
7654            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7655                            line => $self->{line_prev},
7656                            column => $self->{column_prev} + 1
7657                                - length $self->{kwd});
7658            
7659            $self->{state} = BOGUS_MD_STATE;
7660            ## Reconsume.
7661            redo A;
7662          }
7663        } elsif ($self->{state} == AFTER_NDATA_STATE) {
7664          if ($is_space->{$self->{nc}}) {
7665            $self->{state} = BEFORE_NOTATION_NAME_STATE;
7666            
7667        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7668          $self->{line_prev} = $self->{line};
7669          $self->{column_prev} = $self->{column};
7670          $self->{column}++;
7671          $self->{nc}
7672              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7673        } else {
7674          $self->{set_nc}->($self);
7675        }
7676      
7677            redo A;
7678          } elsif ($self->{nc} == 0x003E) { # >
7679            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7680            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7681            
7682        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7683          $self->{line_prev} = $self->{line};
7684          $self->{column_prev} = $self->{column};
7685          $self->{column}++;
7686          $self->{nc}
7687              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7688        } else {
7689          $self->{set_nc}->($self);
7690        }
7691      
7692            return  ($self->{ct}); # ENTITY
7693            redo A;
7694          } elsif ($self->{nc} == -1) {
7695            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7696            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7697            
7698        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7699          $self->{line_prev} = $self->{line};
7700          $self->{column_prev} = $self->{column};
7701          $self->{column}++;
7702          $self->{nc}
7703              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7704        } else {
7705          $self->{set_nc}->($self);
7706        }
7707      
7708            return  ($self->{ct}); # ENTITY
7709            redo A;
7710          } else {
7711            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7712                            line => $self->{line_prev},
7713                            column => $self->{column_prev} + 1
7714                                - length $self->{kwd});
7715            $self->{state} = BOGUS_MD_STATE;
7716            ## Reconsume.
7717            redo A;
7718          }
7719        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7720          if ($is_space->{$self->{nc}}) {
7721            ## Stay in the state.
7722            
7723        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7724          $self->{line_prev} = $self->{line};
7725          $self->{column_prev} = $self->{column};
7726          $self->{column}++;
7727          $self->{nc}
7728              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7729        } else {
7730          $self->{set_nc}->($self);
7731        }
7732      
7733            redo A;
7734          } elsif ($self->{nc} == 0x003E) { # >
7735            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7736            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7737            
7738        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7739          $self->{line_prev} = $self->{line};
7740          $self->{column_prev} = $self->{column};
7741          $self->{column}++;
7742          $self->{nc}
7743              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7744        } else {
7745          $self->{set_nc}->($self);
7746        }
7747      
7748            return  ($self->{ct}); # ENTITY
7749            redo A;
7750          } elsif ($self->{nc} == -1) {
7751            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7752            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7753            
7754        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7755          $self->{line_prev} = $self->{line};
7756          $self->{column_prev} = $self->{column};
7757          $self->{column}++;
7758          $self->{nc}
7759              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7760        } else {
7761          $self->{set_nc}->($self);
7762        }
7763      
7764            return  ($self->{ct}); # ENTITY
7765            redo A;
7766          } else {
7767            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7768            $self->{state} = NOTATION_NAME_STATE;
7769            
7770        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7771          $self->{line_prev} = $self->{line};
7772          $self->{column_prev} = $self->{column};
7773          $self->{column}++;
7774          $self->{nc}
7775              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7776        } else {
7777          $self->{set_nc}->($self);
7778        }
7779      
7780            redo A;
7781          }
7782        } elsif ($self->{state} == NOTATION_NAME_STATE) {
7783          if ($is_space->{$self->{nc}}) {
7784            $self->{state} = AFTER_MD_DEF_STATE;
7785            
7786        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7787          $self->{line_prev} = $self->{line};
7788          $self->{column_prev} = $self->{column};
7789          $self->{column}++;
7790          $self->{nc}
7791              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7792        } else {
7793          $self->{set_nc}->($self);
7794        }
7795      
7796            redo A;
7797          } elsif ($self->{nc} == 0x003E) { # >
7798            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7799            
7800        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7801          $self->{line_prev} = $self->{line};
7802          $self->{column_prev} = $self->{column};
7803          $self->{column}++;
7804          $self->{nc}
7805              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7806        } else {
7807          $self->{set_nc}->($self);
7808        }
7809      
7810            return  ($self->{ct}); # ENTITY
7811            redo A;
7812          } elsif ($self->{nc} == -1) {
7813            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7814            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7815            
7816        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7817          $self->{line_prev} = $self->{line};
7818          $self->{column_prev} = $self->{column};
7819          $self->{column}++;
7820          $self->{nc}
7821              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7822        } else {
7823          $self->{set_nc}->($self);
7824        }
7825      
7826            return  ($self->{ct}); # ENTITY
7827            redo A;
7828          } else {
7829            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7830            ## Stay in the state.
7831            
7832        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7833          $self->{line_prev} = $self->{line};
7834          $self->{column_prev} = $self->{column};
7835          $self->{column}++;
7836          $self->{nc}
7837              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7838        } else {
7839          $self->{set_nc}->($self);
7840        }
7841      
7842            redo A;
7843          }
7844        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7845          if ($self->{nc} == 0x0022) { # "
7846            $self->{state} = AFTER_MD_DEF_STATE;
7847            
7848        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7849          $self->{line_prev} = $self->{line};
7850          $self->{column_prev} = $self->{column};
7851          $self->{column}++;
7852          $self->{nc}
7853              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7854        } else {
7855          $self->{set_nc}->($self);
7856        }
7857      
7858            redo A;
7859          } elsif ($self->{nc} == 0x0026) { # &
7860            $self->{prev_state} = $self->{state};
7861            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7862            $self->{entity_add} = 0x0022; # "
7863            
7864        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7865          $self->{line_prev} = $self->{line};
7866          $self->{column_prev} = $self->{column};
7867          $self->{column}++;
7868          $self->{nc}
7869              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7870        } else {
7871          $self->{set_nc}->($self);
7872        }
7873      
7874            redo A;
7875    ## TODO: %
7876          } elsif ($self->{nc} == -1) {
7877            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7878            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7879            ## Reconsume.
7880            return  ($self->{ct}); # ENTITY
7881            redo A;
7882          } else {
7883            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7884            
7885        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7886          $self->{line_prev} = $self->{line};
7887          $self->{column_prev} = $self->{column};
7888          $self->{column}++;
7889          $self->{nc}
7890              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7891        } else {
7892          $self->{set_nc}->($self);
7893        }
7894      
7895            redo A;
7896          }
7897        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7898          if ($self->{nc} == 0x0027) { # '
7899            $self->{state} = AFTER_MD_DEF_STATE;
7900            
7901        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7902          $self->{line_prev} = $self->{line};
7903          $self->{column_prev} = $self->{column};
7904          $self->{column}++;
7905          $self->{nc}
7906              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7907        } else {
7908          $self->{set_nc}->($self);
7909        }
7910      
7911            redo A;
7912          } elsif ($self->{nc} == 0x0026) { # &
7913            $self->{prev_state} = $self->{state};
7914            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7915            $self->{entity_add} = 0x0027; # '
7916            
7917        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7918          $self->{line_prev} = $self->{line};
7919          $self->{column_prev} = $self->{column};
7920          $self->{column}++;
7921          $self->{nc}
7922              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7923        } else {
7924          $self->{set_nc}->($self);
7925        }
7926      
7927            redo A;
7928    ## TODO: %
7929          } elsif ($self->{nc} == -1) {
7930            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7931            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7932            ## Reconsume.
7933            return  ($self->{ct}); # ENTITY
7934            redo A;
7935          } else {
7936            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7937            
7938        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7939          $self->{line_prev} = $self->{line};
7940          $self->{column_prev} = $self->{column};
7941          $self->{column}++;
7942          $self->{nc}
7943              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7944        } else {
7945          $self->{set_nc}->($self);
7946        }
7947      
7948            redo A;
7949          }
7950        } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
7951          if ($is_space->{$self->{nc}} or
7952              {
7953                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7954                $self->{entity_add} => 1,
7955              }->{$self->{nc}}) {
7956            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
7957                            line => $self->{line_prev},
7958                            column => $self->{column_prev}
7959                                + ($self->{nc} == -1 ? 1 : 0));
7960            ## Don't consume
7961            ## Return nothing.
7962            #
7963          } elsif ($self->{nc} == 0x0023) { # #
7964            $self->{ca} = $self->{ct};
7965            $self->{state} = ENTITY_HASH_STATE;
7966            $self->{kwd} = '#';
7967            
7968        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7969          $self->{line_prev} = $self->{line};
7970          $self->{column_prev} = $self->{column};
7971          $self->{column}++;
7972          $self->{nc}
7973              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7974        } else {
7975          $self->{set_nc}->($self);
7976        }
7977      
7978            redo A;
7979          } else {
7980            #
7981          }
7982    
7983          $self->{ct}->{value} .= '&';
7984          $self->{state} = $self->{prev_state};
7985          ## Reconsume.
7986          redo A;
7987        } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
7988          if ($is_space->{$self->{nc}}) {
7989            $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
7990            
7991        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7992          $self->{line_prev} = $self->{line};
7993          $self->{column_prev} = $self->{column};
7994          $self->{column}++;
7995          $self->{nc}
7996              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7997        } else {
7998          $self->{set_nc}->($self);
7999        }
8000      
8001            redo A;
8002          } elsif ($self->{nc} == 0x0028) { # (
8003            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8004            $self->{ct}->{content} = ['('];
8005            $self->{group_depth} = 1;
8006            
8007        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8008          $self->{line_prev} = $self->{line};
8009          $self->{column_prev} = $self->{column};
8010          $self->{column}++;
8011          $self->{nc}
8012              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8013        } else {
8014          $self->{set_nc}->($self);
8015        }
8016      
8017            redo A;
8018          } elsif ($self->{nc} == 0x003E) { # >
8019            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8020            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8021            
8022        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8023          $self->{line_prev} = $self->{line};
8024          $self->{column_prev} = $self->{column};
8025          $self->{column}++;
8026          $self->{nc}
8027              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8028        } else {
8029          $self->{set_nc}->($self);
8030        }
8031      
8032            return  ($self->{ct}); # ELEMENT
8033            redo A;
8034          } elsif ($self->{nc} == -1) {
8035            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8036            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8037            
8038        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8039          $self->{line_prev} = $self->{line};
8040          $self->{column_prev} = $self->{column};
8041          $self->{column}++;
8042          $self->{nc}
8043              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8044        } else {
8045          $self->{set_nc}->($self);
8046        }
8047      
8048            return  ($self->{ct}); # ELEMENT
8049            redo A;
8050          } else {
8051            $self->{ct}->{content} = [chr $self->{nc}];
8052            $self->{state} = CONTENT_KEYWORD_STATE;
8053            
8054        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8055          $self->{line_prev} = $self->{line};
8056          $self->{column_prev} = $self->{column};
8057          $self->{column}++;
8058          $self->{nc}
8059              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8060        } else {
8061          $self->{set_nc}->($self);
8062        }
8063      
8064            redo A;
8065          }
8066        } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8067          if ($is_space->{$self->{nc}}) {
8068            $self->{state} = AFTER_MD_DEF_STATE;
8069            
8070        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8071          $self->{line_prev} = $self->{line};
8072          $self->{column_prev} = $self->{column};
8073          $self->{column}++;
8074          $self->{nc}
8075              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8076        } else {
8077          $self->{set_nc}->($self);
8078        }
8079      
8080            redo A;
8081          } elsif ($self->{nc} == 0x003E) { # >
8082            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8083            
8084        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8085          $self->{line_prev} = $self->{line};
8086          $self->{column_prev} = $self->{column};
8087          $self->{column}++;
8088          $self->{nc}
8089              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8090        } else {
8091          $self->{set_nc}->($self);
8092        }
8093      
8094            return  ($self->{ct}); # ELEMENT
8095            redo A;
8096          } elsif ($self->{nc} == -1) {
8097            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8098            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8099            
8100        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8101          $self->{line_prev} = $self->{line};
8102          $self->{column_prev} = $self->{column};
8103          $self->{column}++;
8104          $self->{nc}
8105              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8106        } else {
8107          $self->{set_nc}->($self);
8108        }
8109      
8110            return  ($self->{ct}); # ELEMENT
8111            redo A;
8112          } else {
8113            $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8114            ## Stay in the state.
8115            
8116        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8117          $self->{line_prev} = $self->{line};
8118          $self->{column_prev} = $self->{column};
8119          $self->{column}++;
8120          $self->{nc}
8121              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8122        } else {
8123          $self->{set_nc}->($self);
8124        }
8125      
8126            redo A;
8127          }
8128        } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8129          if ($is_space->{$self->{nc}}) {
8130            ## Stay in the state.
8131            
8132        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8133          $self->{line_prev} = $self->{line};
8134          $self->{column_prev} = $self->{column};
8135          $self->{column}++;
8136          $self->{nc}
8137              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8138        } else {
8139          $self->{set_nc}->($self);
8140        }
8141      
8142            redo A;
8143          } elsif ($self->{nc} == 0x0028) { # (
8144            $self->{group_depth}++;
8145            push @{$self->{ct}->{content}}, chr $self->{nc};
8146            ## Stay in the state.
8147            
8148        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8149          $self->{line_prev} = $self->{line};
8150          $self->{column_prev} = $self->{column};
8151          $self->{column}++;
8152          $self->{nc}
8153              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8154        } else {
8155          $self->{set_nc}->($self);
8156        }
8157      
8158            redo A;
8159          } elsif ($self->{nc} == 0x007C or # |
8160                   $self->{nc} == 0x002C) { # ,
8161            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8162            ## Stay in the state.
8163            
8164        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8165          $self->{line_prev} = $self->{line};
8166          $self->{column_prev} = $self->{column};
8167          $self->{column}++;
8168          $self->{nc}
8169              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8170        } else {
8171          $self->{set_nc}->($self);
8172        }
8173      
8174            redo A;
8175          } elsif ($self->{nc} == 0x0029) { # )
8176            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8177            push @{$self->{ct}->{content}}, chr $self->{nc};
8178            $self->{group_depth}--;
8179            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8180            
8181        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8182          $self->{line_prev} = $self->{line};
8183          $self->{column_prev} = $self->{column};
8184          $self->{column}++;
8185          $self->{nc}
8186              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8187        } else {
8188          $self->{set_nc}->($self);
8189        }
8190      
8191            redo A;
8192          } elsif ($self->{nc} == 0x003E) { # >
8193            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8194            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8195            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8196            
8197        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8198          $self->{line_prev} = $self->{line};
8199          $self->{column_prev} = $self->{column};
8200          $self->{column}++;
8201          $self->{nc}
8202              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8203        } else {
8204          $self->{set_nc}->($self);
8205        }
8206      
8207            return  ($self->{ct}); # ELEMENT
8208            redo A;
8209          } elsif ($self->{nc} == -1) {
8210            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8211            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8212            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8213            
8214        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8215          $self->{line_prev} = $self->{line};
8216          $self->{column_prev} = $self->{column};
8217          $self->{column}++;
8218          $self->{nc}
8219              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8220        } else {
8221          $self->{set_nc}->($self);
8222        }
8223      
8224            return  ($self->{ct}); # ELEMENT
8225            redo A;
8226          } else {
8227            push @{$self->{ct}->{content}}, chr $self->{nc};
8228            $self->{state} = CM_ELEMENT_NAME_STATE;
8229            
8230        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8231          $self->{line_prev} = $self->{line};
8232          $self->{column_prev} = $self->{column};
8233          $self->{column}++;
8234          $self->{nc}
8235              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8236        } else {
8237          $self->{set_nc}->($self);
8238        }
8239      
8240            redo A;
8241          }
8242        } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8243          if ($is_space->{$self->{nc}}) {
8244            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8245            
8246        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8247          $self->{line_prev} = $self->{line};
8248          $self->{column_prev} = $self->{column};
8249          $self->{column}++;
8250          $self->{nc}
8251              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8252        } else {
8253          $self->{set_nc}->($self);
8254        }
8255      
8256            redo A;
8257          } elsif ($self->{nc} == 0x002A or # *
8258                   $self->{nc} == 0x002B or # +
8259                   $self->{nc} == 0x003F) { # ?
8260            push @{$self->{ct}->{content}}, chr $self->{nc};
8261            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8262            
8263        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8264          $self->{line_prev} = $self->{line};
8265          $self->{column_prev} = $self->{column};
8266          $self->{column}++;
8267          $self->{nc}
8268              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8269        } else {
8270          $self->{set_nc}->($self);
8271        }
8272      
8273            redo A;
8274          } elsif ($self->{nc} == 0x007C or # |
8275                   $self->{nc} == 0x002C) { # ,
8276            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8277            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8278            
8279        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8280          $self->{line_prev} = $self->{line};
8281          $self->{column_prev} = $self->{column};
8282          $self->{column}++;
8283          $self->{nc}
8284              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8285        } else {
8286          $self->{set_nc}->($self);
8287        }
8288      
8289            redo A;
8290          } elsif ($self->{nc} == 0x0029) { # )
8291            $self->{group_depth}--;
8292            push @{$self->{ct}->{content}}, chr $self->{nc};
8293            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8294            
8295        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8296          $self->{line_prev} = $self->{line};
8297          $self->{column_prev} = $self->{column};
8298          $self->{column}++;
8299          $self->{nc}
8300              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8301        } else {
8302          $self->{set_nc}->($self);
8303        }
8304      
8305            redo A;
8306          } elsif ($self->{nc} == 0x003E) { # >
8307            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8308            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8309            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8310            
8311        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8312          $self->{line_prev} = $self->{line};
8313          $self->{column_prev} = $self->{column};
8314          $self->{column}++;
8315          $self->{nc}
8316              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8317        } else {
8318          $self->{set_nc}->($self);
8319        }
8320      
8321            return  ($self->{ct}); # ELEMENT
8322            redo A;
8323          } elsif ($self->{nc} == -1) {
8324            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8325            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8326            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8327            
8328        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8329          $self->{line_prev} = $self->{line};
8330          $self->{column_prev} = $self->{column};
8331          $self->{column}++;
8332          $self->{nc}
8333              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8334        } else {
8335          $self->{set_nc}->($self);
8336        }
8337      
8338            return  ($self->{ct}); # ELEMENT
8339            redo A;
8340          } else {
8341            $self->{ct}->{content}->[-1] .= chr $self->{nc};
8342            ## Stay in the state.
8343            
8344        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8345          $self->{line_prev} = $self->{line};
8346          $self->{column_prev} = $self->{column};
8347          $self->{column}++;
8348          $self->{nc}
8349              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8350        } else {
8351          $self->{set_nc}->($self);
8352        }
8353      
8354            redo A;
8355          }
8356        } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8357          if ($is_space->{$self->{nc}}) {
8358            ## Stay in the state.
8359            
8360        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8361          $self->{line_prev} = $self->{line};
8362          $self->{column_prev} = $self->{column};
8363          $self->{column}++;
8364          $self->{nc}
8365              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8366        } else {
8367          $self->{set_nc}->($self);
8368        }
8369      
8370            redo A;
8371          } elsif ($self->{nc} == 0x007C or # |
8372                   $self->{nc} == 0x002C) { # ,
8373            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8374            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8375            
8376        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8377          $self->{line_prev} = $self->{line};
8378          $self->{column_prev} = $self->{column};
8379          $self->{column}++;
8380          $self->{nc}
8381              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8382        } else {
8383          $self->{set_nc}->($self);
8384        }
8385      
8386            redo A;
8387          } elsif ($self->{nc} == 0x0029) { # )
8388            $self->{group_depth}--;
8389            push @{$self->{ct}->{content}}, chr $self->{nc};
8390            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8391            
8392        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8393          $self->{line_prev} = $self->{line};
8394          $self->{column_prev} = $self->{column};
8395          $self->{column}++;
8396          $self->{nc}
8397              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8398        } else {
8399          $self->{set_nc}->($self);
8400        }
8401      
8402            redo A;
8403          } elsif ($self->{nc} == 0x003E) { # >
8404            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8405            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8406            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8407            
8408        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8409          $self->{line_prev} = $self->{line};
8410          $self->{column_prev} = $self->{column};
8411          $self->{column}++;
8412          $self->{nc}
8413              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8414        } else {
8415          $self->{set_nc}->($self);
8416        }
8417      
8418            return  ($self->{ct}); # ELEMENT
8419            redo A;
8420          } elsif ($self->{nc} == -1) {
8421            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8422            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8423            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8424            
8425        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8426          $self->{line_prev} = $self->{line};
8427          $self->{column_prev} = $self->{column};
8428          $self->{column}++;
8429          $self->{nc}
8430              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8431        } else {
8432          $self->{set_nc}->($self);
8433        }
8434      
8435            return  ($self->{ct}); # ELEMENT
8436            redo A;
8437          } else {
8438            $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8439            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8440            $self->{state} = BOGUS_MD_STATE;
8441            
8442        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8443          $self->{line_prev} = $self->{line};
8444          $self->{column_prev} = $self->{column};
8445          $self->{column}++;
8446          $self->{nc}
8447              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8448        } else {
8449          $self->{set_nc}->($self);
8450        }
8451      
8452            redo A;
8453          }
8454        } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8455          if ($is_space->{$self->{nc}}) {
8456            if ($self->{group_depth}) {
8457              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8458            } else {
8459              $self->{state} = AFTER_MD_DEF_STATE;
8460            }
8461            
8462        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8463          $self->{line_prev} = $self->{line};
8464          $self->{column_prev} = $self->{column};
8465          $self->{column}++;
8466          $self->{nc}
8467              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8468        } else {
8469          $self->{set_nc}->($self);
8470        }
8471      
8472            redo A;
8473          } elsif ($self->{nc} == 0x002A or # *
8474                   $self->{nc} == 0x002B or # +
8475                   $self->{nc} == 0x003F) { # ?
8476            push @{$self->{ct}->{content}}, chr $self->{nc};
8477            if ($self->{group_depth}) {
8478              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8479            } else {
8480              $self->{state} = AFTER_MD_DEF_STATE;
8481            }
8482            
8483        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8484          $self->{line_prev} = $self->{line};
8485          $self->{column_prev} = $self->{column};
8486          $self->{column}++;
8487          $self->{nc}
8488              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8489        } else {
8490          $self->{set_nc}->($self);
8491        }
8492      
8493            redo A;
8494          } elsif ($self->{nc} == 0x0029) { # )
8495            if ($self->{group_depth}) {
8496              $self->{group_depth}--;
8497              push @{$self->{ct}->{content}}, chr $self->{nc};
8498              ## Stay in the state.
8499              
8500        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8501          $self->{line_prev} = $self->{line};
8502          $self->{column_prev} = $self->{column};
8503          $self->{column}++;
8504          $self->{nc}
8505              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8506        } else {
8507          $self->{set_nc}->($self);
8508        }
8509      
8510              redo A;
8511            } else {
8512              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8513              $self->{state} = BOGUS_MD_STATE;
8514              ## Reconsume.
8515              redo A;
8516            }
8517          } elsif ($self->{nc} == 0x003E) { # >
8518            if ($self->{group_depth}) {
8519              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8520              push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8521            }
8522            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8523            
8524        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8525          $self->{line_prev} = $self->{line};
8526          $self->{column_prev} = $self->{column};
8527          $self->{column}++;
8528          $self->{nc}
8529              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8530        } else {
8531          $self->{set_nc}->($self);
8532        }
8533      
8534            return  ($self->{ct}); # ELEMENT
8535            redo A;
8536          } elsif ($self->{nc} == -1) {
8537            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8538            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8539            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8540            
8541        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8542          $self->{line_prev} = $self->{line};
8543          $self->{column_prev} = $self->{column};
8544          $self->{column}++;
8545          $self->{nc}
8546              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8547        } else {
8548          $self->{set_nc}->($self);
8549        }
8550      
8551            return  ($self->{ct}); # ELEMENT
8552            redo A;
8553          } else {
8554            if ($self->{group_depth}) {
8555              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8556            } else {
8557              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8558              $self->{state} = BOGUS_MD_STATE;
8559            }
8560            ## Reconsume.
8561            redo A;
8562          }
8563        } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8564          if ($is_space->{$self->{nc}}) {
8565            ## Stay in the state.
8566            
8567        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8568          $self->{line_prev} = $self->{line};
8569          $self->{column_prev} = $self->{column};
8570          $self->{column}++;
8571          $self->{nc}
8572              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8573        } else {
8574          $self->{set_nc}->($self);
8575        }
8576      
8577            redo A;
8578          } elsif ($self->{nc} == 0x003E) { # >
8579            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8580            
8581        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8582          $self->{line_prev} = $self->{line};
8583          $self->{column_prev} = $self->{column};
8584          $self->{column}++;
8585          $self->{nc}
8586              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8587        } else {
8588          $self->{set_nc}->($self);
8589        }
8590      
8591            return  ($self->{ct}); # ENTITY/ELEMENT
8592            redo A;
8593          } elsif ($self->{nc} == -1) {
8594            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8595            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8596            
8597        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8598          $self->{line_prev} = $self->{line};
8599          $self->{column_prev} = $self->{column};
8600          $self->{column}++;
8601          $self->{nc}
8602              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8603        } else {
8604          $self->{set_nc}->($self);
8605        }
8606      
8607            return  ($self->{ct}); # ENTITY/ELEMENT
8608            redo A;
8609          } else {
8610            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8611            $self->{state} = BOGUS_MD_STATE;
8612            ## Reconsume.
8613            redo A;
8614          }
8615        } elsif ($self->{state} == BOGUS_MD_STATE) {
8616          if ($self->{nc} == 0x003E) { # >
8617            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8618            
8619        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8620          $self->{line_prev} = $self->{line};
8621          $self->{column_prev} = $self->{column};
8622          $self->{column}++;
8623          $self->{nc}
8624              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8625        } else {
8626          $self->{set_nc}->($self);
8627        }
8628      
8629            return  ($self->{ct}); # ATTLIST/ELEMENT/ENTITY/NOTATION
8630            redo A;
8631          } elsif ($self->{nc} == -1) {
8632            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8633            ## Reconsume.
8634            return  ($self->{ct}); # ATTLIST/ELEMENT/ENTITY/NOTATION
8635            redo A;
8636          } else {
8637            ## Stay in the state.
8638            
8639        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8640          $self->{line_prev} = $self->{line};
8641          $self->{column_prev} = $self->{column};
8642          $self->{column}++;
8643          $self->{nc}
8644              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8645        } else {
8646          $self->{set_nc}->($self);
8647        }
8648      
8649            redo A;
8650          }
8651      } else {      } else {
8652        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
8653      }      }
# Line 4252  sub _get_next_token ($) { Line 8658  sub _get_next_token ($) {
8658    
8659  1;  1;
8660  ## $Date$  ## $Date$
8661                                    

Legend:
Removed from v.1.5  
changed lines
  Added in v.1.25

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24