/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.7 by wakaba, Tue Oct 14 15:25:50 2008 UTC revision 1.33 by wakaba, Sat Sep 5 10:41:07 2009 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24    );    );
25        
26    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 33  BEGIN {
33        CHARACTER_TOKEN        CHARACTER_TOKEN
34        PI_TOKEN        PI_TOKEN
35        ABORT_TOKEN        ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42      )],      )],
43    );    );
44  }  }
45    
46    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48  ## Token types  ## Token types
49    
50  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
52  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
53  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
54  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
55  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
56  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
57  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65    ## XML5: XML5 has "empty tag token".  In this implementation, it is
66    ## represented as a start tag token with $self->{self_closing} flag
67    ## set to true.
68    
69    ## XML5: XML5 has "short end tag token".  In this implementation, it
70    ## is represented as an end tag token with $token->{tag_name} set
71    ## to an empty string.
72    
73  package Whatpm::HTML;  package Whatpm::HTML;
74    
# Line 77  sub COMMENT_START_STATE () { 14 } Line 105  sub COMMENT_START_STATE () { 14 }
105  sub COMMENT_START_DASH_STATE () { 15 }  sub COMMENT_START_DASH_STATE () { 15 }
106  sub COMMENT_STATE () { 16 }  sub COMMENT_STATE () { 16 }
107  sub COMMENT_END_STATE () { 17 }  sub COMMENT_END_STATE () { 17 }
108    sub COMMENT_END_BANG_STATE () { 102 }
109    sub COMMENT_END_SPACE_STATE () { 103 } ## LAST
110  sub COMMENT_END_DASH_STATE () { 18 }  sub COMMENT_END_DASH_STATE () { 18 }
111  sub BOGUS_COMMENT_STATE () { 19 }  sub BOGUS_COMMENT_STATE () { 19 }
112  sub DOCTYPE_STATE () { 20 }  sub DOCTYPE_STATE () { 20 }
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 144  sub HEXREF_HEX_STATE () { 48 }
144  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
145  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
146    
147    ## XML-only states
148    sub PI_STATE () { 51 }
149    sub PI_TARGET_STATE () { 52 }
150    sub PI_TARGET_AFTER_STATE () { 53 }
151    sub PI_DATA_STATE () { 54 }
152    sub PI_AFTER_STATE () { 55 }
153    sub PI_DATA_AFTER_STATE () { 56 }
154    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
155    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
156    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
157    sub DOCTYPE_TAG_STATE () { 60 }
158    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
159    sub MD_ATTLIST_STATE () { 62 }
160    sub MD_E_STATE () { 63 }
161    sub MD_ELEMENT_STATE () { 64 }
162    sub MD_ENTITY_STATE () { 65 }
163    sub MD_NOTATION_STATE () { 66 }
164    sub DOCTYPE_MD_STATE () { 67 }
165    sub BEFORE_MD_NAME_STATE () { 68 }
166    sub MD_NAME_STATE () { 69 }
167    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
168    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
169    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
170    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
171    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
172    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
173    sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
174    sub ALLOWED_TOKEN_STATE () { 77 }
175    sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
176    sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
177    sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
178    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
179    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
180    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
181    sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
182    sub BEFORE_NDATA_STATE () { 85 }
183    sub NDATA_STATE () { 86 }
184    sub AFTER_NDATA_STATE () { 87 }
185    sub BEFORE_NOTATION_NAME_STATE () { 88 }
186    sub NOTATION_NAME_STATE () { 89 }
187    sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
188    sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
189    sub ENTITY_VALUE_ENTITY_STATE () { 92 }
190    sub AFTER_ELEMENT_NAME_STATE () { 93 }
191    sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
192    sub CONTENT_KEYWORD_STATE () { 95 }
193    sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
194    sub CM_ELEMENT_NAME_STATE () { 97 }
195    sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
196    sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
197    sub AFTER_MD_DEF_STATE () { 100 }
198    sub BOGUS_MD_STATE () { 101 }
199    
200  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
201  ## list and descriptions)  ## list and descriptions)
202    
# Line 178  sub _initialize_tokenizer ($) { Line 261  sub _initialize_tokenizer ($) {
261    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
262    
263    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
264    $self->{s_kwd} = ''; # state keyword    $self->{s_kwd} = ''; # Data state keyword
265      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
266    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
267    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
268    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 208  sub _initialize_tokenizer ($) { Line 292  sub _initialize_tokenizer ($) {
292    
293  ## A token has:  ## A token has:
294  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
295  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
296  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
297  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
298    ##   ->{target} (PI_TOKEN)
299  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
300  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
301  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 218  sub _initialize_tokenizer ($) { Line 303  sub _initialize_tokenizer ($) {
303  ##        ->{name}  ##        ->{name}
304  ##        ->{value}  ##        ->{value}
305  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
306  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
307    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
308  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
309    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
310    ##   ->{has_internal_subset} == 1 or 0 (DOCTYPE_TOKEN)
311    
312  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
313  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
314  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 239  my $is_space = { Line 328  my $is_space = {
328    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
329    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
330    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
331    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
332    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
333    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
334  };  };
# Line 499  sub _get_next_token ($) { Line 588  sub _get_next_token ($) {
588        return  ($token);        return  ($token);
589        redo A;        redo A;
590      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
591          ## XML5: "tag state".
592    
593        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
594          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
595                        
# Line 517  sub _get_next_token ($) { Line 608  sub _get_next_token ($) {
608            redo A;            redo A;
609          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
610                        
611            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
612            #            #
613          } else {          } else {
614                        
615              $self->{s_kwd} = '';
616            #            #
617          }          }
618    
619          ## reconsume          ## reconsume
620          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
         $self->{s_kwd} = '';  
621          return  ({type => CHARACTER_TOKEN, data => '<',          return  ({type => CHARACTER_TOKEN, data => '<',
622                    line => $self->{line_prev},                    line => $self->{line_prev},
623                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 630  sub _get_next_token ($) { Line 721  sub _get_next_token ($) {
721    
722            redo A;            redo A;
723          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
724                        if ($self->{is_xml}) {
725            $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',              
726                            line => $self->{line_prev},              $self->{state} = PI_STATE;
727                            column => $self->{column_prev});              
728            $self->{state} = BOGUS_COMMENT_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
729            $self->{ct} = {type => COMMENT_TOKEN, data => '',        $self->{line_prev} = $self->{line};
730                                      line => $self->{line_prev},        $self->{column_prev} = $self->{column};
731                                      column => $self->{column_prev},        $self->{column}++;
732                                     };        $self->{nc}
733            ## $self->{nc} is intentionally left as is            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
734            redo A;      } else {
735          } else {        $self->{set_nc}->($self);
736        }
737      
738                redo A;
739              } else {
740                
741                $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
742                                line => $self->{line_prev},
743                                column => $self->{column_prev});
744                $self->{state} = BOGUS_COMMENT_STATE;
745                $self->{ct} = {type => COMMENT_TOKEN, data => '',
746                               line => $self->{line_prev},
747                               column => $self->{column_prev},
748                              };
749                ## $self->{nc} is intentionally left as is
750                redo A;
751              }
752            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
753                        
754            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
755                            line => $self->{line_prev},                            line => $self->{line_prev},
# Line 656  sub _get_next_token ($) { Line 764  sub _get_next_token ($) {
764                     });                     });
765    
766            redo A;            redo A;
767            } else {
768              ## XML5: "<:" is a parse error.
769              
770              $self->{ct} = {type => START_TAG_TOKEN,
771                                        tag_name => chr ($self->{nc}),
772                                        line => $self->{line_prev},
773                                        column => $self->{column_prev}};
774              $self->{state} = TAG_NAME_STATE;
775              
776        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
777          $self->{line_prev} = $self->{line};
778          $self->{column_prev} = $self->{column};
779          $self->{column}++;
780          $self->{nc}
781              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
782        } else {
783          $self->{set_nc}->($self);
784        }
785      
786              redo A;
787          }          }
788        } else {        } else {
789          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 664  sub _get_next_token ($) { Line 792  sub _get_next_token ($) {
792        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
793        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
794    
795          ## XML5: "end tag state".
796    
797        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
798        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
799          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
800            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
801            $self->{s_kwd} = '';            $self->{kwd} = '';
802            ## Reconsume.            ## Reconsume.
803            redo A;            redo A;
804          } else {          } else {
# Line 725  sub _get_next_token ($) { Line 855  sub _get_next_token ($) {
855        
856          redo A;          redo A;
857        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
858          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
859                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
860                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
861          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
862          $self->{s_kwd} = '';          $self->{s_kwd} = '';
863                    if ($self->{is_xml}) {
864              
865              ## XML5: No parse error.
866              
867              ## NOTE: This parser raises a parse error, since it supports
868              ## XML1, not XML5.
869    
870              ## NOTE: A short end tag token.
871              my $ct = {type => END_TAG_TOKEN,
872                        tag_name => '',
873                        line => $self->{line_prev},
874                        column => $self->{column_prev} - 1,
875                       };
876              
877        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
878          $self->{line_prev} = $self->{line};
879          $self->{column_prev} = $self->{column};
880          $self->{column}++;
881          $self->{nc}
882              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
883        } else {
884          $self->{set_nc}->($self);
885        }
886      
887              return  ($ct);
888            } else {
889              
890              
891      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
892        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
893        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 742  sub _get_next_token ($) { Line 898  sub _get_next_token ($) {
898        $self->{set_nc}->($self);        $self->{set_nc}->($self);
899      }      }
900        
901            }
902          redo A;          redo A;
903        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
904                    
# Line 755  sub _get_next_token ($) { Line 912  sub _get_next_token ($) {
912                   });                   });
913    
914          redo A;          redo A;
915        } else {        } elsif (not $self->{is_xml} or
916                   $is_space->{$self->{nc}}) {
917                    
918          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
919                            line => $self->{line_prev}, # "<" of "</"
920                            column => $self->{column_prev} - 1);
921          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
922          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
923                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 770  sub _get_next_token ($) { Line 930  sub _get_next_token ($) {
930          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
931          ## "bogus comment state" entry.          ## "bogus comment state" entry.
932          redo A;          redo A;
933          } else {
934            ## XML5: "</:" is a parse error.
935            
936            $self->{ct} = {type => END_TAG_TOKEN,
937                           tag_name => chr ($self->{nc}),
938                           line => $l, column => $c};
939            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
940            
941        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
942          $self->{line_prev} = $self->{line};
943          $self->{column_prev} = $self->{column};
944          $self->{column}++;
945          $self->{nc}
946              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
947        } else {
948          $self->{set_nc}->($self);
949        }
950      
951            redo A;
952        }        }
953      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
954        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
955        if (length $ch) {        if (length $ch) {
956          my $CH = $ch;          my $CH = $ch;
957          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 780  sub _get_next_token ($) { Line 959  sub _get_next_token ($) {
959          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
960                        
961            ## Stay in the state.            ## Stay in the state.
962            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
963                        
964      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
965        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 799  sub _get_next_token ($) { Line 978  sub _get_next_token ($) {
978            $self->{s_kwd} = '';            $self->{s_kwd} = '';
979            ## Reconsume.            ## Reconsume.
980            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
981                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
982                      line => $self->{line_prev},                      line => $self->{line_prev},
983                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
984                     });                     });
985            redo A;            redo A;
986          }          }
# Line 817  sub _get_next_token ($) { Line 996  sub _get_next_token ($) {
996            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
997            $self->{s_kwd} = '';            $self->{s_kwd} = '';
998            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
999                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
1000                      line => $self->{line_prev},                      line => $self->{line_prev},
1001                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
1002                     });                     });
1003            redo A;            redo A;
1004          } else {          } else {
# Line 828  sub _get_next_token ($) { Line 1007  sub _get_next_token ($) {
1007                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
1008                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
1009                   line => $self->{line_prev},                   line => $self->{line_prev},
1010                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
1011            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
1012            ## Reconsume.            ## Reconsume.
1013            redo A;            redo A;
# Line 923  sub _get_next_token ($) { Line 1102  sub _get_next_token ($) {
1102          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1103          # reconsume          # reconsume
1104    
1105          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1106            #return  ($self->{ct}); # start tag or end tag
1107    
1108          redo A;          redo A;
1109        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
# Line 960  sub _get_next_token ($) { Line 1140  sub _get_next_token ($) {
1140          redo A;          redo A;
1141        }        }
1142      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1143          ## XML5: "Tag attribute name before state".
1144    
1145        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1146                    
1147          ## Stay in the state          ## Stay in the state
# Line 1062  sub _get_next_token ($) { Line 1244  sub _get_next_token ($) {
1244          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1245          # reconsume          # reconsume
1246    
1247          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1248            #return  ($self->{ct}); # start tag or end tag
1249    
1250          redo A;          redo A;
1251        } else {        } else {
1252          if ({          if ({
1253               0x0022 => 1, # "               0x0022 => 1, # "
1254               0x0027 => 1, # '               0x0027 => 1, # '
1255                 0x003C => 1, # <
1256               0x003D => 1, # =               0x003D => 1, # =
1257              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1258                        
1259              ## XML5: Not a parse error.
1260            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1261          } else {          } else {
1262                        
1263              ## XML5: ":" raises a parse error and is ignored.
1264          }          }
1265          $self->{ca}          $self->{ca}
1266              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 1095  sub _get_next_token ($) { Line 1281  sub _get_next_token ($) {
1281          redo A;          redo A;
1282        }        }
1283      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1284          ## XML5: "Tag attribute name state".
1285    
1286        my $before_leave = sub {        my $before_leave = sub {
1287          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
1288              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 1105  sub _get_next_token ($) { Line 1293  sub _get_next_token ($) {
1293                        
1294            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
1295              = $self->{ca};              = $self->{ca};
1296              $self->{ca}->{index} = ++$self->{ct}->{last_index};
1297          }          }
1298        }; # $before_leave        }; # $before_leave
1299    
# Line 1141  sub _get_next_token ($) { Line 1330  sub _get_next_token ($) {
1330        
1331          redo A;          redo A;
1332        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1333            if ($self->{is_xml}) {
1334              
1335              ## XML5: Not a parse error.
1336              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1337            } else {
1338              
1339            }
1340    
1341          $before_leave->();          $before_leave->();
1342          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1343                        
# Line 1190  sub _get_next_token ($) { Line 1387  sub _get_next_token ($) {
1387        
1388          redo A;          redo A;
1389        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1390            if ($self->{is_xml}) {
1391              
1392              ## XML5: Not a parse error.
1393              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1394            } else {
1395              
1396            }
1397                    
1398          $before_leave->();          $before_leave->();
1399          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
# Line 1227  sub _get_next_token ($) { Line 1431  sub _get_next_token ($) {
1431          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1432          # reconsume          # reconsume
1433    
1434          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1435            #return  ($self->{ct}); # start tag or end tag
1436    
1437          redo A;          redo A;
1438        } else {        } else {
1439          if ($self->{nc} == 0x0022 or # "          if ({
1440              $self->{nc} == 0x0027) { # '               0x0022 => 1, # "
1441                 0x0027 => 1, # '
1442                 0x003C => 1, # <
1443                }->{$self->{nc}}) {
1444                        
1445              ## XML5: Not a parse error.
1446            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1447          } else {          } else {
1448                        
# Line 1254  sub _get_next_token ($) { Line 1463  sub _get_next_token ($) {
1463          redo A;          redo A;
1464        }        }
1465      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1466          ## XML5: "Tag attribute name after state".
1467          
1468        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1469                    
1470          ## Stay in the state          ## Stay in the state
# Line 1285  sub _get_next_token ($) { Line 1496  sub _get_next_token ($) {
1496        
1497          redo A;          redo A;
1498        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1499            if ($self->{is_xml}) {
1500              
1501              ## XML5: Not a parse error.
1502              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1503            } else {
1504              
1505            }
1506    
1507          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1508                        
1509            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1338  sub _get_next_token ($) { Line 1557  sub _get_next_token ($) {
1557        
1558          redo A;          redo A;
1559        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1560            if ($self->{is_xml}) {
1561              
1562              ## XML5: Not a parse error.
1563              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1564            } else {
1565              
1566            }
1567                    
1568          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1569                    
# Line 1373  sub _get_next_token ($) { Line 1599  sub _get_next_token ($) {
1599          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1600          # reconsume          # reconsume
1601    
1602          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1603            #return  ($self->{ct}); # start tag or end tag
1604    
1605          redo A;          redo A;
1606        } else {        } else {
1607          if ($self->{nc} == 0x0022 or # "          if ($self->{is_xml}) {
             $self->{nc} == 0x0027) { # '  
1608                        
1609              ## XML5: Not a parse error.
1610              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1611            } else {
1612              
1613            }
1614    
1615            if ({
1616                 0x0022 => 1, # "
1617                 0x0027 => 1, # '
1618                 0x003C => 1, # <
1619                }->{$self->{nc}}) {
1620              
1621              ## XML5: Not a parse error.
1622            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1623          } else {          } else {
1624                        
# Line 1403  sub _get_next_token ($) { Line 1642  sub _get_next_token ($) {
1642          redo A;                  redo A;        
1643        }        }
1644      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1645          ## XML5: "Tag attribute value before state".
1646    
1647        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1648                    
1649          ## Stay in the state          ## Stay in the state
# Line 1508  sub _get_next_token ($) { Line 1749  sub _get_next_token ($) {
1749          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1750          ## reconsume          ## reconsume
1751    
1752          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1753            #return  ($self->{ct}); # start tag or end tag
1754    
1755          redo A;          redo A;
1756        } else {        } else {
1757          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1758                        
1759              ## XML5: Not a parse error.
1760            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1761            } elsif ($self->{is_xml}) {
1762              
1763              ## XML5: No parse error.
1764              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1765          } else {          } else {
1766                        
1767          }          }
# Line 1534  sub _get_next_token ($) { Line 1781  sub _get_next_token ($) {
1781          redo A;          redo A;
1782        }        }
1783      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1784          ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1785          ## ATTLIST attribute value double quoted state".
1786          
1787        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1788                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1789          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1790              ## XML5: "DOCTYPE ATTLIST name after state".
1791              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1792              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1793            } else {
1794              
1795              ## XML5: "Tag attribute name before state".
1796              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1797            }
1798                    
1799      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1800        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1551  sub _get_next_token ($) { Line 1809  sub _get_next_token ($) {
1809          redo A;          redo A;
1810        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1811                    
1812            ## XML5: Not defined yet.
1813    
1814          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1815          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1816          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1570  sub _get_next_token ($) { Line 1830  sub _get_next_token ($) {
1830      }      }
1831        
1832          redo A;          redo A;
1833          } elsif ($self->{is_xml} and
1834                   $is_space->{$self->{nc}}) {
1835            
1836            $self->{ca}->{value} .= ' ';
1837            ## Stay in the state.
1838            
1839        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1840          $self->{line_prev} = $self->{line};
1841          $self->{column_prev} = $self->{column};
1842          $self->{column}++;
1843          $self->{nc}
1844              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1845        } else {
1846          $self->{set_nc}->($self);
1847        }
1848      
1849            redo A;
1850        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1851          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1852          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1853                        
1854            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1855    
1856              $self->{state} = DATA_STATE;
1857              $self->{s_kwd} = '';
1858              ## reconsume
1859              return  ($self->{ct}); # start tag
1860              redo A;
1861          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1862            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1863            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1584  sub _get_next_token ($) { Line 1867  sub _get_next_token ($) {
1867              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1868                            
1869            }            }
1870    
1871              $self->{state} = DATA_STATE;
1872              $self->{s_kwd} = '';
1873              ## reconsume
1874    
1875              ## Discard the token.
1876              #return  ($self->{ct}); # end tag
1877    
1878              redo A;
1879            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1880              ## XML5: No parse error above; not defined yet.
1881              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1882              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1883              ## Reconsume.
1884    
1885              ## Discard the token.
1886              #return  ($self->{ct}); # ATTLIST
1887    
1888              redo A;
1889          } else {          } else {
1890            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1891          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
1892        } else {        } else {
1893                    ## XML5 [ATTLIST]: Not defined yet.
1894            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1895              
1896              ## XML5: Not a parse error.
1897              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1898            } else {
1899              
1900            }
1901          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1902          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1903                                q["&],                                qq["&<\x09\x0C\x20],
1904                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1905    
1906          ## Stay in the state          ## Stay in the state
# Line 1616  sub _get_next_token ($) { Line 1918  sub _get_next_token ($) {
1918          redo A;          redo A;
1919        }        }
1920      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1921          ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1922          ## ATTLIST attribute value single quoted state".
1923    
1924        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1925                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1926          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1927              ## XML5: "DOCTYPE ATTLIST name after state".
1928              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1929              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1930            } else {
1931              
1932              ## XML5: "Before attribute name state" (sic).
1933              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1934            }
1935                    
1936      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1937        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1633  sub _get_next_token ($) { Line 1946  sub _get_next_token ($) {
1946          redo A;          redo A;
1947        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1948                    
1949            ## XML5: Not defined yet.
1950    
1951          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1952          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1953          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1652  sub _get_next_token ($) { Line 1967  sub _get_next_token ($) {
1967      }      }
1968        
1969          redo A;          redo A;
1970          } elsif ($self->{is_xml} and
1971                   $is_space->{$self->{nc}}) {
1972            
1973            $self->{ca}->{value} .= ' ';
1974            ## Stay in the state.
1975            
1976        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1977          $self->{line_prev} = $self->{line};
1978          $self->{column_prev} = $self->{column};
1979          $self->{column}++;
1980          $self->{nc}
1981              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1982        } else {
1983          $self->{set_nc}->($self);
1984        }
1985      
1986            redo A;
1987        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1988          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1989          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1990                        
1991            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1992    
1993              $self->{state} = DATA_STATE;
1994              $self->{s_kwd} = '';
1995              ## reconsume
1996    
1997              ## Discard the token.
1998              #return  ($self->{ct}); # start tag
1999    
2000              redo A;
2001          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2002            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2003            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1666  sub _get_next_token ($) { Line 2007  sub _get_next_token ($) {
2007              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2008                            
2009            }            }
2010    
2011              $self->{state} = DATA_STATE;
2012              $self->{s_kwd} = '';
2013              ## reconsume
2014    
2015              ## Discard the token.
2016              #return  ($self->{ct}); # end tag
2017    
2018              redo A;
2019            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2020              ## XML5: No parse error above; not defined yet.
2021              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2022              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2023              ## Reconsume.
2024    
2025              ## Discard the token.
2026              #return  ($self->{ct}); # ATTLIST
2027    
2028              redo A;
2029          } else {          } else {
2030            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2031          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
2032        } else {        } else {
2033                    ## XML5 [ATTLIST]: Not defined yet.
2034            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2035              
2036              ## XML5: Not a parse error.
2037              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2038            } else {
2039              
2040            }
2041          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2042          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2043                                q['&],                                qq['&<\x09\x0C\x20],
2044                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2045    
2046          ## Stay in the state          ## Stay in the state
# Line 1698  sub _get_next_token ($) { Line 2058  sub _get_next_token ($) {
2058          redo A;          redo A;
2059        }        }
2060      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2061          ## XML5: "Tag attribute value unquoted state".
2062    
2063        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2064                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2065          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;            
2066              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2067              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2068            } else {
2069              
2070              ## XML5: "Tag attribute name before state".
2071              $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2072            }
2073                    
2074      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2075        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1715  sub _get_next_token ($) { Line 2084  sub _get_next_token ($) {
2084          redo A;          redo A;
2085        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
2086                    
2087    
2088            ## XML5: Not defined yet.
2089    
2090          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
2091          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
2092          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1738  sub _get_next_token ($) { Line 2110  sub _get_next_token ($) {
2110          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2111                        
2112            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2113    
2114              $self->{state} = DATA_STATE;
2115              $self->{s_kwd} = '';
2116              
2117        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2118          $self->{line_prev} = $self->{line};
2119          $self->{column_prev} = $self->{column};
2120          $self->{column}++;
2121          $self->{nc}
2122              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2123        } else {
2124          $self->{set_nc}->($self);
2125        }
2126      
2127              return  ($self->{ct}); # start tag
2128              redo A;
2129          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2130            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2131            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1747  sub _get_next_token ($) { Line 2135  sub _get_next_token ($) {
2135              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2136                            
2137            }            }
2138          } else {  
2139            die "$0: $self->{ct}->{type}: Unknown token type";            $self->{state} = DATA_STATE;
2140          }            $self->{s_kwd} = '';
2141          $self->{state} = DATA_STATE;            
         $self->{s_kwd} = '';  
           
2142      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2143        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
2144        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 1763  sub _get_next_token ($) { Line 2149  sub _get_next_token ($) {
2149        $self->{set_nc}->($self);        $self->{set_nc}->($self);
2150      }      }
2151        
2152              return  ($self->{ct}); # end tag
2153          return  ($self->{ct}); # start tag or end tag            redo A;
2154            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2155          redo A;            push @{$self->{ct}->{attrdefs}}, $self->{ca};
2156              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2157              
2158        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2159          $self->{line_prev} = $self->{line};
2160          $self->{column_prev} = $self->{column};
2161          $self->{column}++;
2162          $self->{nc}
2163              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2164        } else {
2165          $self->{set_nc}->($self);
2166        }
2167      
2168              return  ($self->{ct}); # ATTLIST
2169              redo A;
2170            } else {
2171              die "$0: $self->{ct}->{type}: Unknown token type";
2172            }
2173        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');  
2174          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2175                        
2176              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2177            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2178    
2179              $self->{state} = DATA_STATE;
2180              $self->{s_kwd} = '';
2181              ## reconsume
2182    
2183              ## Discard the token.
2184              #return  ($self->{ct}); # start tag
2185              
2186              redo A;
2187          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2188              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2189            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2190            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
2191                            
# Line 1781  sub _get_next_token ($) { Line 2194  sub _get_next_token ($) {
2194              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2195                            
2196            }            }
2197    
2198              $self->{state} = DATA_STATE;
2199              $self->{s_kwd} = '';
2200              ## reconsume
2201    
2202              ## Discard the token.
2203              #return  ($self->{ct}); # end tag
2204    
2205              redo A;
2206            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2207              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2208              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2209              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2210              ## Reconsume.
2211    
2212              ## Discard the token.
2213              #return  ($self->{ct}); # ATTLIST
2214    
2215              redo A;
2216          } else {          } else {
2217            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2218          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
2219        } else {        } else {
2220          if ({          if ({
2221               0x0022 => 1, # "               0x0022 => 1, # "
2222               0x0027 => 1, # '               0x0027 => 1, # '
2223               0x003D => 1, # =               0x003D => 1, # =
2224                 0x003C => 1, # <
2225              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2226                        
2227              ## XML5: Not a parse error.
2228            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2229          } else {          } else {
2230                        
2231          }          }
2232          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2233          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2234                                q["'=& >],                                qq["'=& \x09\x0C>],
2235                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2236    
2237          ## Stay in the state          ## Stay in the state
# Line 1904  sub _get_next_token ($) { Line 2331  sub _get_next_token ($) {
2331          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2332          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2333          ## Reconsume.          ## Reconsume.
2334          return  ($self->{ct}); # start tag or end tag  
2335            ## Discard the token.
2336            #return  ($self->{ct}); # start tag or end tag
2337    
2338          redo A;          redo A;
2339        } else {        } else {
2340                    
# Line 1914  sub _get_next_token ($) { Line 2344  sub _get_next_token ($) {
2344          redo A;          redo A;
2345        }        }
2346      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2347          ## XML5: "Empty tag state".
2348    
2349        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2350          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
2351                        
# Line 1965  sub _get_next_token ($) { Line 2397  sub _get_next_token ($) {
2397          } else {          } else {
2398            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2399          }          }
2400            ## XML5: "Tag attribute name before state".
2401          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2402          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2403          ## Reconsume.          ## Reconsume.
2404          return  ($self->{ct}); # start tag or end tag  
2405            ## Discard the token.
2406            #return  ($self->{ct}); # start tag or end tag
2407    
2408          redo A;          redo A;
2409        } else {        } else {
2410                    
# Line 1979  sub _get_next_token ($) { Line 2415  sub _get_next_token ($) {
2415          redo A;          redo A;
2416        }        }
2417      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2418        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2419    
2420        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
2421        ## consumes characters on a one-by-one basis.        ## consumes characters on a one-by-one basis.
2422                
2423        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2424                    if ($self->{in_subset}) {
2425          $self->{state} = DATA_STATE;            
2426          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2427            } else {
2428              
2429              $self->{state} = DATA_STATE;
2430              $self->{s_kwd} = '';
2431            }
2432                    
2433      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2434        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2003  sub _get_next_token ($) { Line 2444  sub _get_next_token ($) {
2444          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
2445          redo A;          redo A;
2446        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2447                    if ($self->{in_subset}) {
2448          $self->{state} = DATA_STATE;            
2449          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2450            } else {
2451              
2452              $self->{state} = DATA_STATE;
2453              $self->{s_kwd} = '';
2454            }
2455          ## reconsume          ## reconsume
2456    
2457          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2032  sub _get_next_token ($) { Line 2478  sub _get_next_token ($) {
2478          redo A;          redo A;
2479        }        }
2480      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2481        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
2482                
2483        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2484                    
# Line 2054  sub _get_next_token ($) { Line 2500  sub _get_next_token ($) {
2500          ## ASCII case-insensitive.          ## ASCII case-insensitive.
2501                    
2502          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
2503          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2504                    
2505      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2506        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2073  sub _get_next_token ($) { Line 2519  sub _get_next_token ($) {
2519                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2520                                                    
2521          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
2522          $self->{s_kwd} = '[';          $self->{kwd} = '[';
2523                    
2524      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2525        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2107  sub _get_next_token ($) { Line 2553  sub _get_next_token ($) {
2553                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2554                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
2555                                   };                                   };
2556          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2557                    
2558      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2559        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2143  sub _get_next_token ($) { Line 2589  sub _get_next_token ($) {
2589              0x0054, # T              0x0054, # T
2590              0x0059, # Y              0x0059, # Y
2591              0x0050, # P              0x0050, # P
2592            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2593            $self->{nc} == [            $self->{nc} == [
2594              undef,              undef,
2595              0x006F, # o              0x006F, # o
# Line 2151  sub _get_next_token ($) { Line 2597  sub _get_next_token ($) {
2597              0x0074, # t              0x0074, # t
2598              0x0079, # y              0x0079, # y
2599              0x0070, # p              0x0070, # p
2600            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2601                    
2602          ## Stay in the state.          ## Stay in the state.
2603          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2604                    
2605      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2606        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2167  sub _get_next_token ($) { Line 2613  sub _get_next_token ($) {
2613      }      }
2614        
2615          redo A;          redo A;
2616        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
2617                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2618                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2619                    if ($self->{is_xml} and
2620                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2621              
2622              ## XML5: case-sensitive.
2623              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2624                              text => 'DOCTYPE',
2625                              line => $self->{line_prev},
2626                              column => $self->{column_prev} - 5);
2627            } else {
2628              
2629            }
2630          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
2631          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
2632                                    quirks => 1,                                    quirks => 1,
# Line 2193  sub _get_next_token ($) { Line 2649  sub _get_next_token ($) {
2649                                    
2650          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2651                          line => $self->{line_prev},                          line => $self->{line_prev},
2652                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2653          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2654          ## Reconsume.          ## Reconsume.
2655          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2656                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2657                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2658                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2659                                   };                                   };
2660          redo A;          redo A;
2661        }        }
# Line 2210  sub _get_next_token ($) { Line 2666  sub _get_next_token ($) {
2666              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
2667              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
2668              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
2669            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
2670                    
2671          ## Stay in the state.          ## Stay in the state.
2672          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2673                    
2674      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2675        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2226  sub _get_next_token ($) { Line 2682  sub _get_next_token ($) {
2682      }      }
2683        
2684          redo A;          redo A;
2685        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
2686                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
           
   
2687          if ($self->{is_xml} and          if ($self->{is_xml} and
2688              not $self->{tainted} and              not $self->{tainted} and
2689              @{$self->{open_elements} or []} == 0) {              @{$self->{open_elements} or []} == 0) {
2690              
2691            $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2692                            line => $self->{line_prev},                            line => $self->{line_prev},
2693                            column => $self->{column_prev} - 7);                            column => $self->{column_prev} - 7);
2694            $self->{tainted} = 1;            $self->{tainted} = 1;
2695            } else {
2696              
2697          }          }
2698    
2699          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
# Line 2260  sub _get_next_token ($) { Line 2717  sub _get_next_token ($) {
2717                    
2718          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2719                          line => $self->{line_prev},                          line => $self->{line_prev},
2720                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2721          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2722          ## Reconsume.          ## Reconsume.
2723          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2724                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2725                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2726                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2727                                   };                                   };
2728          redo A;          redo A;
2729        }        }
# Line 2287  sub _get_next_token ($) { Line 2744  sub _get_next_token ($) {
2744        
2745          redo A;          redo A;
2746        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2747          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2748          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2749          $self->{s_kwd} = '';            
2750              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2751            } else {
2752              
2753              $self->{state} = DATA_STATE;
2754              $self->{s_kwd} = '';
2755            }
2756                    
2757      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2758        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2307  sub _get_next_token ($) { Line 2769  sub _get_next_token ($) {
2769    
2770          redo A;          redo A;
2771        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2772          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2773          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2774          $self->{s_kwd} = '';            
2775              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2776            } else {
2777              
2778              $self->{state} = DATA_STATE;
2779              $self->{s_kwd} = '';
2780            }
2781          ## reconsume          ## reconsume
2782    
2783          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2351  sub _get_next_token ($) { Line 2818  sub _get_next_token ($) {
2818        
2819          redo A;          redo A;
2820        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2821          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2822          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2823          $self->{s_kwd} = '';            
2824              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2825            } else {
2826              
2827              $self->{state} = DATA_STATE;
2828              $self->{s_kwd} = '';
2829            }
2830                    
2831      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2832        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2371  sub _get_next_token ($) { Line 2843  sub _get_next_token ($) {
2843    
2844          redo A;          redo A;
2845        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2846          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2847          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2848          $self->{s_kwd} = '';            
2849              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2850            } else {
2851              
2852              $self->{state} = DATA_STATE;
2853              $self->{s_kwd} = '';
2854            }
2855          ## reconsume          ## reconsume
2856    
2857          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2399  sub _get_next_token ($) { Line 2876  sub _get_next_token ($) {
2876          redo A;          redo A;
2877        }        }
2878      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
2879          ## XML5: "Comment state" and "DOCTYPE comment state".
2880    
2881        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2882                    
2883          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
# Line 2415  sub _get_next_token ($) { Line 2894  sub _get_next_token ($) {
2894        
2895          redo A;          redo A;
2896        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2897          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2898          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2899          $self->{s_kwd} = '';            
2900              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2901            } else {
2902              
2903              $self->{state} = DATA_STATE;
2904              $self->{s_kwd} = '';
2905            }
2906          ## reconsume          ## reconsume
2907    
2908          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2446  sub _get_next_token ($) { Line 2930  sub _get_next_token ($) {
2930          redo A;          redo A;
2931        }        }
2932      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2933          ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2934    
2935        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2936                    
2937          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 2462  sub _get_next_token ($) { Line 2948  sub _get_next_token ($) {
2948        
2949          redo A;          redo A;
2950        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2951          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2952          $self->{s_kwd} = '';          if ($self->{in_subset}) {
2953          $self->{state} = DATA_STATE;            
2954          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2955            } else {
2956              
2957              $self->{state} = DATA_STATE;
2958              $self->{s_kwd} = '';
2959            }
2960          ## reconsume          ## reconsume
2961    
2962          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2489  sub _get_next_token ($) { Line 2979  sub _get_next_token ($) {
2979        
2980          redo A;          redo A;
2981        }        }
2982      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE or
2983                 $self->{state} == COMMENT_END_BANG_STATE) {
2984          ## XML5: "Comment end state" and "DOCTYPE comment end state".
2985          ## (No comment end bang state.)
2986    
2987        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2988                    if ($self->{in_subset}) {
2989          $self->{state} = DATA_STATE;            
2990          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2991            } else {
2992              
2993              $self->{state} = DATA_STATE;
2994              $self->{s_kwd} = '';
2995            }
2996                    
2997      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2998        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2510  sub _get_next_token ($) { Line 3009  sub _get_next_token ($) {
3009    
3010          redo A;          redo A;
3011        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
3012            if ($self->{state} == COMMENT_END_BANG_STATE) {
3013              
3014              $self->{ct}->{data} .= '--!'; # comment
3015              $self->{state} = COMMENT_END_DASH_STATE;
3016            } else {
3017              
3018              ## XML5: Not a parse error.
3019              $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
3020                              line => $self->{line_prev},
3021                              column => $self->{column_prev});
3022              $self->{ct}->{data} .= '-'; # comment
3023              ## Stay in the state
3024            }
3025                    
3026          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3027                          line => $self->{line_prev},        $self->{line_prev} = $self->{line};
3028                          column => $self->{column_prev});        $self->{column_prev} = $self->{column};
3029          $self->{ct}->{data} .= '-'; # comment        $self->{column}++;
3030          ## Stay in the state        $self->{nc}
3031              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3032        } else {
3033          $self->{set_nc}->($self);
3034        }
3035      
3036            redo A;
3037          } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3038                   $is_space->{$self->{nc}}) {
3039            
3040            $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end space'); # XXX error type
3041            $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3042            $self->{state} = COMMENT_END_SPACE_STATE;
3043                    
3044      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3045        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2528  sub _get_next_token ($) { Line 3052  sub _get_next_token ($) {
3052      }      }
3053        
3054          redo A;          redo A;
3055        } elsif ($self->{nc} == -1) {        } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3056                   $self->{nc} == 0x0021) { # !
3057            
3058            $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
3059            $self->{state} = COMMENT_END_BANG_STATE;
3060                    
3061        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3062          $self->{line_prev} = $self->{line};
3063          $self->{column_prev} = $self->{column};
3064          $self->{column}++;
3065          $self->{nc}
3066              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3067        } else {
3068          $self->{set_nc}->($self);
3069        }
3070      
3071            redo A;
3072          } elsif ($self->{nc} == -1) {
3073          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3074          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3075          $self->{s_kwd} = '';            
3076          ## reconsume            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3077            } else {
3078              
3079              $self->{state} = DATA_STATE;
3080              $self->{s_kwd} = '';
3081            }
3082            ## Reconsume.
3083    
3084          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
3085    
3086          redo A;          redo A;
3087        } else {        } else {
3088                    
3089          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          if ($self->{state} == COMMENT_END_BANG_STATE) {
3090                          line => $self->{line_prev},            $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
3091                          column => $self->{column_prev});          } else {
3092          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment            $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3093            }
3094          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
3095                    
3096      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2558  sub _get_next_token ($) { Line 3105  sub _get_next_token ($) {
3105        
3106          redo A;          redo A;
3107        }        }
3108        } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
3109          ## XML5: No such state.
3110    
3111          if ($self->{nc} == 0x003E) { # >
3112            if ($self->{in_subset}) {
3113              
3114              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3115            } else {
3116              
3117              $self->{state} = DATA_STATE;
3118              $self->{s_kwd} = '';
3119            }
3120            
3121        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3122          $self->{line_prev} = $self->{line};
3123          $self->{column_prev} = $self->{column};
3124          $self->{column}++;
3125          $self->{nc}
3126              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3127        } else {
3128          $self->{set_nc}->($self);
3129        }
3130      
3131    
3132            return  ($self->{ct}); # comment
3133    
3134            redo A;
3135          } elsif ($is_space->{$self->{nc}}) {
3136            
3137            $self->{ct}->{data} .= chr ($self->{nc}); # comment
3138            ## Stay in the state.
3139            
3140        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3141          $self->{line_prev} = $self->{line};
3142          $self->{column_prev} = $self->{column};
3143          $self->{column}++;
3144          $self->{nc}
3145              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3146        } else {
3147          $self->{set_nc}->($self);
3148        }
3149      
3150            redo A;
3151          } elsif ($self->{nc} == -1) {
3152            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3153            if ($self->{in_subset}) {
3154              
3155              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3156            } else {
3157              
3158              $self->{state} = DATA_STATE;
3159              $self->{s_kwd} = '';
3160            }
3161            ## Reconsume.
3162    
3163            return  ($self->{ct}); # comment
3164    
3165            redo A;
3166          } else {
3167            
3168            $self->{ct}->{data} .= chr ($self->{nc}); # comment
3169            $self->{state} = COMMENT_STATE;
3170            
3171        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3172          $self->{line_prev} = $self->{line};
3173          $self->{column_prev} = $self->{column};
3174          $self->{column}++;
3175          $self->{nc}
3176              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3177        } else {
3178          $self->{set_nc}->($self);
3179        }
3180      
3181            redo A;
3182          }
3183      } elsif ($self->{state} == DOCTYPE_STATE) {      } elsif ($self->{state} == DOCTYPE_STATE) {
3184        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3185                    
# Line 2574  sub _get_next_token ($) { Line 3196  sub _get_next_token ($) {
3196      }      }
3197        
3198          redo A;          redo A;
3199          } elsif ($self->{nc} == -1) {
3200            
3201            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3202            $self->{ct}->{quirks} = 1;
3203    
3204            $self->{state} = DATA_STATE;
3205            ## Reconsume.
3206            return  ($self->{ct}); # DOCTYPE (quirks)
3207    
3208            redo A;
3209        } else {        } else {
3210                    
3211            ## XML5: Switch to the bogus comment state.
3212          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3213          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3214          ## reconsume          ## reconsume
3215          redo A;          redo A;
3216        }        }
3217      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3218          ## XML5: "DOCTYPE root name before state".
3219    
3220        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3221                    
3222          ## Stay in the state          ## Stay in the state
# Line 2599  sub _get_next_token ($) { Line 3234  sub _get_next_token ($) {
3234          redo A;          redo A;
3235        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3236                    
3237            ## XML5: No parse error.
3238          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3239          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3240          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 2617  sub _get_next_token ($) { Line 3253  sub _get_next_token ($) {
3253          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3254    
3255          redo A;          redo A;
3256          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3257            
3258            $self->{ct}->{name} # DOCTYPE
3259                = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3260            delete $self->{ct}->{quirks};
3261            $self->{state} = DOCTYPE_NAME_STATE;
3262            
3263        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3264          $self->{line_prev} = $self->{line};
3265          $self->{column_prev} = $self->{column};
3266          $self->{column}++;
3267          $self->{nc}
3268              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3269        } else {
3270          $self->{set_nc}->($self);
3271        }
3272      
3273            redo A;
3274        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3275                    
3276          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
# Line 2627  sub _get_next_token ($) { Line 3281  sub _get_next_token ($) {
3281          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3282    
3283          redo A;          redo A;
3284          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3285            
3286            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3287            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3288            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3289            $self->{in_subset} = 1;
3290            
3291        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3292          $self->{line_prev} = $self->{line};
3293          $self->{column_prev} = $self->{column};
3294          $self->{column}++;
3295          $self->{nc}
3296              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3297        } else {
3298          $self->{set_nc}->($self);
3299        }
3300      
3301            return  ($self->{ct}); # DOCTYPE
3302            redo A;
3303        } else {        } else {
3304                    
3305          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 2646  sub _get_next_token ($) { Line 3319  sub _get_next_token ($) {
3319          redo A;          redo A;
3320        }        }
3321      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3322  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
3323    
3324          ## ISSUE: Redundant "First," in the spec.
3325    
3326        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3327                    
3328          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 2681  sub _get_next_token ($) { Line 3357  sub _get_next_token ($) {
3357          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3358    
3359          redo A;          redo A;
3360          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3361            
3362            $self->{ct}->{name} # DOCTYPE
3363                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3364            delete $self->{ct}->{quirks};
3365            ## Stay in the state.
3366            
3367        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3368          $self->{line_prev} = $self->{line};
3369          $self->{column_prev} = $self->{column};
3370          $self->{column}++;
3371          $self->{nc}
3372              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3373        } else {
3374          $self->{set_nc}->($self);
3375        }
3376      
3377            redo A;
3378        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3379                    
3380          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
# Line 2692  sub _get_next_token ($) { Line 3386  sub _get_next_token ($) {
3386          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3387    
3388          redo A;          redo A;
3389          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3390            
3391            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3392            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3393            $self->{in_subset} = 1;
3394            
3395        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3396          $self->{line_prev} = $self->{line};
3397          $self->{column_prev} = $self->{column};
3398          $self->{column}++;
3399          $self->{nc}
3400              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3401        } else {
3402          $self->{set_nc}->($self);
3403        }
3404      
3405            return  ($self->{ct}); # DOCTYPE
3406            redo A;
3407        } else {        } else {
3408                    
3409          $self->{ct}->{name}          $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3410            .= chr ($self->{nc}); # DOCTYPE          ## Stay in the state.
         ## Stay in the state  
3411                    
3412      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3413        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2711  sub _get_next_token ($) { Line 3422  sub _get_next_token ($) {
3422          redo A;          redo A;
3423        }        }
3424      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3425          ## XML5: Corresponding to XML5's "DOCTYPE root name after
3426          ## state", but implemented differently.
3427    
3428        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3429                    
3430          ## Stay in the state          ## Stay in the state
# Line 2727  sub _get_next_token ($) { Line 3441  sub _get_next_token ($) {
3441        
3442          redo A;          redo A;
3443        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3444            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3445              
3446              $self->{state} = DATA_STATE;
3447              $self->{s_kwd} = '';
3448            } else {
3449              
3450              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3451              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3452            }
3453                    
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3454                    
3455      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3456        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2741  sub _get_next_token ($) { Line 3462  sub _get_next_token ($) {
3462        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3463      }      }
3464        
3465            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3466          redo A;          redo A;
3467        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3468            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3469              
3470              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3471              $self->{state} = DATA_STATE;
3472              $self->{s_kwd} = '';
3473              $self->{ct}->{quirks} = 1;
3474            } else {
3475              
3476              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3477              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3478            }
3479                    
3480          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          ## Reconsume.
3481          $self->{state} = DATA_STATE;          return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{s_kwd} = '';  
         ## reconsume  
   
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3482          redo A;          redo A;
3483        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
3484                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
3485            
3486          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
3487          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3488                    
3489      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3490        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2774  sub _get_next_token ($) { Line 3499  sub _get_next_token ($) {
3499          redo A;          redo A;
3500        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
3501                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
3502            
3503          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
3504          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3505                    
3506      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3507        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2788  sub _get_next_token ($) { Line 3514  sub _get_next_token ($) {
3514      }      }
3515        
3516          redo A;          redo A;
3517        } else {        } elsif ($self->{nc} == 0x0022 and # "
3518                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3519                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3520                    
3521          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');          $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3522          $self->{ct}->{quirks} = 1;          $self->{ct}->{value} = ''; # ENTITY
3523            
3524        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3525          $self->{line_prev} = $self->{line};
3526          $self->{column_prev} = $self->{column};
3527          $self->{column}++;
3528          $self->{nc}
3529              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3530        } else {
3531          $self->{set_nc}->($self);
3532        }
3533      
3534            redo A;
3535          } elsif ($self->{nc} == 0x0027 and # '
3536                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3537                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3538            
3539            $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3540            $self->{ct}->{value} = ''; # ENTITY
3541            
3542        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3543          $self->{line_prev} = $self->{line};
3544          $self->{column_prev} = $self->{column};
3545          $self->{column}++;
3546          $self->{nc}
3547              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3548        } else {
3549          $self->{set_nc}->($self);
3550        }
3551      
3552            redo A;
3553          } elsif ($self->{is_xml} and
3554                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3555                   $self->{nc} == 0x005B) { # [
3556            
3557            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3558            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3559            $self->{in_subset} = 1;
3560            
3561        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3562          $self->{line_prev} = $self->{line};
3563          $self->{column_prev} = $self->{column};
3564          $self->{column}++;
3565          $self->{nc}
3566              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3567        } else {
3568          $self->{set_nc}->($self);
3569        }
3570      
3571            return  ($self->{ct}); # DOCTYPE
3572            redo A;
3573          } else {
3574            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3575    
3576            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3577              
3578              $self->{ct}->{quirks} = 1;
3579              $self->{state} = BOGUS_DOCTYPE_STATE;
3580            } else {
3581              
3582              $self->{state} = BOGUS_MD_STATE;
3583            }
3584    
         $self->{state} = BOGUS_DOCTYPE_STATE;  
3585                    
3586      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3587        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2815  sub _get_next_token ($) { Line 3603  sub _get_next_token ($) {
3603              0x0042, # B              0x0042, # B
3604              0x004C, # L              0x004C, # L
3605              0x0049, # I              0x0049, # I
3606            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3607            $self->{nc} == [            $self->{nc} == [
3608              undef,              undef,
3609              0x0075, # u              0x0075, # u
3610              0x0062, # b              0x0062, # b
3611              0x006C, # l              0x006C, # l
3612              0x0069, # i              0x0069, # i
3613            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3614                    
3615          ## Stay in the state.          ## Stay in the state.
3616          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3617                    
3618      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3619        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2838  sub _get_next_token ($) { Line 3626  sub _get_next_token ($) {
3626      }      }
3627        
3628          redo A;          redo A;
3629        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3630                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
3631                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
3632                    if ($self->{is_xml} and
3633                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3634              
3635              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3636                              text => 'PUBLIC',
3637                              line => $self->{line_prev},
3638                              column => $self->{column_prev} - 4);
3639            } else {
3640              
3641            }
3642          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3643                    
3644      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2856  sub _get_next_token ($) { Line 3653  sub _get_next_token ($) {
3653        
3654          redo A;          redo A;
3655        } else {        } else {
3656                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3657                          line => $self->{line_prev},                          line => $self->{line_prev},
3658                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3659          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3660              
3661          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3662              $self->{state} = BOGUS_DOCTYPE_STATE;
3663            } else {
3664              
3665              $self->{state} = BOGUS_MD_STATE;
3666            }
3667          ## Reconsume.          ## Reconsume.
3668          redo A;          redo A;
3669        }        }
# Line 2874  sub _get_next_token ($) { Line 3675  sub _get_next_token ($) {
3675              0x0053, # S              0x0053, # S
3676              0x0054, # T              0x0054, # T
3677              0x0045, # E              0x0045, # E
3678            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3679            $self->{nc} == [            $self->{nc} == [
3680              undef,              undef,
3681              0x0079, # y              0x0079, # y
3682              0x0073, # s              0x0073, # s
3683              0x0074, # t              0x0074, # t
3684              0x0065, # e              0x0065, # e
3685            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3686                    
3687          ## Stay in the state.          ## Stay in the state.
3688          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3689                    
3690      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3691        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2897  sub _get_next_token ($) { Line 3698  sub _get_next_token ($) {
3698      }      }
3699        
3700          redo A;          redo A;
3701        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3702                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
3703                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
3704                    if ($self->{is_xml} and
3705                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3706              
3707              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3708                              text => 'SYSTEM',
3709                              line => $self->{line_prev},
3710                              column => $self->{column_prev} - 4);
3711            } else {
3712              
3713            }
3714          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3715                    
3716      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2915  sub _get_next_token ($) { Line 3725  sub _get_next_token ($) {
3725        
3726          redo A;          redo A;
3727        } else {        } else {
3728                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3729                          line => $self->{line_prev},                          line => $self->{line_prev},
3730                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3731          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3732              
3733          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3734              $self->{state} = BOGUS_DOCTYPE_STATE;
3735            } else {
3736              
3737              $self->{state} = BOGUS_MD_STATE;
3738            }
3739          ## Reconsume.          ## Reconsume.
3740          redo A;          redo A;
3741        }        }
# Line 2974  sub _get_next_token ($) { Line 3788  sub _get_next_token ($) {
3788        
3789          redo A;          redo A;
3790        } elsif ($self->{nc} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
           
3791          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3792            
3793          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3794          $self->{s_kwd} = '';            
3795              $self->{state} = DATA_STATE;
3796              $self->{s_kwd} = '';
3797              $self->{ct}->{quirks} = 1;
3798            } else {
3799              
3800              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3801            }
3802            
3803                    
3804      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3805        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2990  sub _get_next_token ($) { Line 3811  sub _get_next_token ($) {
3811        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3812      }      }
3813        
3814            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3815          redo A;          redo A;
3816        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3817            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3818              
3819              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3820              $self->{state} = DATA_STATE;
3821              $self->{s_kwd} = '';
3822              $self->{ct}->{quirks} = 1;
3823            } else {
3824              
3825              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3826              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3827            }
3828                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3829          ## reconsume          ## reconsume
   
         $self->{ct}->{quirks} = 1;  
3830          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3831          redo A;          redo A;
3832        } else {        } elsif ($self->{is_xml} and
3833                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3834                   $self->{nc} == 0x005B) { # [
3835                    
3836            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3837            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3838            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3839            $self->{in_subset} = 1;
3840            
3841        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3842          $self->{line_prev} = $self->{line};
3843          $self->{column_prev} = $self->{column};
3844          $self->{column}++;
3845          $self->{nc}
3846              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3847        } else {
3848          $self->{set_nc}->($self);
3849        }
3850      
3851            return  ($self->{ct}); # DOCTYPE
3852            redo A;
3853          } else {
3854          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
         $self->{ct}->{quirks} = 1;  
3855    
3856          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3857              
3858              $self->{ct}->{quirks} = 1;
3859              $self->{state} = BOGUS_DOCTYPE_STATE;
3860            } else {
3861              
3862              $self->{state} = BOGUS_MD_STATE;
3863            }
3864    
3865                    
3866      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3867        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3043  sub _get_next_token ($) { Line 3892  sub _get_next_token ($) {
3892        
3893          redo A;          redo A;
3894        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3895          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3896    
3897          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3898          $self->{s_kwd} = '';            
3899              $self->{state} = DATA_STATE;
3900              $self->{s_kwd} = '';
3901              $self->{ct}->{quirks} = 1;
3902            } else {
3903              
3904              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3905            }
3906    
3907                    
3908      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3909        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3059  sub _get_next_token ($) { Line 3915  sub _get_next_token ($) {
3915        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3916      }      }
3917        
3918            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3919          redo A;          redo A;
3920        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3921          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3922    
3923          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3924          $self->{s_kwd} = '';            
3925          ## reconsume            $self->{state} = DATA_STATE;
3926              $self->{s_kwd} = '';
3927          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
3928            } else {
3929              
3930              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3931            }
3932            
3933            ## Reconsume.
3934          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3935          redo A;          redo A;
3936        } else {        } else {
3937                    
3938          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3939          $self->{read_until}->($self->{ct}->{pubid}, q[">],          $self->{read_until}->($self->{ct}->{pubid}, q[">],
3940                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3941    
# Line 3114  sub _get_next_token ($) { Line 3970  sub _get_next_token ($) {
3970        
3971          redo A;          redo A;
3972        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3973          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3974    
3975          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3976          $self->{s_kwd} = '';            
3977              $self->{state} = DATA_STATE;
3978              $self->{s_kwd} = '';
3979              $self->{ct}->{quirks} = 1;
3980            } else {
3981              
3982              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3983            }
3984    
3985                    
3986      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3987        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3130  sub _get_next_token ($) { Line 3993  sub _get_next_token ($) {
3993        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3994      }      }
3995        
3996            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3997          redo A;          redo A;
3998        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3999          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
4000    
4001          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4002          $self->{s_kwd} = '';            
4003              $self->{state} = DATA_STATE;
4004              $self->{s_kwd} = '';
4005              $self->{ct}->{quirks} = 1;
4006            } else {
4007              
4008              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4009            }
4010          
4011          ## reconsume          ## reconsume
4012            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4013          redo A;          redo A;
4014        } else {        } else {
4015                    
4016          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4017          $self->{read_until}->($self->{ct}->{pubid}, q['>],          $self->{read_until}->($self->{ct}->{pubid}, q['>],
4018                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
4019    
# Line 3186  sub _get_next_token ($) { Line 4049  sub _get_next_token ($) {
4049          redo A;          redo A;
4050        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
4051                    
4052          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4053          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4054                    
4055      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3202  sub _get_next_token ($) { Line 4065  sub _get_next_token ($) {
4065          redo A;          redo A;
4066        } elsif ($self->{nc} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
4067                    
4068          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4069          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4070                    
4071      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3217  sub _get_next_token ($) { Line 4080  sub _get_next_token ($) {
4080        
4081          redo A;          redo A;
4082        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4083            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4084              if ($self->{is_xml}) {
4085                
4086                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4087              } else {
4088                
4089              }
4090              $self->{state} = DATA_STATE;
4091              $self->{s_kwd} = '';
4092            } else {
4093              if ($self->{ct}->{type} == NOTATION_TOKEN) {
4094                
4095              } else {
4096                
4097                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');            
4098              }
4099              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4100            }
4101                    
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
4102                    
4103      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4104        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3231  sub _get_next_token ($) { Line 4110  sub _get_next_token ($) {
4110        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4111      }      }
4112        
4113            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
4114          redo A;          redo A;
4115        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4116            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4117              
4118              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4119              
4120              $self->{state} = DATA_STATE;
4121              $self->{s_kwd} = '';
4122              $self->{ct}->{quirks} = 1;
4123            } else {
4124              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4125              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4126            }
4127                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
4128          ## reconsume          ## reconsume
4129            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4130          $self->{ct}->{quirks} = 1;          redo A;
4131          } elsif ($self->{is_xml} and
4132                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4133                   $self->{nc} == 0x005B) { # [
4134            
4135            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4136            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4137            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4138            $self->{in_subset} = 1;
4139            
4140        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4141          $self->{line_prev} = $self->{line};
4142          $self->{column_prev} = $self->{column};
4143          $self->{column}++;
4144          $self->{nc}
4145              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4146        } else {
4147          $self->{set_nc}->($self);
4148        }
4149      
4150          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
4151          redo A;          redo A;
4152        } else {        } else {
           
4153          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
         $self->{ct}->{quirks} = 1;  
4154    
4155          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4156              
4157              $self->{ct}->{quirks} = 1;
4158              $self->{state} = BOGUS_DOCTYPE_STATE;
4159            } else {
4160              
4161              $self->{state} = BOGUS_MD_STATE;
4162            }
4163    
4164                    
4165      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4166        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3315  sub _get_next_token ($) { Line 4223  sub _get_next_token ($) {
4223        
4224          redo A;          redo A;
4225        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
4226          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
4227                    
4228      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4229        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3331  sub _get_next_token ($) { Line 4236  sub _get_next_token ($) {
4236      }      }
4237        
4238    
4239          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4240          return  ($self->{ct}); # DOCTYPE            
4241              $self->{state} = DATA_STATE;
4242              $self->{s_kwd} = '';
4243              $self->{ct}->{quirks} = 1;
4244            } else {
4245              
4246              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4247            }
4248    
4249            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4250          redo A;          redo A;
4251        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4252            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4253              
4254              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4255              $self->{state} = DATA_STATE;
4256              $self->{s_kwd} = '';
4257              $self->{ct}->{quirks} = 1;
4258            } else {
4259              
4260              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4261              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4262            }
4263                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
4264          ## reconsume          ## reconsume
4265            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4266            redo A;
4267          } elsif ($self->{is_xml} and
4268                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4269                   $self->{nc} == 0x005B) { # [
4270            
4271            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4272    
4273          $self->{ct}->{quirks} = 1;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4274            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4275            $self->{in_subset} = 1;
4276            
4277        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4278          $self->{line_prev} = $self->{line};
4279          $self->{column_prev} = $self->{column};
4280          $self->{column}++;
4281          $self->{nc}
4282              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4283        } else {
4284          $self->{set_nc}->($self);
4285        }
4286      
4287          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
4288          redo A;          redo A;
4289        } else {        } else {
           
4290          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
         $self->{ct}->{quirks} = 1;  
4291    
4292          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4293                        
4294              $self->{ct}->{quirks} = 1;
4295              $self->{state} = BOGUS_DOCTYPE_STATE;
4296            } else {
4297              
4298              $self->{state} = BOGUS_MD_STATE;
4299            }
4300    
4301                    
4302      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4303        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3382  sub _get_next_token ($) { Line 4327  sub _get_next_token ($) {
4327      }      }
4328        
4329          redo A;          redo A;
4330        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
           
4331          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4332    
4333          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4334          $self->{s_kwd} = '';            
4335              $self->{state} = DATA_STATE;
4336              $self->{s_kwd} = '';
4337              $self->{ct}->{quirks} = 1;
4338            } else {
4339              
4340              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4341            }
4342            
4343                    
4344      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4345        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3399  sub _get_next_token ($) { Line 4351  sub _get_next_token ($) {
4351        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4352      }      }
4353        
4354            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4355          redo A;          redo A;
4356        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4357          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4358    
4359          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4360          $self->{s_kwd} = '';            
4361              $self->{state} = DATA_STATE;
4362              $self->{s_kwd} = '';
4363              $self->{ct}->{quirks} = 1;
4364            } else {
4365              
4366              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4367            }
4368            
4369          ## reconsume          ## reconsume
4370            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4371          redo A;          redo A;
4372        } else {        } else {
4373                    
4374          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4375          $self->{read_until}->($self->{ct}->{sysid}, q[">],          $self->{read_until}->($self->{ct}->{sysid}, q[">],
4376                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4377    
# Line 3453  sub _get_next_token ($) { Line 4405  sub _get_next_token ($) {
4405      }      }
4406        
4407          redo A;          redo A;
4408        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4409                    
4410          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4411    
# Line 3476  sub _get_next_token ($) { Line 4428  sub _get_next_token ($) {
4428    
4429          redo A;          redo A;
4430        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4431          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4432    
4433          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4434          $self->{s_kwd} = '';            
4435          ## reconsume            $self->{state} = DATA_STATE;
4436              $self->{s_kwd} = '';
4437          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
4438          return  ($self->{ct}); # DOCTYPE          } else {
4439              
4440              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4441            }
4442    
4443            ## reconsume
4444            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4445          redo A;          redo A;
4446        } else {        } else {
4447                    
4448          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4449          $self->{read_until}->($self->{ct}->{sysid}, q['>],          $self->{read_until}->($self->{ct}->{sysid}, q['>],
4450                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4451    
# Line 3510  sub _get_next_token ($) { Line 4465  sub _get_next_token ($) {
4465        }        }
4466      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4467        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4468                    if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4469          ## Stay in the state            
4470              $self->{state} = BEFORE_NDATA_STATE;
4471            } else {
4472              
4473              ## Stay in the state
4474            }
4475                    
4476      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4477        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3525  sub _get_next_token ($) { Line 4485  sub _get_next_token ($) {
4485        
4486          redo A;          redo A;
4487        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4488            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4489              
4490              $self->{state} = DATA_STATE;
4491              $self->{s_kwd} = '';
4492            } else {
4493              
4494              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4495            }
4496    
4497                    
4498          $self->{state} = DATA_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4499          $self->{s_kwd} = '';        $self->{line_prev} = $self->{line};
4500          $self->{column_prev} = $self->{column};
4501          $self->{column}++;
4502          $self->{nc}
4503              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4504        } else {
4505          $self->{set_nc}->($self);
4506        }
4507      
4508            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4509            redo A;
4510          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4511                   ($self->{nc} == 0x004E or # N
4512                    $self->{nc} == 0x006E)) { # n
4513            
4514            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4515            $self->{state} = NDATA_STATE;
4516            $self->{kwd} = chr $self->{nc};
4517                    
4518      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4519        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3539  sub _get_next_token ($) { Line 4525  sub _get_next_token ($) {
4525        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4526      }      }
4527        
4528            redo A;
4529          } elsif ($self->{nc} == -1) {
4530            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4531              
4532              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4533              $self->{state} = DATA_STATE;
4534              $self->{s_kwd} = '';
4535              $self->{ct}->{quirks} = 1;
4536            } else {
4537              
4538              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4539              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4540            }
4541    
4542            ## reconsume
4543            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4544            redo A;
4545          } elsif ($self->{is_xml} and
4546                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4547                   $self->{nc} == 0x005B) { # [
4548            
4549            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4550            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4551            $self->{in_subset} = 1;
4552            
4553        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4554          $self->{line_prev} = $self->{line};
4555          $self->{column_prev} = $self->{column};
4556          $self->{column}++;
4557          $self->{nc}
4558              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4559        } else {
4560          $self->{set_nc}->($self);
4561        }
4562      
4563          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4564            redo A;
4565          } else {
4566            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4567    
4568            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4569              
4570              #$self->{ct}->{quirks} = 1;
4571              $self->{state} = BOGUS_DOCTYPE_STATE;
4572            } else {
4573              
4574              $self->{state} = BOGUS_MD_STATE;
4575            }
4576    
4577            
4578        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4579          $self->{line_prev} = $self->{line};
4580          $self->{column_prev} = $self->{column};
4581          $self->{column}++;
4582          $self->{nc}
4583              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4584        } else {
4585          $self->{set_nc}->($self);
4586        }
4587      
4588            redo A;
4589          }
4590        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4591          if ($is_space->{$self->{nc}}) {
4592            
4593            ## Stay in the state.
4594            
4595        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4596          $self->{line_prev} = $self->{line};
4597          $self->{column_prev} = $self->{column};
4598          $self->{column}++;
4599          $self->{nc}
4600              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4601        } else {
4602          $self->{set_nc}->($self);
4603        }
4604      
4605            redo A;
4606          } elsif ($self->{nc} == 0x003E) { # >
4607            
4608            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4609            
4610        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4611          $self->{line_prev} = $self->{line};
4612          $self->{column_prev} = $self->{column};
4613          $self->{column}++;
4614          $self->{nc}
4615              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4616        } else {
4617          $self->{set_nc}->($self);
4618        }
4619      
4620            return  ($self->{ct}); # ENTITY
4621            redo A;
4622          } elsif ($self->{nc} == 0x004E or # N
4623                   $self->{nc} == 0x006E) { # n
4624            
4625            $self->{state} = NDATA_STATE;
4626            $self->{kwd} = chr $self->{nc};
4627            
4628        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4629          $self->{line_prev} = $self->{line};
4630          $self->{column_prev} = $self->{column};
4631          $self->{column}++;
4632          $self->{nc}
4633              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4634        } else {
4635          $self->{set_nc}->($self);
4636        }
4637      
4638          redo A;          redo A;
4639        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4640                    
4641          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4642          $self->{state} = DATA_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
         $self->{s_kwd} = '';  
4643          ## reconsume          ## reconsume
4644            return  ($self->{ct}); # ENTITY
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4645          redo A;          redo A;
4646        } else {        } else {
4647                    
4648          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4649          #$self->{ct}->{quirks} = 1;          $self->{state} = BOGUS_MD_STATE;
   
         $self->{state} = BOGUS_DOCTYPE_STATE;  
4650                    
4651      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4652        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3593  sub _get_next_token ($) { Line 4680  sub _get_next_token ($) {
4680          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4681    
4682          redo A;          redo A;
4683          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4684            
4685            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4686            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4687            $self->{in_subset} = 1;
4688            
4689        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4690          $self->{line_prev} = $self->{line};
4691          $self->{column_prev} = $self->{column};
4692          $self->{column}++;
4693          $self->{nc}
4694              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4695        } else {
4696          $self->{set_nc}->($self);
4697        }
4698      
4699            return  ($self->{ct}); # DOCTYPE
4700            redo A;
4701        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4702                    
4703          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 3605  sub _get_next_token ($) { Line 4710  sub _get_next_token ($) {
4710        } else {        } else {
4711                    
4712          my $s = '';          my $s = '';
4713          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
4714    
4715          ## Stay in the state          ## Stay in the state
4716                    
# Line 3625  sub _get_next_token ($) { Line 4730  sub _get_next_token ($) {
4730        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
4731        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4732        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
4733    
4734          ## XML5: "CDATA state".
4735                
4736        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4737                    
# Line 3643  sub _get_next_token ($) { Line 4750  sub _get_next_token ($) {
4750          redo A;          redo A;
4751        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4752          if ($self->{is_xml}) {          if ($self->{is_xml}) {
4753              
4754            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4755            } else {
4756              
4757          }          }
4758    
4759          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4760          $self->{s_kwd} = '';          $self->{s_kwd} = '';
4761                    ## Reconsume.
     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {  
       $self->{line_prev} = $self->{line};  
       $self->{column_prev} = $self->{column};  
       $self->{column}++;  
       $self->{nc}  
           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);  
     } else {  
       $self->{set_nc}->($self);  
     }  
     
4762          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
4763                        
4764            return  ($self->{ct}); # character            return  ($self->{ct}); # character
# Line 3691  sub _get_next_token ($) { Line 4791  sub _get_next_token ($) {
4791    
4792        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
4793      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4794          ## XML5: "CDATA bracket state".
4795    
4796        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4797                    
4798          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 3708  sub _get_next_token ($) { Line 4810  sub _get_next_token ($) {
4810          redo A;          redo A;
4811        } else {        } else {
4812                    
4813            ## XML5: If EOF, "]" is not appended and changed to the data state.
4814          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
4815          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4816          ## Reconsume.          ## Reconsume.
4817          redo A;          redo A;
4818        }        }
4819      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4820          ## XML5: "CDATA end state".
4821    
4822        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4823          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4824          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 3756  sub _get_next_token ($) { Line 4861  sub _get_next_token ($) {
4861                    
4862          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
4863          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
4864          ## Reconsume.          ## Reconsume. ## XML5: Emit.
4865          redo A;          redo A;
4866        }        }
4867      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 3765  sub _get_next_token ($) { Line 4870  sub _get_next_token ($) {
4870              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4871              $self->{entity_add} => 1,              $self->{entity_add} => 1,
4872            }->{$self->{nc}}) {            }->{$self->{nc}}) {
4873                    if ($self->{is_xml}) {
4874              
4875              $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4876                              line => $self->{line_prev},
4877                              column => $self->{column_prev}
4878                                  + ($self->{nc} == -1 ? 1 : 0));
4879            } else {
4880              
4881              ## No error
4882            }
4883          ## Don't consume          ## Don't consume
         ## No error  
4884          ## Return nothing.          ## Return nothing.
4885          #          #
4886        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
4887                    
4888          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
4889          $self->{s_kwd} = '#';          $self->{kwd} = '#';
4890                    
4891      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4892        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3786  sub _get_next_token ($) { Line 4899  sub _get_next_token ($) {
4899      }      }
4900        
4901          redo A;          redo A;
4902        } elsif ((0x0041 <= $self->{nc} and        } elsif ($self->{is_xml} or
4903                   (0x0041 <= $self->{nc} and
4904                  $self->{nc} <= 0x005A) or # A..Z                  $self->{nc} <= 0x005A) or # A..Z
4905                 (0x0061 <= $self->{nc} and                 (0x0061 <= $self->{nc} and
4906                  $self->{nc} <= 0x007A)) { # a..z                  $self->{nc} <= 0x007A)) { # a..z
4907                    
4908          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
4909          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
4910          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
4911          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
4912          $self->{entity__match} = 0;          $self->{entity__match} = 0;
4913                    
4914      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3840  sub _get_next_token ($) { Line 4954  sub _get_next_token ($) {
4954          redo A;          redo A;
4955        }        }
4956      } elsif ($self->{state} == ENTITY_HASH_STATE) {      } elsif ($self->{state} == ENTITY_HASH_STATE) {
4957        if ($self->{nc} == 0x0078 or # x        if ($self->{nc} == 0x0078) { # x
4958            $self->{nc} == 0x0058) { # X          
4959            $self->{state} = HEXREF_X_STATE;
4960            $self->{kwd} .= chr $self->{nc};
4961            
4962        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4963          $self->{line_prev} = $self->{line};
4964          $self->{column_prev} = $self->{column};
4965          $self->{column}++;
4966          $self->{nc}
4967              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4968        } else {
4969          $self->{set_nc}->($self);
4970        }
4971      
4972            redo A;
4973          } elsif ($self->{nc} == 0x0058) { # X
4974                    
4975            if ($self->{is_xml}) {
4976              $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4977            }
4978          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
4979          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4980                    
4981      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4982        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3861  sub _get_next_token ($) { Line 4993  sub _get_next_token ($) {
4993                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
4994                    
4995          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
4996          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
4997                    
4998      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4999        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3907  sub _get_next_token ($) { Line 5039  sub _get_next_token ($) {
5039        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
5040            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
5041                    
5042          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
5043          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
5044                    
5045          ## Stay in the state.          ## Stay in the state.
5046                    
# Line 3944  sub _get_next_token ($) { Line 5076  sub _get_next_token ($) {
5076          #          #
5077        }        }
5078    
5079        my $code = $self->{s_kwd};        my $code = $self->{kwd};
5080        my $l = $self->{line_prev};        my $l = $self->{line_prev};
5081        my $c = $self->{column_prev};        my $c = $self->{column_prev};
5082        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
5083              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5084              ($self->{is_xml} and $code == 0x0000)) {
5085                    
5086          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5087                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 3987  sub _get_next_token ($) { Line 5121  sub _get_next_token ($) {
5121          # 0..9, A..F, a..f          # 0..9, A..F, a..f
5122                    
5123          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
5124          $self->{s_kwd} = 0;          $self->{kwd} = 0;
5125          ## Reconsume.          ## Reconsume.
5126          redo A;          redo A;
5127        } else {        } else {
# Line 4005  sub _get_next_token ($) { Line 5139  sub _get_next_token ($) {
5139            $self->{s_kwd} = '';            $self->{s_kwd} = '';
5140            ## Reconsume.            ## Reconsume.
5141            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
5142                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
5143                      line => $self->{line_prev},                      line => $self->{line_prev},
5144                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
5145                     });                     });
5146            redo A;            redo A;
5147          } else {          } else {
5148                        
5149            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
5150            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
5151            $self->{s_kwd} = '';            $self->{s_kwd} = '';
5152            ## Reconsume.            ## Reconsume.
# Line 4023  sub _get_next_token ($) { Line 5157  sub _get_next_token ($) {
5157        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
5158          # 0..9          # 0..9
5159                    
5160          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
5161          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
5162          ## Stay in the state.          ## Stay in the state.
5163                    
5164      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4041  sub _get_next_token ($) { Line 5175  sub _get_next_token ($) {
5175        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
5176                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
5177                    
5178          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
5179          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
5180          ## Stay in the state.          ## Stay in the state.
5181                    
5182      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4059  sub _get_next_token ($) { Line 5193  sub _get_next_token ($) {
5193        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
5194                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
5195                    
5196          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
5197          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
5198          ## Stay in the state.          ## Stay in the state.
5199                    
5200      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4097  sub _get_next_token ($) { Line 5231  sub _get_next_token ($) {
5231          #          #
5232        }        }
5233    
5234        my $code = $self->{s_kwd};        my $code = $self->{kwd};
5235        my $l = $self->{line_prev};        my $l = $self->{line_prev};
5236        my $c = $self->{column_prev};        my $c = $self->{column_prev};
5237        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
5238              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5239              ($self->{is_xml} and $code == 0x0000)) {
5240                    
5241          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5242                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 4134  sub _get_next_token ($) { Line 5270  sub _get_next_token ($) {
5270          redo A;          redo A;
5271        }        }
5272      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
5273        if (length $self->{s_kwd} < 30 and        if ((0x0041 <= $self->{nc} and # a
5274            ## NOTE: Some number greater than the maximum length of entity name             $self->{nc} <= 0x005A) or # x
5275            ((0x0041 <= $self->{nc} and # a            (0x0061 <= $self->{nc} and # a
5276              $self->{nc} <= 0x005A) or # x             $self->{nc} <= 0x007A) or # z
5277             (0x0061 <= $self->{nc} and # a            (0x0030 <= $self->{nc} and # 0
5278              $self->{nc} <= 0x007A) or # z             $self->{nc} <= 0x0039) or # 9
5279             (0x0030 <= $self->{nc} and # 0            $self->{nc} == 0x003B or # ;
5280              $self->{nc} <= 0x0039) or # 9            ($self->{is_xml} and
5281             $self->{nc} == 0x003B)) { # ;             not ($is_space->{$self->{nc}} or
5282                    {
5283                      0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5284                      $self->{entity_add} => 1,
5285                    }->{$self->{nc}}))) {
5286          our $EntityChar;          our $EntityChar;
5287          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
5288          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}} or
5289                $self->{ge}->{$self->{kwd}}) {
5290            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
5291                            if (defined $self->{ge}->{$self->{kwd}}) {
5292              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};                if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5293                    
5294                    $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5295                  } else {
5296                    if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5297                      
5298                      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5299                                      value => $self->{kwd});
5300                    } else {
5301                      
5302                    }
5303                    $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5304                  }
5305                } else {
5306                  if ($self->{is_xml}) {
5307                    
5308                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5309                                    value => $self->{kwd},
5310                                    level => {
5311                                              'amp;' => $self->{level}->{warn},
5312                                              'quot;' => $self->{level}->{warn},
5313                                              'lt;' => $self->{level}->{warn},
5314                                              'gt;' => $self->{level}->{warn},
5315                                              'apos;' => $self->{level}->{warn},
5316                                             }->{$self->{kwd}} ||
5317                                             $self->{level}->{must});
5318                  } else {
5319                    
5320                  }
5321                  $self->{entity__value} = $EntityChar->{$self->{kwd}};
5322                }
5323              $self->{entity__match} = 1;              $self->{entity__match} = 1;
5324                            
5325      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4164  sub _get_next_token ($) { Line 5335  sub _get_next_token ($) {
5335              #              #
5336            } else {            } else {
5337                            
5338              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
5339              $self->{entity__match} = -1;              $self->{entity__match} = -1;
5340              ## Stay in the state.              ## Stay in the state.
5341                            
# Line 4212  sub _get_next_token ($) { Line 5383  sub _get_next_token ($) {
5383          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
5384              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
5385                        
5386            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
5387            #            #
5388          } else {          } else {
5389                        
# Line 4224  sub _get_next_token ($) { Line 5395  sub _get_next_token ($) {
5395                    
5396          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5397                          line => $self->{line_prev},                          line => $self->{line_prev},
5398                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
5399          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
5400          #          #
5401        }        }
5402        
# Line 4248  sub _get_next_token ($) { Line 5419  sub _get_next_token ($) {
5419                    data => $data,                    data => $data,
5420                    has_reference => $has_ref,                    has_reference => $has_ref,
5421                    line => $self->{line_prev},                    line => $self->{line_prev},
5422                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
5423                   });                   });
5424          redo A;          redo A;
5425        } else {        } else {
# Line 4260  sub _get_next_token ($) { Line 5431  sub _get_next_token ($) {
5431          ## Reconsume.          ## Reconsume.
5432          redo A;          redo A;
5433        }        }
5434    
5435        ## XML-only states
5436    
5437        } elsif ($self->{state} == PI_STATE) {
5438          ## XML5: "Pi state" and "DOCTYPE pi state".
5439    
5440          if ($is_space->{$self->{nc}} or
5441              $self->{nc} == 0x003F or # ?
5442              $self->{nc} == -1) {
5443            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5444            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
5445            ## "DOCTYPE pi state": Parse error, switch to the "data
5446            ## state".
5447            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5448                            line => $self->{line_prev},
5449                            column => $self->{column_prev}
5450                                - 1 * ($self->{nc} != -1));
5451            $self->{state} = BOGUS_COMMENT_STATE;
5452            ## Reconsume.
5453            $self->{ct} = {type => COMMENT_TOKEN,
5454                           data => '?',
5455                           line => $self->{line_prev},
5456                           column => $self->{column_prev}
5457                               - 1 * ($self->{nc} != -1),
5458                          };
5459            redo A;
5460          } else {
5461            ## XML5: "DOCTYPE pi state": Stay in the state.
5462            $self->{ct} = {type => PI_TOKEN,
5463                           target => chr $self->{nc},
5464                           data => '',
5465                           line => $self->{line_prev},
5466                           column => $self->{column_prev} - 1,
5467                          };
5468            $self->{state} = PI_TARGET_STATE;
5469            
5470        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5471          $self->{line_prev} = $self->{line};
5472          $self->{column_prev} = $self->{column};
5473          $self->{column}++;
5474          $self->{nc}
5475              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5476        } else {
5477          $self->{set_nc}->($self);
5478        }
5479      
5480            redo A;
5481          }
5482        } elsif ($self->{state} == PI_TARGET_STATE) {
5483          if ($is_space->{$self->{nc}}) {
5484            $self->{state} = PI_TARGET_AFTER_STATE;
5485            
5486        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5487          $self->{line_prev} = $self->{line};
5488          $self->{column_prev} = $self->{column};
5489          $self->{column}++;
5490          $self->{nc}
5491              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5492        } else {
5493          $self->{set_nc}->($self);
5494        }
5495      
5496            redo A;
5497          } elsif ($self->{nc} == -1) {
5498            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5499            if ($self->{in_subset}) {
5500              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5501            } else {
5502              $self->{state} = DATA_STATE;
5503              $self->{s_kwd} = '';
5504            }
5505            ## Reconsume.
5506            return  ($self->{ct}); # pi
5507            redo A;
5508          } elsif ($self->{nc} == 0x003F) { # ?
5509            $self->{state} = PI_AFTER_STATE;
5510            
5511        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5512          $self->{line_prev} = $self->{line};
5513          $self->{column_prev} = $self->{column};
5514          $self->{column}++;
5515          $self->{nc}
5516              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5517        } else {
5518          $self->{set_nc}->($self);
5519        }
5520      
5521            redo A;
5522          } else {
5523            ## XML5: typo ("tag name" -> "target")
5524            $self->{ct}->{target} .= chr $self->{nc}; # pi
5525            
5526        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5527          $self->{line_prev} = $self->{line};
5528          $self->{column_prev} = $self->{column};
5529          $self->{column}++;
5530          $self->{nc}
5531              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5532        } else {
5533          $self->{set_nc}->($self);
5534        }
5535      
5536            redo A;
5537          }
5538        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5539          if ($is_space->{$self->{nc}}) {
5540            ## Stay in the state.
5541            
5542        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5543          $self->{line_prev} = $self->{line};
5544          $self->{column_prev} = $self->{column};
5545          $self->{column}++;
5546          $self->{nc}
5547              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5548        } else {
5549          $self->{set_nc}->($self);
5550        }
5551      
5552            redo A;
5553          } else {
5554            $self->{state} = PI_DATA_STATE;
5555            ## Reprocess.
5556            redo A;
5557          }
5558        } elsif ($self->{state} == PI_DATA_STATE) {
5559          if ($self->{nc} == 0x003F) { # ?
5560            $self->{state} = PI_DATA_AFTER_STATE;
5561            
5562        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5563          $self->{line_prev} = $self->{line};
5564          $self->{column_prev} = $self->{column};
5565          $self->{column}++;
5566          $self->{nc}
5567              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5568        } else {
5569          $self->{set_nc}->($self);
5570        }
5571      
5572            redo A;
5573          } elsif ($self->{nc} == -1) {
5574            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5575            if ($self->{in_subset}) {
5576              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5577            } else {
5578              $self->{state} = DATA_STATE;
5579              $self->{s_kwd} = '';
5580            }
5581            ## Reprocess.
5582            return  ($self->{ct}); # pi
5583            redo A;
5584          } else {
5585            $self->{ct}->{data} .= chr $self->{nc}; # pi
5586            $self->{read_until}->($self->{ct}->{data}, q[?],
5587                                  length $self->{ct}->{data});
5588            ## Stay in the state.
5589            
5590        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5591          $self->{line_prev} = $self->{line};
5592          $self->{column_prev} = $self->{column};
5593          $self->{column}++;
5594          $self->{nc}
5595              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5596        } else {
5597          $self->{set_nc}->($self);
5598        }
5599      
5600            ## Reprocess.
5601            redo A;
5602          }
5603        } elsif ($self->{state} == PI_AFTER_STATE) {
5604          ## XML5: Part of "Pi after state".
5605    
5606          if ($self->{nc} == 0x003E) { # >
5607            if ($self->{in_subset}) {
5608              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5609            } else {
5610              $self->{state} = DATA_STATE;
5611              $self->{s_kwd} = '';
5612            }
5613            
5614        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5615          $self->{line_prev} = $self->{line};
5616          $self->{column_prev} = $self->{column};
5617          $self->{column}++;
5618          $self->{nc}
5619              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5620        } else {
5621          $self->{set_nc}->($self);
5622        }
5623      
5624            return  ($self->{ct}); # pi
5625            redo A;
5626          } elsif ($self->{nc} == 0x003F) { # ?
5627            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5628                            line => $self->{line_prev},
5629                            column => $self->{column_prev}); ## XML5: no error
5630            $self->{ct}->{data} .= '?';
5631            $self->{state} = PI_DATA_AFTER_STATE;
5632            
5633        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5634          $self->{line_prev} = $self->{line};
5635          $self->{column_prev} = $self->{column};
5636          $self->{column}++;
5637          $self->{nc}
5638              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5639        } else {
5640          $self->{set_nc}->($self);
5641        }
5642      
5643            redo A;
5644          } else {
5645            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5646                            line => $self->{line_prev},
5647                            column => $self->{column_prev}
5648                                + 1 * ($self->{nc} == -1)); ## XML5: no error
5649            $self->{ct}->{data} .= '?'; ## XML5: not appended
5650            $self->{state} = PI_DATA_STATE;
5651            ## Reprocess.
5652            redo A;
5653          }
5654        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5655          ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5656    
5657          if ($self->{nc} == 0x003E) { # >
5658            if ($self->{in_subset}) {
5659              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5660            } else {
5661              $self->{state} = DATA_STATE;
5662              $self->{s_kwd} = '';
5663            }
5664            
5665        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5666          $self->{line_prev} = $self->{line};
5667          $self->{column_prev} = $self->{column};
5668          $self->{column}++;
5669          $self->{nc}
5670              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5671        } else {
5672          $self->{set_nc}->($self);
5673        }
5674      
5675            return  ($self->{ct}); # pi
5676            redo A;
5677          } elsif ($self->{nc} == 0x003F) { # ?
5678            $self->{ct}->{data} .= '?';
5679            ## Stay in the state.
5680            
5681        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5682          $self->{line_prev} = $self->{line};
5683          $self->{column_prev} = $self->{column};
5684          $self->{column}++;
5685          $self->{nc}
5686              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5687        } else {
5688          $self->{set_nc}->($self);
5689        }
5690      
5691            redo A;
5692          } else {
5693            $self->{ct}->{data} .= '?'; ## XML5: not appended
5694            $self->{state} = PI_DATA_STATE;
5695            ## Reprocess.
5696            redo A;
5697          }
5698    
5699        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5700          if ($self->{nc} == 0x003C) { # <
5701            $self->{state} = DOCTYPE_TAG_STATE;
5702            
5703        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5704          $self->{line_prev} = $self->{line};
5705          $self->{column_prev} = $self->{column};
5706          $self->{column}++;
5707          $self->{nc}
5708              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5709        } else {
5710          $self->{set_nc}->($self);
5711        }
5712      
5713            redo A;
5714          } elsif ($self->{nc} == 0x0025) { # %
5715            ## XML5: Not defined yet.
5716    
5717            ## TODO:
5718    
5719            if (not $self->{stop_processing} and
5720                not $self->{document}->xml_standalone) {
5721              $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5722                              level => $self->{level}->{info});
5723              $self->{stop_processing} = 1;
5724            }
5725    
5726            
5727        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5728          $self->{line_prev} = $self->{line};
5729          $self->{column_prev} = $self->{column};
5730          $self->{column}++;
5731          $self->{nc}
5732              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5733        } else {
5734          $self->{set_nc}->($self);
5735        }
5736      
5737            redo A;
5738          } elsif ($self->{nc} == 0x005D) { # ]
5739            delete $self->{in_subset};
5740            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5741            
5742        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5743          $self->{line_prev} = $self->{line};
5744          $self->{column_prev} = $self->{column};
5745          $self->{column}++;
5746          $self->{nc}
5747              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5748        } else {
5749          $self->{set_nc}->($self);
5750        }
5751      
5752            redo A;
5753          } elsif ($is_space->{$self->{nc}}) {
5754            ## Stay in the state.
5755            
5756        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5757          $self->{line_prev} = $self->{line};
5758          $self->{column_prev} = $self->{column};
5759          $self->{column}++;
5760          $self->{nc}
5761              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5762        } else {
5763          $self->{set_nc}->($self);
5764        }
5765      
5766            redo A;
5767          } elsif ($self->{nc} == -1) {
5768            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5769            delete $self->{in_subset};
5770            $self->{state} = DATA_STATE;
5771            $self->{s_kwd} = '';
5772            ## Reconsume.
5773            return  ({type => END_OF_DOCTYPE_TOKEN});
5774            redo A;
5775          } else {
5776            unless ($self->{internal_subset_tainted}) {
5777              ## XML5: No parse error.
5778              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5779              $self->{internal_subset_tainted} = 1;
5780            }
5781            ## Stay in the state.
5782            
5783        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5784          $self->{line_prev} = $self->{line};
5785          $self->{column_prev} = $self->{column};
5786          $self->{column}++;
5787          $self->{nc}
5788              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5789        } else {
5790          $self->{set_nc}->($self);
5791        }
5792      
5793            redo A;
5794          }
5795        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5796          if ($self->{nc} == 0x003E) { # >
5797            $self->{state} = DATA_STATE;
5798            $self->{s_kwd} = '';
5799            
5800        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5801          $self->{line_prev} = $self->{line};
5802          $self->{column_prev} = $self->{column};
5803          $self->{column}++;
5804          $self->{nc}
5805              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5806        } else {
5807          $self->{set_nc}->($self);
5808        }
5809      
5810            return  ({type => END_OF_DOCTYPE_TOKEN});
5811            redo A;
5812          } elsif ($self->{nc} == -1) {
5813            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5814            $self->{state} = DATA_STATE;
5815            $self->{s_kwd} = '';
5816            ## Reconsume.
5817            return  ({type => END_OF_DOCTYPE_TOKEN});
5818            redo A;
5819          } else {
5820            ## XML5: No parse error and stay in the state.
5821            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5822    
5823            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5824            
5825        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5826          $self->{line_prev} = $self->{line};
5827          $self->{column_prev} = $self->{column};
5828          $self->{column}++;
5829          $self->{nc}
5830              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5831        } else {
5832          $self->{set_nc}->($self);
5833        }
5834      
5835            redo A;
5836          }
5837        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5838          if ($self->{nc} == 0x003E) { # >
5839            $self->{state} = DATA_STATE;
5840            $self->{s_kwd} = '';
5841            
5842        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5843          $self->{line_prev} = $self->{line};
5844          $self->{column_prev} = $self->{column};
5845          $self->{column}++;
5846          $self->{nc}
5847              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5848        } else {
5849          $self->{set_nc}->($self);
5850        }
5851      
5852            return  ({type => END_OF_DOCTYPE_TOKEN});
5853            redo A;
5854          } elsif ($self->{nc} == -1) {
5855            $self->{state} = DATA_STATE;
5856            $self->{s_kwd} = '';
5857            ## Reconsume.
5858            return  ({type => END_OF_DOCTYPE_TOKEN});
5859            redo A;
5860          } else {
5861            ## Stay in the state.
5862            
5863        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5864          $self->{line_prev} = $self->{line};
5865          $self->{column_prev} = $self->{column};
5866          $self->{column}++;
5867          $self->{nc}
5868              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5869        } else {
5870          $self->{set_nc}->($self);
5871        }
5872      
5873            redo A;
5874          }
5875        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5876          if ($self->{nc} == 0x0021) { # !
5877            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5878            
5879        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5880          $self->{line_prev} = $self->{line};
5881          $self->{column_prev} = $self->{column};
5882          $self->{column}++;
5883          $self->{nc}
5884              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5885        } else {
5886          $self->{set_nc}->($self);
5887        }
5888      
5889            redo A;
5890          } elsif ($self->{nc} == 0x003F) { # ?
5891            $self->{state} = PI_STATE;
5892            
5893        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5894          $self->{line_prev} = $self->{line};
5895          $self->{column_prev} = $self->{column};
5896          $self->{column}++;
5897          $self->{nc}
5898              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5899        } else {
5900          $self->{set_nc}->($self);
5901        }
5902      
5903            redo A;
5904          } elsif ($self->{nc} == -1) {
5905            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5906            $self->{state} = DATA_STATE;
5907            $self->{s_kwd} = '';
5908            ## Reconsume.
5909            redo A;
5910          } else {
5911            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5912                            line => $self->{line_prev},
5913                            column => $self->{column_prev});
5914            $self->{state} = BOGUS_COMMENT_STATE;
5915            $self->{ct} = {type => COMMENT_TOKEN,
5916                           data => '',
5917                          }; ## NOTE: Will be discarded.
5918            
5919        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5920          $self->{line_prev} = $self->{line};
5921          $self->{column_prev} = $self->{column};
5922          $self->{column}++;
5923          $self->{nc}
5924              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5925        } else {
5926          $self->{set_nc}->($self);
5927        }
5928      
5929            redo A;
5930          }
5931        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5932          ## XML5: "DOCTYPE markup declaration state".
5933          
5934          if ($self->{nc} == 0x002D) { # -
5935            $self->{state} = MD_HYPHEN_STATE;
5936            
5937        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5938          $self->{line_prev} = $self->{line};
5939          $self->{column_prev} = $self->{column};
5940          $self->{column}++;
5941          $self->{nc}
5942              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5943        } else {
5944          $self->{set_nc}->($self);
5945        }
5946      
5947            redo A;
5948          } elsif ($self->{nc} == 0x0045 or # E
5949                   $self->{nc} == 0x0065) { # e
5950            $self->{state} = MD_E_STATE;
5951            $self->{kwd} = chr $self->{nc};
5952            
5953        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5954          $self->{line_prev} = $self->{line};
5955          $self->{column_prev} = $self->{column};
5956          $self->{column}++;
5957          $self->{nc}
5958              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5959        } else {
5960          $self->{set_nc}->($self);
5961        }
5962      
5963            redo A;
5964          } elsif ($self->{nc} == 0x0041 or # A
5965                   $self->{nc} == 0x0061) { # a
5966            $self->{state} = MD_ATTLIST_STATE;
5967            $self->{kwd} = chr $self->{nc};
5968            
5969        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5970          $self->{line_prev} = $self->{line};
5971          $self->{column_prev} = $self->{column};
5972          $self->{column}++;
5973          $self->{nc}
5974              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5975        } else {
5976          $self->{set_nc}->($self);
5977        }
5978      
5979            redo A;
5980          } elsif ($self->{nc} == 0x004E or # N
5981                   $self->{nc} == 0x006E) { # n
5982            $self->{state} = MD_NOTATION_STATE;
5983            $self->{kwd} = chr $self->{nc};
5984            
5985        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5986          $self->{line_prev} = $self->{line};
5987          $self->{column_prev} = $self->{column};
5988          $self->{column}++;
5989          $self->{nc}
5990              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5991        } else {
5992          $self->{set_nc}->($self);
5993        }
5994      
5995            redo A;
5996          } else {
5997            #
5998          }
5999          
6000          ## XML5: No parse error.
6001          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6002                          line => $self->{line_prev},
6003                          column => $self->{column_prev} - 1);
6004          ## Reconsume.
6005          $self->{state} = BOGUS_COMMENT_STATE;
6006          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
6007          redo A;
6008        } elsif ($self->{state} == MD_E_STATE) {
6009          if ($self->{nc} == 0x004E or # N
6010              $self->{nc} == 0x006E) { # n
6011            $self->{state} = MD_ENTITY_STATE;
6012            $self->{kwd} .= chr $self->{nc};
6013            
6014        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6015          $self->{line_prev} = $self->{line};
6016          $self->{column_prev} = $self->{column};
6017          $self->{column}++;
6018          $self->{nc}
6019              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6020        } else {
6021          $self->{set_nc}->($self);
6022        }
6023      
6024            redo A;
6025          } elsif ($self->{nc} == 0x004C or # L
6026                   $self->{nc} == 0x006C) { # l
6027            ## XML5: <!ELEMENT> not supported.
6028            $self->{state} = MD_ELEMENT_STATE;
6029            $self->{kwd} .= chr $self->{nc};
6030            
6031        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6032          $self->{line_prev} = $self->{line};
6033          $self->{column_prev} = $self->{column};
6034          $self->{column}++;
6035          $self->{nc}
6036              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6037        } else {
6038          $self->{set_nc}->($self);
6039        }
6040      
6041            redo A;
6042          } else {
6043            ## XML5: No parse error.
6044            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6045                            line => $self->{line_prev},
6046                            column => $self->{column_prev} - 2
6047                                + 1 * ($self->{nc} == -1));
6048            ## Reconsume.
6049            $self->{state} = BOGUS_COMMENT_STATE;
6050            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6051            redo A;
6052          }
6053        } elsif ($self->{state} == MD_ENTITY_STATE) {
6054          if ($self->{nc} == [
6055                undef,
6056                undef,
6057                0x0054, # T
6058                0x0049, # I
6059                0x0054, # T
6060              ]->[length $self->{kwd}] or
6061              $self->{nc} == [
6062                undef,
6063                undef,
6064                0x0074, # t
6065                0x0069, # i
6066                0x0074, # t
6067              ]->[length $self->{kwd}]) {
6068            ## Stay in the state.
6069            $self->{kwd} .= chr $self->{nc};
6070            
6071        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6072          $self->{line_prev} = $self->{line};
6073          $self->{column_prev} = $self->{column};
6074          $self->{column}++;
6075          $self->{nc}
6076              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6077        } else {
6078          $self->{set_nc}->($self);
6079        }
6080      
6081            redo A;
6082          } elsif ((length $self->{kwd}) == 5 and
6083                   ($self->{nc} == 0x0059 or # Y
6084                    $self->{nc} == 0x0079)) { # y
6085            if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
6086              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6087                              text => 'ENTITY',
6088                              line => $self->{line_prev},
6089                              column => $self->{column_prev} - 4);
6090            }
6091            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
6092                           line => $self->{line_prev},
6093                           column => $self->{column_prev} - 6};
6094            $self->{state} = DOCTYPE_MD_STATE;
6095            
6096        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6097          $self->{line_prev} = $self->{line};
6098          $self->{column_prev} = $self->{column};
6099          $self->{column}++;
6100          $self->{nc}
6101              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6102        } else {
6103          $self->{set_nc}->($self);
6104        }
6105      
6106            redo A;
6107          } else {
6108            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6109                            line => $self->{line_prev},
6110                            column => $self->{column_prev} - 1
6111                                - (length $self->{kwd})
6112                                + 1 * ($self->{nc} == -1));
6113            $self->{state} = BOGUS_COMMENT_STATE;
6114            ## Reconsume.
6115            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6116            redo A;
6117          }
6118        } elsif ($self->{state} == MD_ELEMENT_STATE) {
6119          if ($self->{nc} == [
6120               undef,
6121               undef,
6122               0x0045, # E
6123               0x004D, # M
6124               0x0045, # E
6125               0x004E, # N
6126              ]->[length $self->{kwd}] or
6127              $self->{nc} == [
6128               undef,
6129               undef,
6130               0x0065, # e
6131               0x006D, # m
6132               0x0065, # e
6133               0x006E, # n
6134              ]->[length $self->{kwd}]) {
6135            ## Stay in the state.
6136            $self->{kwd} .= chr $self->{nc};
6137            
6138        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6139          $self->{line_prev} = $self->{line};
6140          $self->{column_prev} = $self->{column};
6141          $self->{column}++;
6142          $self->{nc}
6143              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6144        } else {
6145          $self->{set_nc}->($self);
6146        }
6147      
6148            redo A;
6149          } elsif ((length $self->{kwd}) == 6 and
6150                   ($self->{nc} == 0x0054 or # T
6151                    $self->{nc} == 0x0074)) { # t
6152            if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
6153              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6154                              text => 'ELEMENT',
6155                              line => $self->{line_prev},
6156                              column => $self->{column_prev} - 5);
6157            }
6158            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6159                           line => $self->{line_prev},
6160                           column => $self->{column_prev} - 7};
6161            $self->{state} = DOCTYPE_MD_STATE;
6162            
6163        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6164          $self->{line_prev} = $self->{line};
6165          $self->{column_prev} = $self->{column};
6166          $self->{column}++;
6167          $self->{nc}
6168              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6169        } else {
6170          $self->{set_nc}->($self);
6171        }
6172      
6173            redo A;
6174          } else {
6175            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6176                            line => $self->{line_prev},
6177                            column => $self->{column_prev} - 1
6178                                - (length $self->{kwd})
6179                                + 1 * ($self->{nc} == -1));
6180            $self->{state} = BOGUS_COMMENT_STATE;
6181            ## Reconsume.
6182            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6183            redo A;
6184          }
6185        } elsif ($self->{state} == MD_ATTLIST_STATE) {
6186          if ($self->{nc} == [
6187               undef,
6188               0x0054, # T
6189               0x0054, # T
6190               0x004C, # L
6191               0x0049, # I
6192               0x0053, # S
6193              ]->[length $self->{kwd}] or
6194              $self->{nc} == [
6195               undef,
6196               0x0074, # t
6197               0x0074, # t
6198               0x006C, # l
6199               0x0069, # i
6200               0x0073, # s
6201              ]->[length $self->{kwd}]) {
6202            ## Stay in the state.
6203            $self->{kwd} .= chr $self->{nc};
6204            
6205        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6206          $self->{line_prev} = $self->{line};
6207          $self->{column_prev} = $self->{column};
6208          $self->{column}++;
6209          $self->{nc}
6210              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6211        } else {
6212          $self->{set_nc}->($self);
6213        }
6214      
6215            redo A;
6216          } elsif ((length $self->{kwd}) == 6 and
6217                   ($self->{nc} == 0x0054 or # T
6218                    $self->{nc} == 0x0074)) { # t
6219            if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6220              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6221                              text => 'ATTLIST',
6222                              line => $self->{line_prev},
6223                              column => $self->{column_prev} - 5);
6224            }
6225            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6226                           attrdefs => [],
6227                           line => $self->{line_prev},
6228                           column => $self->{column_prev} - 7};
6229            $self->{state} = DOCTYPE_MD_STATE;
6230            
6231        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6232          $self->{line_prev} = $self->{line};
6233          $self->{column_prev} = $self->{column};
6234          $self->{column}++;
6235          $self->{nc}
6236              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6237        } else {
6238          $self->{set_nc}->($self);
6239        }
6240      
6241            redo A;
6242          } else {
6243            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6244                            line => $self->{line_prev},
6245                            column => $self->{column_prev} - 1
6246                                 - (length $self->{kwd})
6247                                 + 1 * ($self->{nc} == -1));
6248            $self->{state} = BOGUS_COMMENT_STATE;
6249            ## Reconsume.
6250            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6251            redo A;
6252          }
6253        } elsif ($self->{state} == MD_NOTATION_STATE) {
6254          if ($self->{nc} == [
6255               undef,
6256               0x004F, # O
6257               0x0054, # T
6258               0x0041, # A
6259               0x0054, # T
6260               0x0049, # I
6261               0x004F, # O
6262              ]->[length $self->{kwd}] or
6263              $self->{nc} == [
6264               undef,
6265               0x006F, # o
6266               0x0074, # t
6267               0x0061, # a
6268               0x0074, # t
6269               0x0069, # i
6270               0x006F, # o
6271              ]->[length $self->{kwd}]) {
6272            ## Stay in the state.
6273            $self->{kwd} .= chr $self->{nc};
6274            
6275        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6276          $self->{line_prev} = $self->{line};
6277          $self->{column_prev} = $self->{column};
6278          $self->{column}++;
6279          $self->{nc}
6280              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6281        } else {
6282          $self->{set_nc}->($self);
6283        }
6284      
6285            redo A;
6286          } elsif ((length $self->{kwd}) == 7 and
6287                   ($self->{nc} == 0x004E or # N
6288                    $self->{nc} == 0x006E)) { # n
6289            if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6290              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6291                              text => 'NOTATION',
6292                              line => $self->{line_prev},
6293                              column => $self->{column_prev} - 6);
6294            }
6295            $self->{ct} = {type => NOTATION_TOKEN, name => '',
6296                           line => $self->{line_prev},
6297                           column => $self->{column_prev} - 8};
6298            $self->{state} = DOCTYPE_MD_STATE;
6299            
6300        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6301          $self->{line_prev} = $self->{line};
6302          $self->{column_prev} = $self->{column};
6303          $self->{column}++;
6304          $self->{nc}
6305              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6306        } else {
6307          $self->{set_nc}->($self);
6308        }
6309      
6310            redo A;
6311          } else {
6312            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6313                            line => $self->{line_prev},
6314                            column => $self->{column_prev} - 1
6315                                - (length $self->{kwd})
6316                                + 1 * ($self->{nc} == -1));
6317            $self->{state} = BOGUS_COMMENT_STATE;
6318            ## Reconsume.
6319            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6320            redo A;
6321          }
6322        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6323          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6324          ## "DOCTYPE NOTATION state".
6325    
6326          if ($is_space->{$self->{nc}}) {
6327            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6328            $self->{state} = BEFORE_MD_NAME_STATE;
6329            
6330        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6331          $self->{line_prev} = $self->{line};
6332          $self->{column_prev} = $self->{column};
6333          $self->{column}++;
6334          $self->{nc}
6335              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6336        } else {
6337          $self->{set_nc}->($self);
6338        }
6339      
6340            redo A;
6341          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6342                   $self->{nc} == 0x0025) { # %
6343            ## XML5: Switch to the "DOCTYPE bogus comment state".
6344            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6345            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6346            
6347        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6348          $self->{line_prev} = $self->{line};
6349          $self->{column_prev} = $self->{column};
6350          $self->{column}++;
6351          $self->{nc}
6352              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6353        } else {
6354          $self->{set_nc}->($self);
6355        }
6356      
6357            redo A;
6358          } elsif ($self->{nc} == -1) {
6359            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6360            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6361            ## Reconsume.
6362            redo A;
6363          } elsif ($self->{nc} == 0x003E) { # >
6364            ## XML5: Switch to the "DOCTYPE bogus comment state".
6365            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6366            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6367            
6368        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6369          $self->{line_prev} = $self->{line};
6370          $self->{column_prev} = $self->{column};
6371          $self->{column}++;
6372          $self->{nc}
6373              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6374        } else {
6375          $self->{set_nc}->($self);
6376        }
6377      
6378            redo A;
6379          } else {
6380            ## XML5: Switch to the "DOCTYPE bogus comment state".
6381            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6382            $self->{state} = BEFORE_MD_NAME_STATE;
6383            redo A;
6384          }
6385        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6386          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6387          ## before state", "DOCTYPE ATTLIST name before state".
6388    
6389          if ($is_space->{$self->{nc}}) {
6390            ## Stay in the state.
6391            
6392        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6393          $self->{line_prev} = $self->{line};
6394          $self->{column_prev} = $self->{column};
6395          $self->{column}++;
6396          $self->{nc}
6397              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6398        } else {
6399          $self->{set_nc}->($self);
6400        }
6401      
6402            redo A;
6403          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6404                   $self->{nc} == 0x0025) { # %
6405            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6406            
6407        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6408          $self->{line_prev} = $self->{line};
6409          $self->{column_prev} = $self->{column};
6410          $self->{column}++;
6411          $self->{nc}
6412              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6413        } else {
6414          $self->{set_nc}->($self);
6415        }
6416      
6417            redo A;
6418          } elsif ($self->{nc} == 0x003E) { # >
6419            ## XML5: Same as "Anything else".
6420            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6421            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6422            
6423        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6424          $self->{line_prev} = $self->{line};
6425          $self->{column_prev} = $self->{column};
6426          $self->{column}++;
6427          $self->{nc}
6428              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6429        } else {
6430          $self->{set_nc}->($self);
6431        }
6432      
6433            redo A;
6434          } elsif ($self->{nc} == -1) {
6435            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6436            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6437            ## Reconsume.
6438            redo A;
6439          } else {
6440            ## XML5: [ATTLIST] Not defined yet.
6441            $self->{ct}->{name} .= chr $self->{nc};
6442            $self->{state} = MD_NAME_STATE;
6443            
6444        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6445          $self->{line_prev} = $self->{line};
6446          $self->{column_prev} = $self->{column};
6447          $self->{column}++;
6448          $self->{nc}
6449              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6450        } else {
6451          $self->{set_nc}->($self);
6452        }
6453      
6454            redo A;
6455          }
6456        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6457          if ($is_space->{$self->{nc}}) {
6458            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6459            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6460            $self->{state} = BEFORE_MD_NAME_STATE;
6461            
6462        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6463          $self->{line_prev} = $self->{line};
6464          $self->{column_prev} = $self->{column};
6465          $self->{column}++;
6466          $self->{nc}
6467              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6468        } else {
6469          $self->{set_nc}->($self);
6470        }
6471      
6472            redo A;
6473          } elsif ($self->{nc} == 0x003E) { # >
6474            ## XML5: Same as "Anything else".
6475            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6476            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6477            
6478        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6479          $self->{line_prev} = $self->{line};
6480          $self->{column_prev} = $self->{column};
6481          $self->{column}++;
6482          $self->{nc}
6483              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6484        } else {
6485          $self->{set_nc}->($self);
6486        }
6487      
6488            redo A;
6489          } elsif ($self->{nc} == -1) {
6490            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6491            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6492            ## Reconsume.
6493            redo A;
6494          } else {
6495            ## XML5: No parse error.
6496            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6497            $self->{state} = BOGUS_COMMENT_STATE;
6498            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6499            ## Reconsume.
6500            redo A;
6501          }
6502        } elsif ($self->{state} == MD_NAME_STATE) {
6503          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6504          
6505          if ($is_space->{$self->{nc}}) {
6506            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6507              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6508            } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6509              $self->{state} = AFTER_ELEMENT_NAME_STATE;
6510            } else { # ENTITY/NOTATION
6511              $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6512            }
6513            
6514        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6515          $self->{line_prev} = $self->{line};
6516          $self->{column_prev} = $self->{column};
6517          $self->{column}++;
6518          $self->{nc}
6519              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6520        } else {
6521          $self->{set_nc}->($self);
6522        }
6523      
6524            redo A;
6525          } elsif ($self->{nc} == 0x003E) { # >
6526            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6527              #
6528            } else {
6529              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6530            }
6531            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6532            
6533        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6534          $self->{line_prev} = $self->{line};
6535          $self->{column_prev} = $self->{column};
6536          $self->{column}++;
6537          $self->{nc}
6538              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6539        } else {
6540          $self->{set_nc}->($self);
6541        }
6542      
6543            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6544            redo A;
6545          } elsif ($self->{nc} == -1) {
6546            ## XML5: [ATTLIST] No parse error.
6547            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6548            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6549            ## Reconsume.
6550            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6551            redo A;
6552          } else {
6553            ## XML5: [ATTLIST] Not defined yet.
6554            $self->{ct}->{name} .= chr $self->{nc};
6555            ## Stay in the state.
6556            
6557        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6558          $self->{line_prev} = $self->{line};
6559          $self->{column_prev} = $self->{column};
6560          $self->{column}++;
6561          $self->{nc}
6562              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6563        } else {
6564          $self->{set_nc}->($self);
6565        }
6566      
6567            redo A;
6568          }
6569        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6570          if ($is_space->{$self->{nc}}) {
6571            ## Stay in the state.
6572            
6573        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6574          $self->{line_prev} = $self->{line};
6575          $self->{column_prev} = $self->{column};
6576          $self->{column}++;
6577          $self->{nc}
6578              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6579        } else {
6580          $self->{set_nc}->($self);
6581        }
6582      
6583            redo A;
6584          } elsif ($self->{nc} == 0x003E) { # >
6585            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6586            
6587        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6588          $self->{line_prev} = $self->{line};
6589          $self->{column_prev} = $self->{column};
6590          $self->{column}++;
6591          $self->{nc}
6592              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6593        } else {
6594          $self->{set_nc}->($self);
6595        }
6596      
6597            return  ($self->{ct}); # ATTLIST
6598            redo A;
6599          } elsif ($self->{nc} == -1) {
6600            ## XML5: No parse error.
6601            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6602            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6603            return  ($self->{ct});
6604            redo A;
6605          } else {
6606            ## XML5: Not defined yet.
6607            $self->{ca} = {name => chr ($self->{nc}), # attrdef
6608                           tokens => [],
6609                           line => $self->{line}, column => $self->{column}};
6610            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6611            
6612        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6613          $self->{line_prev} = $self->{line};
6614          $self->{column_prev} = $self->{column};
6615          $self->{column}++;
6616          $self->{nc}
6617              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6618        } else {
6619          $self->{set_nc}->($self);
6620        }
6621      
6622            redo A;
6623          }
6624        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6625          if ($is_space->{$self->{nc}}) {
6626            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6627            
6628        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6629          $self->{line_prev} = $self->{line};
6630          $self->{column_prev} = $self->{column};
6631          $self->{column}++;
6632          $self->{nc}
6633              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6634        } else {
6635          $self->{set_nc}->($self);
6636        }
6637      
6638            redo A;
6639          } elsif ($self->{nc} == 0x003E) { # >
6640            ## XML5: Same as "anything else".
6641            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6642            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6643            
6644        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6645          $self->{line_prev} = $self->{line};
6646          $self->{column_prev} = $self->{column};
6647          $self->{column}++;
6648          $self->{nc}
6649              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6650        } else {
6651          $self->{set_nc}->($self);
6652        }
6653      
6654            return  ($self->{ct}); # ATTLIST
6655            redo A;
6656          } elsif ($self->{nc} == 0x0028) { # (
6657            ## XML5: Same as "anything else".
6658            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6659            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6660            
6661        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6662          $self->{line_prev} = $self->{line};
6663          $self->{column_prev} = $self->{column};
6664          $self->{column}++;
6665          $self->{nc}
6666              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6667        } else {
6668          $self->{set_nc}->($self);
6669        }
6670      
6671            redo A;
6672          } elsif ($self->{nc} == -1) {
6673            ## XML5: No parse error.
6674            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6675            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6676            
6677        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6678          $self->{line_prev} = $self->{line};
6679          $self->{column_prev} = $self->{column};
6680          $self->{column}++;
6681          $self->{nc}
6682              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6683        } else {
6684          $self->{set_nc}->($self);
6685        }
6686      
6687            return  ($self->{ct}); # ATTLIST
6688            redo A;
6689          } else {
6690            ## XML5: Not defined yet.
6691            $self->{ca}->{name} .= chr $self->{nc};
6692            ## Stay in the state.
6693            
6694        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6695          $self->{line_prev} = $self->{line};
6696          $self->{column_prev} = $self->{column};
6697          $self->{column}++;
6698          $self->{nc}
6699              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6700        } else {
6701          $self->{set_nc}->($self);
6702        }
6703      
6704            redo A;
6705          }
6706        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6707          if ($is_space->{$self->{nc}}) {
6708            ## Stay in the state.
6709            
6710        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6711          $self->{line_prev} = $self->{line};
6712          $self->{column_prev} = $self->{column};
6713          $self->{column}++;
6714          $self->{nc}
6715              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6716        } else {
6717          $self->{set_nc}->($self);
6718        }
6719      
6720            redo A;
6721          } elsif ($self->{nc} == 0x003E) { # >
6722            ## XML5: Same as "anything else".
6723            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6724            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6725            
6726        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6727          $self->{line_prev} = $self->{line};
6728          $self->{column_prev} = $self->{column};
6729          $self->{column}++;
6730          $self->{nc}
6731              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6732        } else {
6733          $self->{set_nc}->($self);
6734        }
6735      
6736            return  ($self->{ct}); # ATTLIST
6737            redo A;
6738          } elsif ($self->{nc} == 0x0028) { # (
6739            ## XML5: Same as "anything else".
6740            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6741            
6742        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6743          $self->{line_prev} = $self->{line};
6744          $self->{column_prev} = $self->{column};
6745          $self->{column}++;
6746          $self->{nc}
6747              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6748        } else {
6749          $self->{set_nc}->($self);
6750        }
6751      
6752            redo A;
6753          } elsif ($self->{nc} == -1) {
6754            ## XML5: No parse error.
6755            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6756            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6757            
6758        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6759          $self->{line_prev} = $self->{line};
6760          $self->{column_prev} = $self->{column};
6761          $self->{column}++;
6762          $self->{nc}
6763              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6764        } else {
6765          $self->{set_nc}->($self);
6766        }
6767      
6768            return  ($self->{ct});
6769            redo A;
6770          } else {
6771            ## XML5: Not defined yet.
6772            $self->{ca}->{type} = chr $self->{nc};
6773            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6774            
6775        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6776          $self->{line_prev} = $self->{line};
6777          $self->{column_prev} = $self->{column};
6778          $self->{column}++;
6779          $self->{nc}
6780              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6781        } else {
6782          $self->{set_nc}->($self);
6783        }
6784      
6785            redo A;
6786          }
6787        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6788          if ($is_space->{$self->{nc}}) {
6789            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6790            
6791        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6792          $self->{line_prev} = $self->{line};
6793          $self->{column_prev} = $self->{column};
6794          $self->{column}++;
6795          $self->{nc}
6796              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6797        } else {
6798          $self->{set_nc}->($self);
6799        }
6800      
6801            redo A;
6802          } elsif ($self->{nc} == 0x0023) { # #
6803            ## XML5: Same as "anything else".
6804            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6805            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6806            
6807        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6808          $self->{line_prev} = $self->{line};
6809          $self->{column_prev} = $self->{column};
6810          $self->{column}++;
6811          $self->{nc}
6812              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6813        } else {
6814          $self->{set_nc}->($self);
6815        }
6816      
6817            redo A;
6818          } elsif ($self->{nc} == 0x0022) { # "
6819            ## XML5: Same as "anything else".
6820            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6821            $self->{ca}->{value} = '';
6822            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6823            
6824        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6825          $self->{line_prev} = $self->{line};
6826          $self->{column_prev} = $self->{column};
6827          $self->{column}++;
6828          $self->{nc}
6829              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6830        } else {
6831          $self->{set_nc}->($self);
6832        }
6833      
6834            redo A;
6835          } elsif ($self->{nc} == 0x0027) { # '
6836            ## XML5: Same as "anything else".
6837            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6838            $self->{ca}->{value} = '';
6839            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6840            
6841        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6842          $self->{line_prev} = $self->{line};
6843          $self->{column_prev} = $self->{column};
6844          $self->{column}++;
6845          $self->{nc}
6846              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6847        } else {
6848          $self->{set_nc}->($self);
6849        }
6850      
6851            redo A;
6852          } elsif ($self->{nc} == 0x003E) { # >
6853            ## XML5: Same as "anything else".
6854            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6855            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6856            
6857        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6858          $self->{line_prev} = $self->{line};
6859          $self->{column_prev} = $self->{column};
6860          $self->{column}++;
6861          $self->{nc}
6862              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6863        } else {
6864          $self->{set_nc}->($self);
6865        }
6866      
6867            return  ($self->{ct}); # ATTLIST
6868            redo A;
6869          } elsif ($self->{nc} == 0x0028) { # (
6870            ## XML5: Same as "anything else".
6871            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6872            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6873            
6874        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6875          $self->{line_prev} = $self->{line};
6876          $self->{column_prev} = $self->{column};
6877          $self->{column}++;
6878          $self->{nc}
6879              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6880        } else {
6881          $self->{set_nc}->($self);
6882        }
6883      
6884            redo A;
6885          } elsif ($self->{nc} == -1) {
6886            ## XML5: No parse error.
6887            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6888            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6889            
6890        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6891          $self->{line_prev} = $self->{line};
6892          $self->{column_prev} = $self->{column};
6893          $self->{column}++;
6894          $self->{nc}
6895              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6896        } else {
6897          $self->{set_nc}->($self);
6898        }
6899      
6900            return  ($self->{ct});
6901            redo A;
6902          } else {
6903            ## XML5: Not defined yet.
6904            $self->{ca}->{type} .= chr $self->{nc};
6905            ## Stay in the state.
6906            
6907        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6908          $self->{line_prev} = $self->{line};
6909          $self->{column_prev} = $self->{column};
6910          $self->{column}++;
6911          $self->{nc}
6912              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6913        } else {
6914          $self->{set_nc}->($self);
6915        }
6916      
6917            redo A;
6918          }
6919        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6920          if ($is_space->{$self->{nc}}) {
6921            ## Stay in the state.
6922            
6923        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6924          $self->{line_prev} = $self->{line};
6925          $self->{column_prev} = $self->{column};
6926          $self->{column}++;
6927          $self->{nc}
6928              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6929        } else {
6930          $self->{set_nc}->($self);
6931        }
6932      
6933            redo A;
6934          } elsif ($self->{nc} == 0x0028) { # (
6935            ## XML5: Same as "anything else".
6936            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6937            
6938        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6939          $self->{line_prev} = $self->{line};
6940          $self->{column_prev} = $self->{column};
6941          $self->{column}++;
6942          $self->{nc}
6943              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6944        } else {
6945          $self->{set_nc}->($self);
6946        }
6947      
6948            redo A;
6949          } elsif ($self->{nc} == 0x0023) { # #
6950            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6951            
6952        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6953          $self->{line_prev} = $self->{line};
6954          $self->{column_prev} = $self->{column};
6955          $self->{column}++;
6956          $self->{nc}
6957              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6958        } else {
6959          $self->{set_nc}->($self);
6960        }
6961      
6962            redo A;
6963          } elsif ($self->{nc} == 0x0022) { # "
6964            ## XML5: Same as "anything else".
6965            $self->{ca}->{value} = '';
6966            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6967            
6968        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6969          $self->{line_prev} = $self->{line};
6970          $self->{column_prev} = $self->{column};
6971          $self->{column}++;
6972          $self->{nc}
6973              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6974        } else {
6975          $self->{set_nc}->($self);
6976        }
6977      
6978            redo A;
6979          } elsif ($self->{nc} == 0x0027) { # '
6980            ## XML5: Same as "anything else".
6981            $self->{ca}->{value} = '';
6982            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6983            
6984        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6985          $self->{line_prev} = $self->{line};
6986          $self->{column_prev} = $self->{column};
6987          $self->{column}++;
6988          $self->{nc}
6989              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6990        } else {
6991          $self->{set_nc}->($self);
6992        }
6993      
6994            redo A;
6995          } elsif ($self->{nc} == 0x003E) { # >
6996            ## XML5: Same as "anything else".
6997            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6998            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6999            
7000        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7001          $self->{line_prev} = $self->{line};
7002          $self->{column_prev} = $self->{column};
7003          $self->{column}++;
7004          $self->{nc}
7005              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7006        } else {
7007          $self->{set_nc}->($self);
7008        }
7009      
7010            return  ($self->{ct}); # ATTLIST
7011            redo A;
7012          } elsif ($self->{nc} == -1) {
7013            ## XML5: No parse error.
7014            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7015            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7016            
7017        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7018          $self->{line_prev} = $self->{line};
7019          $self->{column_prev} = $self->{column};
7020          $self->{column}++;
7021          $self->{nc}
7022              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7023        } else {
7024          $self->{set_nc}->($self);
7025        }
7026      
7027            return  ($self->{ct});
7028            redo A;
7029          } else {
7030            ## XML5: Switch to the "DOCTYPE bogus comment state".
7031            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7032            $self->{ca}->{value} = '';
7033            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7034            ## Reconsume.
7035            redo A;
7036          }
7037        } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
7038          if ($is_space->{$self->{nc}}) {
7039            ## Stay in the state.
7040            
7041        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7042          $self->{line_prev} = $self->{line};
7043          $self->{column_prev} = $self->{column};
7044          $self->{column}++;
7045          $self->{nc}
7046              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7047        } else {
7048          $self->{set_nc}->($self);
7049        }
7050      
7051            redo A;
7052          } elsif ($self->{nc} == 0x007C) { # |
7053            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7054            ## Stay in the state.
7055            
7056        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7057          $self->{line_prev} = $self->{line};
7058          $self->{column_prev} = $self->{column};
7059          $self->{column}++;
7060          $self->{nc}
7061              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7062        } else {
7063          $self->{set_nc}->($self);
7064        }
7065      
7066            redo A;
7067          } elsif ($self->{nc} == 0x0029) { # )
7068            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7069            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7070            
7071        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7072          $self->{line_prev} = $self->{line};
7073          $self->{column_prev} = $self->{column};
7074          $self->{column}++;
7075          $self->{nc}
7076              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7077        } else {
7078          $self->{set_nc}->($self);
7079        }
7080      
7081            redo A;
7082          } elsif ($self->{nc} == 0x003E) { # >
7083            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7084            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7085            
7086        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7087          $self->{line_prev} = $self->{line};
7088          $self->{column_prev} = $self->{column};
7089          $self->{column}++;
7090          $self->{nc}
7091              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7092        } else {
7093          $self->{set_nc}->($self);
7094        }
7095      
7096            return  ($self->{ct}); # ATTLIST
7097            redo A;
7098          } elsif ($self->{nc} == -1) {
7099            ## XML5: No parse error.
7100            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7101            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7102            
7103        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7104          $self->{line_prev} = $self->{line};
7105          $self->{column_prev} = $self->{column};
7106          $self->{column}++;
7107          $self->{nc}
7108              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7109        } else {
7110          $self->{set_nc}->($self);
7111        }
7112      
7113            return  ($self->{ct});
7114            redo A;
7115          } else {
7116            push @{$self->{ca}->{tokens}}, chr $self->{nc};
7117            $self->{state} = ALLOWED_TOKEN_STATE;
7118            
7119        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7120          $self->{line_prev} = $self->{line};
7121          $self->{column_prev} = $self->{column};
7122          $self->{column}++;
7123          $self->{nc}
7124              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7125        } else {
7126          $self->{set_nc}->($self);
7127        }
7128      
7129            redo A;
7130          }
7131        } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
7132          if ($is_space->{$self->{nc}}) {
7133            $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
7134            
7135        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7136          $self->{line_prev} = $self->{line};
7137          $self->{column_prev} = $self->{column};
7138          $self->{column}++;
7139          $self->{nc}
7140              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7141        } else {
7142          $self->{set_nc}->($self);
7143        }
7144      
7145            redo A;
7146          } elsif ($self->{nc} == 0x007C) { # |
7147            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7148            
7149        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7150          $self->{line_prev} = $self->{line};
7151          $self->{column_prev} = $self->{column};
7152          $self->{column}++;
7153          $self->{nc}
7154              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7155        } else {
7156          $self->{set_nc}->($self);
7157        }
7158      
7159            redo A;
7160          } elsif ($self->{nc} == 0x0029) { # )
7161            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7162            
7163        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7164          $self->{line_prev} = $self->{line};
7165          $self->{column_prev} = $self->{column};
7166          $self->{column}++;
7167          $self->{nc}
7168              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7169        } else {
7170          $self->{set_nc}->($self);
7171        }
7172      
7173            redo A;
7174          } elsif ($self->{nc} == 0x003E) { # >
7175            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7176            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7177            
7178        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7179          $self->{line_prev} = $self->{line};
7180          $self->{column_prev} = $self->{column};
7181          $self->{column}++;
7182          $self->{nc}
7183              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7184        } else {
7185          $self->{set_nc}->($self);
7186        }
7187      
7188            return  ($self->{ct}); # ATTLIST
7189            redo A;
7190          } elsif ($self->{nc} == -1) {
7191            ## XML5: No parse error.
7192            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7193            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7194            
7195        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7196          $self->{line_prev} = $self->{line};
7197          $self->{column_prev} = $self->{column};
7198          $self->{column}++;
7199          $self->{nc}
7200              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7201        } else {
7202          $self->{set_nc}->($self);
7203        }
7204      
7205            return  ($self->{ct});
7206            redo A;
7207          } else {
7208            $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7209            ## Stay in the state.
7210            
7211        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7212          $self->{line_prev} = $self->{line};
7213          $self->{column_prev} = $self->{column};
7214          $self->{column}++;
7215          $self->{nc}
7216              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7217        } else {
7218          $self->{set_nc}->($self);
7219        }
7220      
7221            redo A;
7222          }
7223        } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7224          if ($is_space->{$self->{nc}}) {
7225            ## Stay in the state.
7226            
7227        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7228          $self->{line_prev} = $self->{line};
7229          $self->{column_prev} = $self->{column};
7230          $self->{column}++;
7231          $self->{nc}
7232              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7233        } else {
7234          $self->{set_nc}->($self);
7235        }
7236      
7237            redo A;
7238          } elsif ($self->{nc} == 0x007C) { # |
7239            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7240            
7241        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7242          $self->{line_prev} = $self->{line};
7243          $self->{column_prev} = $self->{column};
7244          $self->{column}++;
7245          $self->{nc}
7246              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7247        } else {
7248          $self->{set_nc}->($self);
7249        }
7250      
7251            redo A;
7252          } elsif ($self->{nc} == 0x0029) { # )
7253            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7254            
7255        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7256          $self->{line_prev} = $self->{line};
7257          $self->{column_prev} = $self->{column};
7258          $self->{column}++;
7259          $self->{nc}
7260              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7261        } else {
7262          $self->{set_nc}->($self);
7263        }
7264      
7265            redo A;
7266          } elsif ($self->{nc} == 0x003E) { # >
7267            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7268            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7269            
7270        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7271          $self->{line_prev} = $self->{line};
7272          $self->{column_prev} = $self->{column};
7273          $self->{column}++;
7274          $self->{nc}
7275              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7276        } else {
7277          $self->{set_nc}->($self);
7278        }
7279      
7280            return  ($self->{ct}); # ATTLIST
7281            redo A;
7282          } elsif ($self->{nc} == -1) {
7283            ## XML5: No parse error.
7284            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7285            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7286            
7287        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7288          $self->{line_prev} = $self->{line};
7289          $self->{column_prev} = $self->{column};
7290          $self->{column}++;
7291          $self->{nc}
7292              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7293        } else {
7294          $self->{set_nc}->($self);
7295        }
7296      
7297            return  ($self->{ct});
7298            redo A;
7299          } else {
7300            $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7301                            line => $self->{line_prev},
7302                            column => $self->{column_prev});
7303            $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7304            $self->{state} = ALLOWED_TOKEN_STATE;
7305            
7306        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7307          $self->{line_prev} = $self->{line};
7308          $self->{column_prev} = $self->{column};
7309          $self->{column}++;
7310          $self->{nc}
7311              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7312        } else {
7313          $self->{set_nc}->($self);
7314        }
7315      
7316            redo A;
7317          }
7318        } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7319          if ($is_space->{$self->{nc}}) {
7320            $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7321            
7322        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7323          $self->{line_prev} = $self->{line};
7324          $self->{column_prev} = $self->{column};
7325          $self->{column}++;
7326          $self->{nc}
7327              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7328        } else {
7329          $self->{set_nc}->($self);
7330        }
7331      
7332            redo A;
7333          } elsif ($self->{nc} == 0x0023) { # #
7334            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7335            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7336            
7337        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7338          $self->{line_prev} = $self->{line};
7339          $self->{column_prev} = $self->{column};
7340          $self->{column}++;
7341          $self->{nc}
7342              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7343        } else {
7344          $self->{set_nc}->($self);
7345        }
7346      
7347            redo A;
7348          } elsif ($self->{nc} == 0x0022) { # "
7349            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7350            $self->{ca}->{value} = '';
7351            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7352            
7353        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7354          $self->{line_prev} = $self->{line};
7355          $self->{column_prev} = $self->{column};
7356          $self->{column}++;
7357          $self->{nc}
7358              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7359        } else {
7360          $self->{set_nc}->($self);
7361        }
7362      
7363            redo A;
7364          } elsif ($self->{nc} == 0x0027) { # '
7365            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7366            $self->{ca}->{value} = '';
7367            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7368            
7369        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7370          $self->{line_prev} = $self->{line};
7371          $self->{column_prev} = $self->{column};
7372          $self->{column}++;
7373          $self->{nc}
7374              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7375        } else {
7376          $self->{set_nc}->($self);
7377        }
7378      
7379            redo A;
7380          } elsif ($self->{nc} == 0x003E) { # >
7381            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7382            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7383            
7384        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7385          $self->{line_prev} = $self->{line};
7386          $self->{column_prev} = $self->{column};
7387          $self->{column}++;
7388          $self->{nc}
7389              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7390        } else {
7391          $self->{set_nc}->($self);
7392        }
7393      
7394            return  ($self->{ct}); # ATTLIST
7395            redo A;
7396          } elsif ($self->{nc} == -1) {
7397            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7398            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7399            
7400        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7401          $self->{line_prev} = $self->{line};
7402          $self->{column_prev} = $self->{column};
7403          $self->{column}++;
7404          $self->{nc}
7405              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7406        } else {
7407          $self->{set_nc}->($self);
7408        }
7409      
7410            return  ($self->{ct});
7411            redo A;
7412          } else {
7413            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7414            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7415            ## Reconsume.
7416            redo A;
7417          }
7418        } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7419          if ($is_space->{$self->{nc}}) {
7420            ## Stay in the state.
7421            
7422        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7423          $self->{line_prev} = $self->{line};
7424          $self->{column_prev} = $self->{column};
7425          $self->{column}++;
7426          $self->{nc}
7427              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7428        } else {
7429          $self->{set_nc}->($self);
7430        }
7431      
7432            redo A;
7433          } elsif ($self->{nc} == 0x0023) { # #
7434            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7435            
7436        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7437          $self->{line_prev} = $self->{line};
7438          $self->{column_prev} = $self->{column};
7439          $self->{column}++;
7440          $self->{nc}
7441              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7442        } else {
7443          $self->{set_nc}->($self);
7444        }
7445      
7446            redo A;
7447          } elsif ($self->{nc} == 0x0022) { # "
7448            $self->{ca}->{value} = '';
7449            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7450            
7451        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7452          $self->{line_prev} = $self->{line};
7453          $self->{column_prev} = $self->{column};
7454          $self->{column}++;
7455          $self->{nc}
7456              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7457        } else {
7458          $self->{set_nc}->($self);
7459        }
7460      
7461            redo A;
7462          } elsif ($self->{nc} == 0x0027) { # '
7463            $self->{ca}->{value} = '';
7464            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7465            
7466        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7467          $self->{line_prev} = $self->{line};
7468          $self->{column_prev} = $self->{column};
7469          $self->{column}++;
7470          $self->{nc}
7471              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7472        } else {
7473          $self->{set_nc}->($self);
7474        }
7475      
7476            redo A;
7477          } elsif ($self->{nc} == 0x003E) { # >
7478            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7479            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7480            
7481        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7482          $self->{line_prev} = $self->{line};
7483          $self->{column_prev} = $self->{column};
7484          $self->{column}++;
7485          $self->{nc}
7486              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7487        } else {
7488          $self->{set_nc}->($self);
7489        }
7490      
7491            return  ($self->{ct}); # ATTLIST
7492            redo A;
7493          } elsif ($self->{nc} == -1) {
7494            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7495            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7496            
7497        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7498          $self->{line_prev} = $self->{line};
7499          $self->{column_prev} = $self->{column};
7500          $self->{column}++;
7501          $self->{nc}
7502              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7503        } else {
7504          $self->{set_nc}->($self);
7505        }
7506      
7507            return  ($self->{ct});
7508            redo A;
7509          } else {
7510            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7511            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7512            ## Reconsume.
7513            redo A;
7514          }
7515        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7516          if ($is_space->{$self->{nc}}) {
7517            ## XML5: No parse error.
7518            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7519            $self->{state} = BOGUS_MD_STATE;
7520            ## Reconsume.
7521            redo A;
7522          } elsif ($self->{nc} == 0x0022) { # "
7523            ## XML5: Same as "anything else".
7524            $self->{ca}->{value} = '';
7525            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7526            
7527        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7528          $self->{line_prev} = $self->{line};
7529          $self->{column_prev} = $self->{column};
7530          $self->{column}++;
7531          $self->{nc}
7532              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7533        } else {
7534          $self->{set_nc}->($self);
7535        }
7536      
7537            redo A;
7538          } elsif ($self->{nc} == 0x0027) { # '
7539            ## XML5: Same as "anything else".
7540            $self->{ca}->{value} = '';
7541            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7542            
7543        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7544          $self->{line_prev} = $self->{line};
7545          $self->{column_prev} = $self->{column};
7546          $self->{column}++;
7547          $self->{nc}
7548              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7549        } else {
7550          $self->{set_nc}->($self);
7551        }
7552      
7553            redo A;
7554          } elsif ($self->{nc} == 0x003E) { # >
7555            ## XML5: Same as "anything else".
7556            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7557            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7558            
7559        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7560          $self->{line_prev} = $self->{line};
7561          $self->{column_prev} = $self->{column};
7562          $self->{column}++;
7563          $self->{nc}
7564              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7565        } else {
7566          $self->{set_nc}->($self);
7567        }
7568      
7569            return  ($self->{ct}); # ATTLIST
7570            redo A;
7571          } elsif ($self->{nc} == -1) {
7572            ## XML5: No parse error.
7573            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7574            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7575            
7576        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7577          $self->{line_prev} = $self->{line};
7578          $self->{column_prev} = $self->{column};
7579          $self->{column}++;
7580          $self->{nc}
7581              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7582        } else {
7583          $self->{set_nc}->($self);
7584        }
7585      
7586            return  ($self->{ct});
7587            redo A;
7588          } else {
7589            $self->{ca}->{default} = chr $self->{nc};
7590            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7591            
7592        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7593          $self->{line_prev} = $self->{line};
7594          $self->{column_prev} = $self->{column};
7595          $self->{column}++;
7596          $self->{nc}
7597              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7598        } else {
7599          $self->{set_nc}->($self);
7600        }
7601      
7602            redo A;
7603          }
7604        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7605          if ($is_space->{$self->{nc}}) {
7606            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7607            
7608        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7609          $self->{line_prev} = $self->{line};
7610          $self->{column_prev} = $self->{column};
7611          $self->{column}++;
7612          $self->{nc}
7613              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7614        } else {
7615          $self->{set_nc}->($self);
7616        }
7617      
7618            redo A;
7619          } elsif ($self->{nc} == 0x0022) { # "
7620            ## XML5: Same as "anything else".
7621            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7622            $self->{ca}->{value} = '';
7623            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7624            
7625        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7626          $self->{line_prev} = $self->{line};
7627          $self->{column_prev} = $self->{column};
7628          $self->{column}++;
7629          $self->{nc}
7630              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7631        } else {
7632          $self->{set_nc}->($self);
7633        }
7634      
7635            redo A;
7636          } elsif ($self->{nc} == 0x0027) { # '
7637            ## XML5: Same as "anything else".
7638            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7639            $self->{ca}->{value} = '';
7640            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7641            
7642        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7643          $self->{line_prev} = $self->{line};
7644          $self->{column_prev} = $self->{column};
7645          $self->{column}++;
7646          $self->{nc}
7647              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7648        } else {
7649          $self->{set_nc}->($self);
7650        }
7651      
7652            redo A;
7653          } elsif ($self->{nc} == 0x003E) { # >
7654            ## XML5: Same as "anything else".
7655            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7656            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7657            
7658        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7659          $self->{line_prev} = $self->{line};
7660          $self->{column_prev} = $self->{column};
7661          $self->{column}++;
7662          $self->{nc}
7663              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7664        } else {
7665          $self->{set_nc}->($self);
7666        }
7667      
7668            return  ($self->{ct}); # ATTLIST
7669            redo A;
7670          } elsif ($self->{nc} == -1) {
7671            ## XML5: No parse error.
7672            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7673            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7674            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7675            
7676        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7677          $self->{line_prev} = $self->{line};
7678          $self->{column_prev} = $self->{column};
7679          $self->{column}++;
7680          $self->{nc}
7681              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7682        } else {
7683          $self->{set_nc}->($self);
7684        }
7685      
7686            return  ($self->{ct});
7687            redo A;
7688          } else {
7689            $self->{ca}->{default} .= chr $self->{nc};
7690            ## Stay in the state.
7691            
7692        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7693          $self->{line_prev} = $self->{line};
7694          $self->{column_prev} = $self->{column};
7695          $self->{column}++;
7696          $self->{nc}
7697              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7698        } else {
7699          $self->{set_nc}->($self);
7700        }
7701      
7702            redo A;
7703          }
7704        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7705          if ($is_space->{$self->{nc}}) {
7706            ## Stay in the state.
7707            
7708        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7709          $self->{line_prev} = $self->{line};
7710          $self->{column_prev} = $self->{column};
7711          $self->{column}++;
7712          $self->{nc}
7713              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7714        } else {
7715          $self->{set_nc}->($self);
7716        }
7717      
7718            redo A;
7719          } elsif ($self->{nc} == 0x0022) { # "
7720            $self->{ca}->{value} = '';
7721            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7722            
7723        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7724          $self->{line_prev} = $self->{line};
7725          $self->{column_prev} = $self->{column};
7726          $self->{column}++;
7727          $self->{nc}
7728              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7729        } else {
7730          $self->{set_nc}->($self);
7731        }
7732      
7733            redo A;
7734          } elsif ($self->{nc} == 0x0027) { # '
7735            $self->{ca}->{value} = '';
7736            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7737            
7738        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7739          $self->{line_prev} = $self->{line};
7740          $self->{column_prev} = $self->{column};
7741          $self->{column}++;
7742          $self->{nc}
7743              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7744        } else {
7745          $self->{set_nc}->($self);
7746        }
7747      
7748            redo A;
7749          } elsif ($self->{nc} == 0x003E) { # >
7750            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7751            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7752            
7753        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7754          $self->{line_prev} = $self->{line};
7755          $self->{column_prev} = $self->{column};
7756          $self->{column}++;
7757          $self->{nc}
7758              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7759        } else {
7760          $self->{set_nc}->($self);
7761        }
7762      
7763            return  ($self->{ct}); # ATTLIST
7764            redo A;
7765          } elsif ($self->{nc} == -1) {
7766            ## XML5: No parse error.
7767            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7768            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7769            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7770            
7771        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7772          $self->{line_prev} = $self->{line};
7773          $self->{column_prev} = $self->{column};
7774          $self->{column}++;
7775          $self->{nc}
7776              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7777        } else {
7778          $self->{set_nc}->($self);
7779        }
7780      
7781            return  ($self->{ct});
7782            redo A;
7783          } else {
7784            ## XML5: Not defined yet.
7785            if ($self->{ca}->{default} eq 'FIXED') {
7786              $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7787            } else {
7788              push @{$self->{ct}->{attrdefs}}, $self->{ca};
7789              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7790            }
7791            ## Reconsume.
7792            redo A;
7793          }
7794        } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7795          if ($is_space->{$self->{nc}} or
7796              $self->{nc} == -1 or
7797              $self->{nc} == 0x003E) { # >
7798            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7799            ## Reconsume.
7800            redo A;
7801          } else {
7802            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7803            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7804            ## Reconsume.
7805            redo A;
7806          }
7807        } elsif ($self->{state} == NDATA_STATE) {
7808          ## ASCII case-insensitive
7809          if ($self->{nc} == [
7810                undef,
7811                0x0044, # D
7812                0x0041, # A
7813                0x0054, # T
7814              ]->[length $self->{kwd}] or
7815              $self->{nc} == [
7816                undef,
7817                0x0064, # d
7818                0x0061, # a
7819                0x0074, # t
7820              ]->[length $self->{kwd}]) {
7821            
7822            ## Stay in the state.
7823            $self->{kwd} .= chr $self->{nc};
7824            
7825        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7826          $self->{line_prev} = $self->{line};
7827          $self->{column_prev} = $self->{column};
7828          $self->{column}++;
7829          $self->{nc}
7830              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7831        } else {
7832          $self->{set_nc}->($self);
7833        }
7834      
7835            redo A;
7836          } elsif ((length $self->{kwd}) == 4 and
7837                   ($self->{nc} == 0x0041 or # A
7838                    $self->{nc} == 0x0061)) { # a
7839            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7840              
7841              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7842                              text => 'NDATA',
7843                              line => $self->{line_prev},
7844                              column => $self->{column_prev} - 4);
7845            } else {
7846              
7847            }
7848            $self->{state} = AFTER_NDATA_STATE;
7849            
7850        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7851          $self->{line_prev} = $self->{line};
7852          $self->{column_prev} = $self->{column};
7853          $self->{column}++;
7854          $self->{nc}
7855              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7856        } else {
7857          $self->{set_nc}->($self);
7858        }
7859      
7860            redo A;
7861          } else {
7862            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7863                            line => $self->{line_prev},
7864                            column => $self->{column_prev} + 1
7865                                - length $self->{kwd});
7866            
7867            $self->{state} = BOGUS_MD_STATE;
7868            ## Reconsume.
7869            redo A;
7870          }
7871        } elsif ($self->{state} == AFTER_NDATA_STATE) {
7872          if ($is_space->{$self->{nc}}) {
7873            $self->{state} = BEFORE_NOTATION_NAME_STATE;
7874            
7875        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7876          $self->{line_prev} = $self->{line};
7877          $self->{column_prev} = $self->{column};
7878          $self->{column}++;
7879          $self->{nc}
7880              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7881        } else {
7882          $self->{set_nc}->($self);
7883        }
7884      
7885            redo A;
7886          } elsif ($self->{nc} == 0x003E) { # >
7887            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7888            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7889            
7890        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7891          $self->{line_prev} = $self->{line};
7892          $self->{column_prev} = $self->{column};
7893          $self->{column}++;
7894          $self->{nc}
7895              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7896        } else {
7897          $self->{set_nc}->($self);
7898        }
7899      
7900            return  ($self->{ct}); # ENTITY
7901            redo A;
7902          } elsif ($self->{nc} == -1) {
7903            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7904            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7905            
7906        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7907          $self->{line_prev} = $self->{line};
7908          $self->{column_prev} = $self->{column};
7909          $self->{column}++;
7910          $self->{nc}
7911              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7912        } else {
7913          $self->{set_nc}->($self);
7914        }
7915      
7916            return  ($self->{ct}); # ENTITY
7917            redo A;
7918          } else {
7919            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7920                            line => $self->{line_prev},
7921                            column => $self->{column_prev} + 1
7922                                - length $self->{kwd});
7923            $self->{state} = BOGUS_MD_STATE;
7924            ## Reconsume.
7925            redo A;
7926          }
7927        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7928          if ($is_space->{$self->{nc}}) {
7929            ## Stay in the state.
7930            
7931        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7932          $self->{line_prev} = $self->{line};
7933          $self->{column_prev} = $self->{column};
7934          $self->{column}++;
7935          $self->{nc}
7936              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7937        } else {
7938          $self->{set_nc}->($self);
7939        }
7940      
7941            redo A;
7942          } elsif ($self->{nc} == 0x003E) { # >
7943            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7944            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7945            
7946        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7947          $self->{line_prev} = $self->{line};
7948          $self->{column_prev} = $self->{column};
7949          $self->{column}++;
7950          $self->{nc}
7951              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7952        } else {
7953          $self->{set_nc}->($self);
7954        }
7955      
7956            return  ($self->{ct}); # ENTITY
7957            redo A;
7958          } elsif ($self->{nc} == -1) {
7959            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7960            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7961            
7962        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7963          $self->{line_prev} = $self->{line};
7964          $self->{column_prev} = $self->{column};
7965          $self->{column}++;
7966          $self->{nc}
7967              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7968        } else {
7969          $self->{set_nc}->($self);
7970        }
7971      
7972            return  ($self->{ct}); # ENTITY
7973            redo A;
7974          } else {
7975            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7976            $self->{state} = NOTATION_NAME_STATE;
7977            
7978        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7979          $self->{line_prev} = $self->{line};
7980          $self->{column_prev} = $self->{column};
7981          $self->{column}++;
7982          $self->{nc}
7983              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7984        } else {
7985          $self->{set_nc}->($self);
7986        }
7987      
7988            redo A;
7989          }
7990        } elsif ($self->{state} == NOTATION_NAME_STATE) {
7991          if ($is_space->{$self->{nc}}) {
7992            $self->{state} = AFTER_MD_DEF_STATE;
7993            
7994        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7995          $self->{line_prev} = $self->{line};
7996          $self->{column_prev} = $self->{column};
7997          $self->{column}++;
7998          $self->{nc}
7999              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8000        } else {
8001          $self->{set_nc}->($self);
8002        }
8003      
8004            redo A;
8005          } elsif ($self->{nc} == 0x003E) { # >
8006            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8007            
8008        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8009          $self->{line_prev} = $self->{line};
8010          $self->{column_prev} = $self->{column};
8011          $self->{column}++;
8012          $self->{nc}
8013              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8014        } else {
8015          $self->{set_nc}->($self);
8016        }
8017      
8018            return  ($self->{ct}); # ENTITY
8019            redo A;
8020          } elsif ($self->{nc} == -1) {
8021            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8022            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8023            
8024        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8025          $self->{line_prev} = $self->{line};
8026          $self->{column_prev} = $self->{column};
8027          $self->{column}++;
8028          $self->{nc}
8029              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8030        } else {
8031          $self->{set_nc}->($self);
8032        }
8033      
8034            return  ($self->{ct}); # ENTITY
8035            redo A;
8036          } else {
8037            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
8038            ## Stay in the state.
8039            
8040        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8041          $self->{line_prev} = $self->{line};
8042          $self->{column_prev} = $self->{column};
8043          $self->{column}++;
8044          $self->{nc}
8045              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8046        } else {
8047          $self->{set_nc}->($self);
8048        }
8049      
8050            redo A;
8051          }
8052        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
8053          if ($self->{nc} == 0x0022) { # "
8054            $self->{state} = AFTER_MD_DEF_STATE;
8055            
8056        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8057          $self->{line_prev} = $self->{line};
8058          $self->{column_prev} = $self->{column};
8059          $self->{column}++;
8060          $self->{nc}
8061              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8062        } else {
8063          $self->{set_nc}->($self);
8064        }
8065      
8066            redo A;
8067          } elsif ($self->{nc} == 0x0026) { # &
8068            $self->{prev_state} = $self->{state};
8069            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8070            $self->{entity_add} = 0x0022; # "
8071            
8072        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8073          $self->{line_prev} = $self->{line};
8074          $self->{column_prev} = $self->{column};
8075          $self->{column}++;
8076          $self->{nc}
8077              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8078        } else {
8079          $self->{set_nc}->($self);
8080        }
8081      
8082            redo A;
8083    ## TODO: %
8084          } elsif ($self->{nc} == -1) {
8085            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8086            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8087            ## Reconsume.
8088            return  ($self->{ct}); # ENTITY
8089            redo A;
8090          } else {
8091            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8092            
8093        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8094          $self->{line_prev} = $self->{line};
8095          $self->{column_prev} = $self->{column};
8096          $self->{column}++;
8097          $self->{nc}
8098              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8099        } else {
8100          $self->{set_nc}->($self);
8101        }
8102      
8103            redo A;
8104          }
8105        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
8106          if ($self->{nc} == 0x0027) { # '
8107            $self->{state} = AFTER_MD_DEF_STATE;
8108            
8109        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8110          $self->{line_prev} = $self->{line};
8111          $self->{column_prev} = $self->{column};
8112          $self->{column}++;
8113          $self->{nc}
8114              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8115        } else {
8116          $self->{set_nc}->($self);
8117        }
8118      
8119            redo A;
8120          } elsif ($self->{nc} == 0x0026) { # &
8121            $self->{prev_state} = $self->{state};
8122            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8123            $self->{entity_add} = 0x0027; # '
8124            
8125        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8126          $self->{line_prev} = $self->{line};
8127          $self->{column_prev} = $self->{column};
8128          $self->{column}++;
8129          $self->{nc}
8130              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8131        } else {
8132          $self->{set_nc}->($self);
8133        }
8134      
8135            redo A;
8136    ## TODO: %
8137          } elsif ($self->{nc} == -1) {
8138            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8139            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8140            ## Reconsume.
8141            return  ($self->{ct}); # ENTITY
8142            redo A;
8143          } else {
8144            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8145            
8146        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8147          $self->{line_prev} = $self->{line};
8148          $self->{column_prev} = $self->{column};
8149          $self->{column}++;
8150          $self->{nc}
8151              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8152        } else {
8153          $self->{set_nc}->($self);
8154        }
8155      
8156            redo A;
8157          }
8158        } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
8159          if ($is_space->{$self->{nc}} or
8160              {
8161                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
8162                $self->{entity_add} => 1,
8163              }->{$self->{nc}}) {
8164            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8165                            line => $self->{line_prev},
8166                            column => $self->{column_prev}
8167                                + ($self->{nc} == -1 ? 1 : 0));
8168            ## Don't consume
8169            ## Return nothing.
8170            #
8171          } elsif ($self->{nc} == 0x0023) { # #
8172            $self->{ca} = $self->{ct};
8173            $self->{state} = ENTITY_HASH_STATE;
8174            $self->{kwd} = '#';
8175            
8176        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8177          $self->{line_prev} = $self->{line};
8178          $self->{column_prev} = $self->{column};
8179          $self->{column}++;
8180          $self->{nc}
8181              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8182        } else {
8183          $self->{set_nc}->($self);
8184        }
8185      
8186            redo A;
8187          } else {
8188            #
8189          }
8190    
8191          $self->{ct}->{value} .= '&';
8192          $self->{state} = $self->{prev_state};
8193          ## Reconsume.
8194          redo A;
8195        } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
8196          if ($is_space->{$self->{nc}}) {
8197            $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8198            
8199        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8200          $self->{line_prev} = $self->{line};
8201          $self->{column_prev} = $self->{column};
8202          $self->{column}++;
8203          $self->{nc}
8204              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8205        } else {
8206          $self->{set_nc}->($self);
8207        }
8208      
8209            redo A;
8210          } elsif ($self->{nc} == 0x0028) { # (
8211            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8212            $self->{ct}->{content} = ['('];
8213            $self->{group_depth} = 1;
8214            
8215        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8216          $self->{line_prev} = $self->{line};
8217          $self->{column_prev} = $self->{column};
8218          $self->{column}++;
8219          $self->{nc}
8220              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8221        } else {
8222          $self->{set_nc}->($self);
8223        }
8224      
8225            redo A;
8226          } elsif ($self->{nc} == 0x003E) { # >
8227            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8228            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8229            
8230        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8231          $self->{line_prev} = $self->{line};
8232          $self->{column_prev} = $self->{column};
8233          $self->{column}++;
8234          $self->{nc}
8235              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8236        } else {
8237          $self->{set_nc}->($self);
8238        }
8239      
8240            return  ($self->{ct}); # ELEMENT
8241            redo A;
8242          } elsif ($self->{nc} == -1) {
8243            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8244            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8245            
8246        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8247          $self->{line_prev} = $self->{line};
8248          $self->{column_prev} = $self->{column};
8249          $self->{column}++;
8250          $self->{nc}
8251              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8252        } else {
8253          $self->{set_nc}->($self);
8254        }
8255      
8256            return  ($self->{ct}); # ELEMENT
8257            redo A;
8258          } else {
8259            $self->{ct}->{content} = [chr $self->{nc}];
8260            $self->{state} = CONTENT_KEYWORD_STATE;
8261            
8262        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8263          $self->{line_prev} = $self->{line};
8264          $self->{column_prev} = $self->{column};
8265          $self->{column}++;
8266          $self->{nc}
8267              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8268        } else {
8269          $self->{set_nc}->($self);
8270        }
8271      
8272            redo A;
8273          }
8274        } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8275          if ($is_space->{$self->{nc}}) {
8276            $self->{state} = AFTER_MD_DEF_STATE;
8277            
8278        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8279          $self->{line_prev} = $self->{line};
8280          $self->{column_prev} = $self->{column};
8281          $self->{column}++;
8282          $self->{nc}
8283              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8284        } else {
8285          $self->{set_nc}->($self);
8286        }
8287      
8288            redo A;
8289          } elsif ($self->{nc} == 0x003E) { # >
8290            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8291            
8292        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8293          $self->{line_prev} = $self->{line};
8294          $self->{column_prev} = $self->{column};
8295          $self->{column}++;
8296          $self->{nc}
8297              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8298        } else {
8299          $self->{set_nc}->($self);
8300        }
8301      
8302            return  ($self->{ct}); # ELEMENT
8303            redo A;
8304          } elsif ($self->{nc} == -1) {
8305            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8306            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8307            
8308        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8309          $self->{line_prev} = $self->{line};
8310          $self->{column_prev} = $self->{column};
8311          $self->{column}++;
8312          $self->{nc}
8313              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8314        } else {
8315          $self->{set_nc}->($self);
8316        }
8317      
8318            return  ($self->{ct}); # ELEMENT
8319            redo A;
8320          } else {
8321            $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8322            ## Stay in the state.
8323            
8324        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8325          $self->{line_prev} = $self->{line};
8326          $self->{column_prev} = $self->{column};
8327          $self->{column}++;
8328          $self->{nc}
8329              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8330        } else {
8331          $self->{set_nc}->($self);
8332        }
8333      
8334            redo A;
8335          }
8336        } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8337          if ($is_space->{$self->{nc}}) {
8338            ## Stay in the state.
8339            
8340        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8341          $self->{line_prev} = $self->{line};
8342          $self->{column_prev} = $self->{column};
8343          $self->{column}++;
8344          $self->{nc}
8345              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8346        } else {
8347          $self->{set_nc}->($self);
8348        }
8349      
8350            redo A;
8351          } elsif ($self->{nc} == 0x0028) { # (
8352            $self->{group_depth}++;
8353            push @{$self->{ct}->{content}}, chr $self->{nc};
8354            ## Stay in the state.
8355            
8356        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8357          $self->{line_prev} = $self->{line};
8358          $self->{column_prev} = $self->{column};
8359          $self->{column}++;
8360          $self->{nc}
8361              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8362        } else {
8363          $self->{set_nc}->($self);
8364        }
8365      
8366            redo A;
8367          } elsif ($self->{nc} == 0x007C or # |
8368                   $self->{nc} == 0x002C) { # ,
8369            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8370            ## Stay in the state.
8371            
8372        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8373          $self->{line_prev} = $self->{line};
8374          $self->{column_prev} = $self->{column};
8375          $self->{column}++;
8376          $self->{nc}
8377              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8378        } else {
8379          $self->{set_nc}->($self);
8380        }
8381      
8382            redo A;
8383          } elsif ($self->{nc} == 0x0029) { # )
8384            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8385            push @{$self->{ct}->{content}}, chr $self->{nc};
8386            $self->{group_depth}--;
8387            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8388            
8389        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8390          $self->{line_prev} = $self->{line};
8391          $self->{column_prev} = $self->{column};
8392          $self->{column}++;
8393          $self->{nc}
8394              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8395        } else {
8396          $self->{set_nc}->($self);
8397        }
8398      
8399            redo A;
8400          } elsif ($self->{nc} == 0x003E) { # >
8401            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8402            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8403            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8404            
8405        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8406          $self->{line_prev} = $self->{line};
8407          $self->{column_prev} = $self->{column};
8408          $self->{column}++;
8409          $self->{nc}
8410              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8411        } else {
8412          $self->{set_nc}->($self);
8413        }
8414      
8415            return  ($self->{ct}); # ELEMENT
8416            redo A;
8417          } elsif ($self->{nc} == -1) {
8418            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8419            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8420            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8421            
8422        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8423          $self->{line_prev} = $self->{line};
8424          $self->{column_prev} = $self->{column};
8425          $self->{column}++;
8426          $self->{nc}
8427              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8428        } else {
8429          $self->{set_nc}->($self);
8430        }
8431      
8432            return  ($self->{ct}); # ELEMENT
8433            redo A;
8434          } else {
8435            push @{$self->{ct}->{content}}, chr $self->{nc};
8436            $self->{state} = CM_ELEMENT_NAME_STATE;
8437            
8438        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8439          $self->{line_prev} = $self->{line};
8440          $self->{column_prev} = $self->{column};
8441          $self->{column}++;
8442          $self->{nc}
8443              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8444        } else {
8445          $self->{set_nc}->($self);
8446        }
8447      
8448            redo A;
8449          }
8450        } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8451          if ($is_space->{$self->{nc}}) {
8452            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8453            
8454        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8455          $self->{line_prev} = $self->{line};
8456          $self->{column_prev} = $self->{column};
8457          $self->{column}++;
8458          $self->{nc}
8459              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8460        } else {
8461          $self->{set_nc}->($self);
8462        }
8463      
8464            redo A;
8465          } elsif ($self->{nc} == 0x002A or # *
8466                   $self->{nc} == 0x002B or # +
8467                   $self->{nc} == 0x003F) { # ?
8468            push @{$self->{ct}->{content}}, chr $self->{nc};
8469            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8470            
8471        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8472          $self->{line_prev} = $self->{line};
8473          $self->{column_prev} = $self->{column};
8474          $self->{column}++;
8475          $self->{nc}
8476              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8477        } else {
8478          $self->{set_nc}->($self);
8479        }
8480      
8481            redo A;
8482          } elsif ($self->{nc} == 0x007C or # |
8483                   $self->{nc} == 0x002C) { # ,
8484            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8485            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8486            
8487        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8488          $self->{line_prev} = $self->{line};
8489          $self->{column_prev} = $self->{column};
8490          $self->{column}++;
8491          $self->{nc}
8492              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8493        } else {
8494          $self->{set_nc}->($self);
8495        }
8496      
8497            redo A;
8498          } elsif ($self->{nc} == 0x0029) { # )
8499            $self->{group_depth}--;
8500            push @{$self->{ct}->{content}}, chr $self->{nc};
8501            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8502            
8503        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8504          $self->{line_prev} = $self->{line};
8505          $self->{column_prev} = $self->{column};
8506          $self->{column}++;
8507          $self->{nc}
8508              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8509        } else {
8510          $self->{set_nc}->($self);
8511        }
8512      
8513            redo A;
8514          } elsif ($self->{nc} == 0x003E) { # >
8515            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8516            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8517            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8518            
8519        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8520          $self->{line_prev} = $self->{line};
8521          $self->{column_prev} = $self->{column};
8522          $self->{column}++;
8523          $self->{nc}
8524              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8525        } else {
8526          $self->{set_nc}->($self);
8527        }
8528      
8529            return  ($self->{ct}); # ELEMENT
8530            redo A;
8531          } elsif ($self->{nc} == -1) {
8532            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8533            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8534            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8535            
8536        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8537          $self->{line_prev} = $self->{line};
8538          $self->{column_prev} = $self->{column};
8539          $self->{column}++;
8540          $self->{nc}
8541              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8542        } else {
8543          $self->{set_nc}->($self);
8544        }
8545      
8546            return  ($self->{ct}); # ELEMENT
8547            redo A;
8548          } else {
8549            $self->{ct}->{content}->[-1] .= chr $self->{nc};
8550            ## Stay in the state.
8551            
8552        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8553          $self->{line_prev} = $self->{line};
8554          $self->{column_prev} = $self->{column};
8555          $self->{column}++;
8556          $self->{nc}
8557              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8558        } else {
8559          $self->{set_nc}->($self);
8560        }
8561      
8562            redo A;
8563          }
8564        } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8565          if ($is_space->{$self->{nc}}) {
8566            ## Stay in the state.
8567            
8568        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8569          $self->{line_prev} = $self->{line};
8570          $self->{column_prev} = $self->{column};
8571          $self->{column}++;
8572          $self->{nc}
8573              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8574        } else {
8575          $self->{set_nc}->($self);
8576        }
8577      
8578            redo A;
8579          } elsif ($self->{nc} == 0x007C or # |
8580                   $self->{nc} == 0x002C) { # ,
8581            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8582            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8583            
8584        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8585          $self->{line_prev} = $self->{line};
8586          $self->{column_prev} = $self->{column};
8587          $self->{column}++;
8588          $self->{nc}
8589              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8590        } else {
8591          $self->{set_nc}->($self);
8592        }
8593      
8594            redo A;
8595          } elsif ($self->{nc} == 0x0029) { # )
8596            $self->{group_depth}--;
8597            push @{$self->{ct}->{content}}, chr $self->{nc};
8598            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8599            
8600        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8601          $self->{line_prev} = $self->{line};
8602          $self->{column_prev} = $self->{column};
8603          $self->{column}++;
8604          $self->{nc}
8605              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8606        } else {
8607          $self->{set_nc}->($self);
8608        }
8609      
8610            redo A;
8611          } elsif ($self->{nc} == 0x003E) { # >
8612            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8613            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8614            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8615            
8616        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8617          $self->{line_prev} = $self->{line};
8618          $self->{column_prev} = $self->{column};
8619          $self->{column}++;
8620          $self->{nc}
8621              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8622        } else {
8623          $self->{set_nc}->($self);
8624        }
8625      
8626            return  ($self->{ct}); # ELEMENT
8627            redo A;
8628          } elsif ($self->{nc} == -1) {
8629            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8630            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8631            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8632            
8633        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8634          $self->{line_prev} = $self->{line};
8635          $self->{column_prev} = $self->{column};
8636          $self->{column}++;
8637          $self->{nc}
8638              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8639        } else {
8640          $self->{set_nc}->($self);
8641        }
8642      
8643            return  ($self->{ct}); # ELEMENT
8644            redo A;
8645          } else {
8646            $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8647            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8648            $self->{state} = BOGUS_MD_STATE;
8649            
8650        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8651          $self->{line_prev} = $self->{line};
8652          $self->{column_prev} = $self->{column};
8653          $self->{column}++;
8654          $self->{nc}
8655              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8656        } else {
8657          $self->{set_nc}->($self);
8658        }
8659      
8660            redo A;
8661          }
8662        } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8663          if ($is_space->{$self->{nc}}) {
8664            if ($self->{group_depth}) {
8665              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8666            } else {
8667              $self->{state} = AFTER_MD_DEF_STATE;
8668            }
8669            
8670        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8671          $self->{line_prev} = $self->{line};
8672          $self->{column_prev} = $self->{column};
8673          $self->{column}++;
8674          $self->{nc}
8675              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8676        } else {
8677          $self->{set_nc}->($self);
8678        }
8679      
8680            redo A;
8681          } elsif ($self->{nc} == 0x002A or # *
8682                   $self->{nc} == 0x002B or # +
8683                   $self->{nc} == 0x003F) { # ?
8684            push @{$self->{ct}->{content}}, chr $self->{nc};
8685            if ($self->{group_depth}) {
8686              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8687            } else {
8688              $self->{state} = AFTER_MD_DEF_STATE;
8689            }
8690            
8691        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8692          $self->{line_prev} = $self->{line};
8693          $self->{column_prev} = $self->{column};
8694          $self->{column}++;
8695          $self->{nc}
8696              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8697        } else {
8698          $self->{set_nc}->($self);
8699        }
8700      
8701            redo A;
8702          } elsif ($self->{nc} == 0x0029) { # )
8703            if ($self->{group_depth}) {
8704              $self->{group_depth}--;
8705              push @{$self->{ct}->{content}}, chr $self->{nc};
8706              ## Stay in the state.
8707              
8708        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8709          $self->{line_prev} = $self->{line};
8710          $self->{column_prev} = $self->{column};
8711          $self->{column}++;
8712          $self->{nc}
8713              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8714        } else {
8715          $self->{set_nc}->($self);
8716        }
8717      
8718              redo A;
8719            } else {
8720              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8721              $self->{state} = BOGUS_MD_STATE;
8722              ## Reconsume.
8723              redo A;
8724            }
8725          } elsif ($self->{nc} == 0x003E) { # >
8726            if ($self->{group_depth}) {
8727              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8728              push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8729            }
8730            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8731            
8732        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8733          $self->{line_prev} = $self->{line};
8734          $self->{column_prev} = $self->{column};
8735          $self->{column}++;
8736          $self->{nc}
8737              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8738        } else {
8739          $self->{set_nc}->($self);
8740        }
8741      
8742            return  ($self->{ct}); # ELEMENT
8743            redo A;
8744          } elsif ($self->{nc} == -1) {
8745            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8746            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8747            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8748            
8749        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8750          $self->{line_prev} = $self->{line};
8751          $self->{column_prev} = $self->{column};
8752          $self->{column}++;
8753          $self->{nc}
8754              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8755        } else {
8756          $self->{set_nc}->($self);
8757        }
8758      
8759            return  ($self->{ct}); # ELEMENT
8760            redo A;
8761          } else {
8762            if ($self->{group_depth}) {
8763              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8764            } else {
8765              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8766              $self->{state} = BOGUS_MD_STATE;
8767            }
8768            ## Reconsume.
8769            redo A;
8770          }
8771        } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8772          if ($is_space->{$self->{nc}}) {
8773            ## Stay in the state.
8774            
8775        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8776          $self->{line_prev} = $self->{line};
8777          $self->{column_prev} = $self->{column};
8778          $self->{column}++;
8779          $self->{nc}
8780              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8781        } else {
8782          $self->{set_nc}->($self);
8783        }
8784      
8785            redo A;
8786          } elsif ($self->{nc} == 0x003E) { # >
8787            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8788            
8789        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8790          $self->{line_prev} = $self->{line};
8791          $self->{column_prev} = $self->{column};
8792          $self->{column}++;
8793          $self->{nc}
8794              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8795        } else {
8796          $self->{set_nc}->($self);
8797        }
8798      
8799            return  ($self->{ct}); # ENTITY/ELEMENT
8800            redo A;
8801          } elsif ($self->{nc} == -1) {
8802            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8803            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8804            
8805        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8806          $self->{line_prev} = $self->{line};
8807          $self->{column_prev} = $self->{column};
8808          $self->{column}++;
8809          $self->{nc}
8810              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8811        } else {
8812          $self->{set_nc}->($self);
8813        }
8814      
8815            return  ($self->{ct}); # ENTITY/ELEMENT
8816            redo A;
8817          } else {
8818            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8819            $self->{state} = BOGUS_MD_STATE;
8820            ## Reconsume.
8821            redo A;
8822          }
8823        } elsif ($self->{state} == BOGUS_MD_STATE) {
8824          if ($self->{nc} == 0x003E) { # >
8825            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8826            
8827        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8828          $self->{line_prev} = $self->{line};
8829          $self->{column_prev} = $self->{column};
8830          $self->{column}++;
8831          $self->{nc}
8832              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8833        } else {
8834          $self->{set_nc}->($self);
8835        }
8836      
8837            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8838            redo A;
8839          } elsif ($self->{nc} == -1) {
8840            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8841            ## Reconsume.
8842            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8843            redo A;
8844          } else {
8845            ## Stay in the state.
8846            
8847        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8848          $self->{line_prev} = $self->{line};
8849          $self->{column_prev} = $self->{column};
8850          $self->{column}++;
8851          $self->{nc}
8852              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8853        } else {
8854          $self->{set_nc}->($self);
8855        }
8856      
8857            redo A;
8858          }
8859      } else {      } else {
8860        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
8861      }      }
# Line 4270  sub _get_next_token ($) { Line 8866  sub _get_next_token ($) {
8866    
8867  1;  1;
8868  ## $Date$  ## $Date$
8869                                    

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.33

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24