/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.9 by wakaba, Wed Oct 15 08:05:47 2008 UTC revision 1.34 by wakaba, Sat Sep 5 11:31:58 2009 UTC
# Line 15  BEGIN { Line 15  BEGIN {
15      CHARACTER_TOKEN      CHARACTER_TOKEN
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24    );    );
25        
26    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 27  BEGIN { Line 33  BEGIN {
33        CHARACTER_TOKEN        CHARACTER_TOKEN
34        PI_TOKEN        PI_TOKEN
35        ABORT_TOKEN        ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42      )],      )],
43    );    );
44  }  }
45    
46    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48  ## Token types  ## Token types
49    
50  sub DOCTYPE_TOKEN () { 1 }  sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51  sub COMMENT_TOKEN () { 2 }  sub COMMENT_TOKEN () { 2 }
52  sub START_TAG_TOKEN () { 3 }  sub START_TAG_TOKEN () { 3 }
53  sub END_TAG_TOKEN () { 4 }  sub END_TAG_TOKEN () { 4 }
54  sub END_OF_FILE_TOKEN () { 5 }  sub END_OF_FILE_TOKEN () { 5 }
55  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
56  sub PI_TOKEN () { 7 } # XML5  sub PI_TOKEN () { 7 } ## NOTE: XML only.
57  sub ABORT_TOKEN () { 8 } # Not a token actually  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65    ## XML5: XML5 has "empty tag token".  In this implementation, it is
66    ## represented as a start tag token with $self->{self_closing} flag
67    ## set to true.
68    
69    ## XML5: XML5 has "short end tag token".  In this implementation, it
70    ## is represented as an end tag token with $token->{tag_name} set to
71    ## an empty string.
72    
73  package Whatpm::HTML;  package Whatpm::HTML;
74    
# Line 77  sub COMMENT_START_STATE () { 14 } Line 105  sub COMMENT_START_STATE () { 14 }
105  sub COMMENT_START_DASH_STATE () { 15 }  sub COMMENT_START_DASH_STATE () { 15 }
106  sub COMMENT_STATE () { 16 }  sub COMMENT_STATE () { 16 }
107  sub COMMENT_END_STATE () { 17 }  sub COMMENT_END_STATE () { 17 }
108    sub COMMENT_END_BANG_STATE () { 102 }
109    sub COMMENT_END_SPACE_STATE () { 103 } ## LAST
110  sub COMMENT_END_DASH_STATE () { 18 }  sub COMMENT_END_DASH_STATE () { 18 }
111  sub BOGUS_COMMENT_STATE () { 19 }  sub BOGUS_COMMENT_STATE () { 19 }
112  sub DOCTYPE_STATE () { 20 }  sub DOCTYPE_STATE () { 20 }
# Line 114  sub HEXREF_HEX_STATE () { 48 } Line 144  sub HEXREF_HEX_STATE () { 48 }
144  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
145  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
146    
147  ## XML states  ## XML-only states
148  sub PI_STATE () { 51 }  sub PI_STATE () { 51 }
149  sub PI_TARGET_STATE () { 52 }  sub PI_TARGET_STATE () { 52 }
150  sub PI_TARGET_AFTER_STATE () { 53 }  sub PI_TARGET_AFTER_STATE () { 53 }
151  sub PI_DATA_STATE () { 54 }  sub PI_DATA_STATE () { 54 }
152  sub PI_AFTER_STATE () { 55 }  sub PI_AFTER_STATE () { 55 }
153  sub PI_DATA_AFTER_STATE () { 56 }  sub PI_DATA_AFTER_STATE () { 56 }
154    sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
155    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
156    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
157    sub DOCTYPE_TAG_STATE () { 60 }
158    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
159    sub MD_ATTLIST_STATE () { 62 }
160    sub MD_E_STATE () { 63 }
161    sub MD_ELEMENT_STATE () { 64 }
162    sub MD_ENTITY_STATE () { 65 }
163    sub MD_NOTATION_STATE () { 66 }
164    sub DOCTYPE_MD_STATE () { 67 }
165    sub BEFORE_MD_NAME_STATE () { 68 }
166    sub MD_NAME_STATE () { 69 }
167    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
168    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
169    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
170    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
171    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
172    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
173    sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
174    sub ALLOWED_TOKEN_STATE () { 77 }
175    sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
176    sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
177    sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
178    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
179    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
180    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
181    sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
182    sub BEFORE_NDATA_STATE () { 85 }
183    sub NDATA_STATE () { 86 }
184    sub AFTER_NDATA_STATE () { 87 }
185    sub BEFORE_NOTATION_NAME_STATE () { 88 }
186    sub NOTATION_NAME_STATE () { 89 }
187    sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
188    sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
189    sub ENTITY_VALUE_ENTITY_STATE () { 92 }
190    sub AFTER_ELEMENT_NAME_STATE () { 93 }
191    sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
192    sub CONTENT_KEYWORD_STATE () { 95 }
193    sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
194    sub CM_ELEMENT_NAME_STATE () { 97 }
195    sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
196    sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
197    sub AFTER_MD_DEF_STATE () { 100 }
198    sub BOGUS_MD_STATE () { 101 }
199    
200  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
201  ## list and descriptions)  ## list and descriptions)
# Line 131  sub FOREIGN_EL () { 0b1_00000000000 } Line 206  sub FOREIGN_EL () { 0b1_00000000000 }
206  ## Character reference mappings  ## Character reference mappings
207    
208  my $charref_map = {  my $charref_map = {
209      0x00 => 0xFFFD, # REPLACEMENT CHARACTER
210    0x0D => 0x000A,    0x0D => 0x000A,
211    0x80 => 0x20AC,    0x80 => 0x20AC,
212    0x81 => 0xFFFD,    0x81 => 0x0081,
213    0x82 => 0x201A,    0x82 => 0x201A,
214    0x83 => 0x0192,    0x83 => 0x0192,
215    0x84 => 0x201E,    0x84 => 0x201E,
# Line 145  my $charref_map = { Line 221  my $charref_map = {
221    0x8A => 0x0160,    0x8A => 0x0160,
222    0x8B => 0x2039,    0x8B => 0x2039,
223    0x8C => 0x0152,    0x8C => 0x0152,
224    0x8D => 0xFFFD,    0x8D => 0x008D,
225    0x8E => 0x017D,    0x8E => 0x017D,
226    0x8F => 0xFFFD,    0x8F => 0x008F,
227    0x90 => 0xFFFD,    0x90 => 0x0090,
228    0x91 => 0x2018,    0x91 => 0x2018,
229    0x92 => 0x2019,    0x92 => 0x2019,
230    0x93 => 0x201C,    0x93 => 0x201C,
# Line 161  my $charref_map = { Line 237  my $charref_map = {
237    0x9A => 0x0161,    0x9A => 0x0161,
238    0x9B => 0x203A,    0x9B => 0x203A,
239    0x9C => 0x0153,    0x9C => 0x0153,
240    0x9D => 0xFFFD,    0x9D => 0x009D,
241    0x9E => 0x017E,    0x9E => 0x017E,
242    0x9F => 0x0178,    0x9F => 0x0178,
243  }; # $charref_map  }; # $charref_map
244  $charref_map->{$_} = 0xFFFD  $charref_map->{$_} = $_
245      for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,      for 0x0001..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
246          0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF          0xD800..0xDFFF, 0xFDD0..0xFDEF,
247          0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,          0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
248          0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,          0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
249          0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,          0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
# Line 186  sub _initialize_tokenizer ($) { Line 262  sub _initialize_tokenizer ($) {
262    #$self->{is_xml} (if XML)    #$self->{is_xml} (if XML)
263    
264    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
265    $self->{s_kwd} = ''; # state keyword    $self->{s_kwd} = ''; # Data state keyword
266      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
267    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
268    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
269    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 216  sub _initialize_tokenizer ($) { Line 293  sub _initialize_tokenizer ($) {
293    
294  ## A token has:  ## A token has:
295  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
296  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
297  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
298  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
299    ##   ->{target} (PI_TOKEN)
300  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
301  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
302  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 226  sub _initialize_tokenizer ($) { Line 304  sub _initialize_tokenizer ($) {
304  ##        ->{name}  ##        ->{name}
305  ##        ->{value}  ##        ->{value}
306  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
307  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
308    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
309  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)  ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
310    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
311    ##   ->{has_internal_subset} == 1 or 0 (DOCTYPE_TOKEN)
312    
313  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
314  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
315  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 247  my $is_space = { Line 329  my $is_space = {
329    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
330    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
331    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
332    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
333    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
334    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
335  };  };
# Line 507  sub _get_next_token ($) { Line 589  sub _get_next_token ($) {
589        return  ($token);        return  ($token);
590        redo A;        redo A;
591      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
592          ## XML5: "tag state".
593    
594        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
595          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
596                        
# Line 525  sub _get_next_token ($) { Line 609  sub _get_next_token ($) {
609            redo A;            redo A;
610          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
611                        
612            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
613            #            #
614          } else {          } else {
615                        
616              $self->{s_kwd} = '';
617            #            #
618          }          }
619    
620          ## reconsume          ## reconsume
621          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
         $self->{s_kwd} = '';  
622          return  ({type => CHARACTER_TOKEN, data => '<',          return  ({type => CHARACTER_TOKEN, data => '<',
623                    line => $self->{line_prev},                    line => $self->{line_prev},
624                    column => $self->{column_prev},                    column => $self->{column_prev},
# Line 709  sub _get_next_token ($) { Line 793  sub _get_next_token ($) {
793        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
794        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
795    
796          ## XML5: "end tag state".
797    
798        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
799        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
800          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
801            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
802            $self->{s_kwd} = '';            $self->{kwd} = '';
803            ## Reconsume.            ## Reconsume.
804            redo A;            redo A;
805          } else {          } else {
# Line 770  sub _get_next_token ($) { Line 856  sub _get_next_token ($) {
856        
857          redo A;          redo A;
858        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
859          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
860                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
861                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
862          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
863          $self->{s_kwd} = '';          $self->{s_kwd} = '';
864                    if ($self->{is_xml}) {
865              
866              ## XML5: No parse error.
867              
868              ## NOTE: This parser raises a parse error, since it supports
869              ## XML1, not XML5.
870    
871              ## NOTE: A short end tag token.
872              my $ct = {type => END_TAG_TOKEN,
873                        tag_name => '',
874                        line => $self->{line_prev},
875                        column => $self->{column_prev} - 1,
876                       };
877              
878        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
879          $self->{line_prev} = $self->{line};
880          $self->{column_prev} = $self->{column};
881          $self->{column}++;
882          $self->{nc}
883              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
884        } else {
885          $self->{set_nc}->($self);
886        }
887      
888              return  ($ct);
889            } else {
890              
891              
892      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
893        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
894        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 787  sub _get_next_token ($) { Line 899  sub _get_next_token ($) {
899        $self->{set_nc}->($self);        $self->{set_nc}->($self);
900      }      }
901        
902            }
903          redo A;          redo A;
904        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
905                    
# Line 800  sub _get_next_token ($) { Line 913  sub _get_next_token ($) {
913                   });                   });
914    
915          redo A;          redo A;
916        } else {        } elsif (not $self->{is_xml} or
917                   $is_space->{$self->{nc}}) {
918                    
919          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
920                            line => $self->{line_prev}, # "<" of "</"
921                            column => $self->{column_prev} - 1);
922          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
923          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
924                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 815  sub _get_next_token ($) { Line 931  sub _get_next_token ($) {
931          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
932          ## "bogus comment state" entry.          ## "bogus comment state" entry.
933          redo A;          redo A;
934          } else {
935            ## XML5: "</:" is a parse error.
936            
937            $self->{ct} = {type => END_TAG_TOKEN,
938                           tag_name => chr ($self->{nc}),
939                           line => $l, column => $c};
940            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
941            
942        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
943          $self->{line_prev} = $self->{line};
944          $self->{column_prev} = $self->{column};
945          $self->{column}++;
946          $self->{nc}
947              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
948        } else {
949          $self->{set_nc}->($self);
950        }
951      
952            redo A;
953        }        }
954      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
955        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
956        if (length $ch) {        if (length $ch) {
957          my $CH = $ch;          my $CH = $ch;
958          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 825  sub _get_next_token ($) { Line 960  sub _get_next_token ($) {
960          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
961                        
962            ## Stay in the state.            ## Stay in the state.
963            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
964                        
965      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
966        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 844  sub _get_next_token ($) { Line 979  sub _get_next_token ($) {
979            $self->{s_kwd} = '';            $self->{s_kwd} = '';
980            ## Reconsume.            ## Reconsume.
981            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
982                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
983                      line => $self->{line_prev},                      line => $self->{line_prev},
984                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
985                     });                     });
986            redo A;            redo A;
987          }          }
# Line 862  sub _get_next_token ($) { Line 997  sub _get_next_token ($) {
997            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
998            $self->{s_kwd} = '';            $self->{s_kwd} = '';
999            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
1000                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
1001                      line => $self->{line_prev},                      line => $self->{line_prev},
1002                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
1003                     });                     });
1004            redo A;            redo A;
1005          } else {          } else {
# Line 873  sub _get_next_token ($) { Line 1008  sub _get_next_token ($) {
1008                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
1009                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
1010                   line => $self->{line_prev},                   line => $self->{line_prev},
1011                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
1012            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
1013            ## Reconsume.            ## Reconsume.
1014            redo A;            redo A;
# Line 968  sub _get_next_token ($) { Line 1103  sub _get_next_token ($) {
1103          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1104          # reconsume          # reconsume
1105    
1106          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1107            #return  ($self->{ct}); # start tag or end tag
1108    
1109          redo A;          redo A;
1110        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
# Line 1005  sub _get_next_token ($) { Line 1141  sub _get_next_token ($) {
1141          redo A;          redo A;
1142        }        }
1143      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1144          ## XML5: "Tag attribute name before state".
1145    
1146        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1147                    
1148          ## Stay in the state          ## Stay in the state
# Line 1107  sub _get_next_token ($) { Line 1245  sub _get_next_token ($) {
1245          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1246          # reconsume          # reconsume
1247    
1248          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1249            #return  ($self->{ct}); # start tag or end tag
1250    
1251          redo A;          redo A;
1252        } else {        } else {
1253          if ({          if ({
1254               0x0022 => 1, # "               0x0022 => 1, # "
1255               0x0027 => 1, # '               0x0027 => 1, # '
1256                 0x003C => 1, # <
1257               0x003D => 1, # =               0x003D => 1, # =
1258              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1259                        
1260              ## XML5: Not a parse error.
1261            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1262          } else {          } else {
1263                        
1264              ## XML5: ":" raises a parse error and is ignored.
1265          }          }
1266          $self->{ca}          $self->{ca}
1267              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 1140  sub _get_next_token ($) { Line 1282  sub _get_next_token ($) {
1282          redo A;          redo A;
1283        }        }
1284      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1285          ## XML5: "Tag attribute name state".
1286    
1287        my $before_leave = sub {        my $before_leave = sub {
1288          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
1289              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 1150  sub _get_next_token ($) { Line 1294  sub _get_next_token ($) {
1294                        
1295            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
1296              = $self->{ca};              = $self->{ca};
1297              $self->{ca}->{index} = ++$self->{ct}->{last_index};
1298          }          }
1299        }; # $before_leave        }; # $before_leave
1300    
# Line 1186  sub _get_next_token ($) { Line 1331  sub _get_next_token ($) {
1331        
1332          redo A;          redo A;
1333        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1334            if ($self->{is_xml}) {
1335              
1336              ## XML5: Not a parse error.
1337              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1338            } else {
1339              
1340            }
1341    
1342          $before_leave->();          $before_leave->();
1343          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1344                        
# Line 1235  sub _get_next_token ($) { Line 1388  sub _get_next_token ($) {
1388        
1389          redo A;          redo A;
1390        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1391            if ($self->{is_xml}) {
1392              
1393              ## XML5: Not a parse error.
1394              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1395            } else {
1396              
1397            }
1398                    
1399          $before_leave->();          $before_leave->();
1400          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
# Line 1272  sub _get_next_token ($) { Line 1432  sub _get_next_token ($) {
1432          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1433          # reconsume          # reconsume
1434    
1435          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1436            #return  ($self->{ct}); # start tag or end tag
1437    
1438          redo A;          redo A;
1439        } else {        } else {
1440          if ($self->{nc} == 0x0022 or # "          if ({
1441              $self->{nc} == 0x0027) { # '               0x0022 => 1, # "
1442                 0x0027 => 1, # '
1443                 0x003C => 1, # <
1444                }->{$self->{nc}}) {
1445                        
1446              ## XML5: Not a parse error.
1447            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1448          } else {          } else {
1449                        
# Line 1299  sub _get_next_token ($) { Line 1464  sub _get_next_token ($) {
1464          redo A;          redo A;
1465        }        }
1466      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1467          ## XML5: "Tag attribute name after state".
1468          
1469        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1470                    
1471          ## Stay in the state          ## Stay in the state
# Line 1330  sub _get_next_token ($) { Line 1497  sub _get_next_token ($) {
1497        
1498          redo A;          redo A;
1499        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1500            if ($self->{is_xml}) {
1501              
1502              ## XML5: Not a parse error.
1503              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1504            } else {
1505              
1506            }
1507    
1508          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1509                        
1510            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1383  sub _get_next_token ($) { Line 1558  sub _get_next_token ($) {
1558        
1559          redo A;          redo A;
1560        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1561            if ($self->{is_xml}) {
1562              
1563              ## XML5: Not a parse error.
1564              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1565            } else {
1566              
1567            }
1568                    
1569          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1570                    
# Line 1418  sub _get_next_token ($) { Line 1600  sub _get_next_token ($) {
1600          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1601          # reconsume          # reconsume
1602    
1603          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1604            #return  ($self->{ct}); # start tag or end tag
1605    
1606          redo A;          redo A;
1607        } else {        } else {
1608          if ($self->{nc} == 0x0022 or # "          if ($self->{is_xml}) {
1609              $self->{nc} == 0x0027) { # '            
1610              ## XML5: Not a parse error.
1611              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1612            } else {
1613                        
1614            }
1615    
1616            if ({
1617                 0x0022 => 1, # "
1618                 0x0027 => 1, # '
1619                 0x003C => 1, # <
1620                }->{$self->{nc}}) {
1621              
1622              ## XML5: Not a parse error.
1623            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1624          } else {          } else {
1625                        
# Line 1448  sub _get_next_token ($) { Line 1643  sub _get_next_token ($) {
1643          redo A;                  redo A;        
1644        }        }
1645      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1646          ## XML5: "Tag attribute value before state".
1647    
1648        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1649                    
1650          ## Stay in the state          ## Stay in the state
# Line 1553  sub _get_next_token ($) { Line 1750  sub _get_next_token ($) {
1750          $self->{s_kwd} = '';          $self->{s_kwd} = '';
1751          ## reconsume          ## reconsume
1752    
1753          return  ($self->{ct}); # start tag or end tag          ## Discard the token.
1754            #return  ($self->{ct}); # start tag or end tag
1755    
1756          redo A;          redo A;
1757        } else {        } else {
1758          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1759                        
1760              ## XML5: Not a parse error.
1761            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1762            } elsif ($self->{is_xml}) {
1763              
1764              ## XML5: No parse error.
1765              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1766          } else {          } else {
1767                        
1768          }          }
# Line 1579  sub _get_next_token ($) { Line 1782  sub _get_next_token ($) {
1782          redo A;          redo A;
1783        }        }
1784      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1785          ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1786          ## ATTLIST attribute value double quoted state".
1787          
1788        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1789                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1790          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1791              ## XML5: "DOCTYPE ATTLIST name after state".
1792              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1793              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1794            } else {
1795              
1796              ## XML5: "Tag attribute name before state".
1797              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1798            }
1799                    
1800      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1801        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1596  sub _get_next_token ($) { Line 1810  sub _get_next_token ($) {
1810          redo A;          redo A;
1811        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1812                    
1813            ## XML5: Not defined yet.
1814    
1815          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1816          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1817          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1615  sub _get_next_token ($) { Line 1831  sub _get_next_token ($) {
1831      }      }
1832        
1833          redo A;          redo A;
1834          } elsif ($self->{is_xml} and
1835                   $is_space->{$self->{nc}}) {
1836            
1837            $self->{ca}->{value} .= ' ';
1838            ## Stay in the state.
1839            
1840        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1841          $self->{line_prev} = $self->{line};
1842          $self->{column_prev} = $self->{column};
1843          $self->{column}++;
1844          $self->{nc}
1845              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1846        } else {
1847          $self->{set_nc}->($self);
1848        }
1849      
1850            redo A;
1851        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1852          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1853          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1854                        
1855            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1856    
1857              $self->{state} = DATA_STATE;
1858              $self->{s_kwd} = '';
1859              ## reconsume
1860              return  ($self->{ct}); # start tag
1861              redo A;
1862          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1863            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1864            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1629  sub _get_next_token ($) { Line 1868  sub _get_next_token ($) {
1868              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1869                            
1870            }            }
1871    
1872              $self->{state} = DATA_STATE;
1873              $self->{s_kwd} = '';
1874              ## reconsume
1875    
1876              ## Discard the token.
1877              #return  ($self->{ct}); # end tag
1878    
1879              redo A;
1880            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1881              ## XML5: No parse error above; not defined yet.
1882              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1883              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1884              ## Reconsume.
1885    
1886              ## Discard the token.
1887              #return  ($self->{ct}); # ATTLIST
1888    
1889              redo A;
1890          } else {          } else {
1891            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1892          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
1893        } else {        } else {
1894                    ## XML5 [ATTLIST]: Not defined yet.
1895            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1896              
1897              ## XML5: Not a parse error.
1898              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1899            } else {
1900              
1901            }
1902          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1903          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1904                                q["&],                                qq["&<\x09\x0C\x20],
1905                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1906    
1907          ## Stay in the state          ## Stay in the state
# Line 1661  sub _get_next_token ($) { Line 1919  sub _get_next_token ($) {
1919          redo A;          redo A;
1920        }        }
1921      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1922          ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1923          ## ATTLIST attribute value single quoted state".
1924    
1925        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1926                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1927          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1928              ## XML5: "DOCTYPE ATTLIST name after state".
1929              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1930              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1931            } else {
1932              
1933              ## XML5: "Before attribute name state" (sic).
1934              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1935            }
1936                    
1937      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1938        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1678  sub _get_next_token ($) { Line 1947  sub _get_next_token ($) {
1947          redo A;          redo A;
1948        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1949                    
1950            ## XML5: Not defined yet.
1951    
1952          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1953          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1954          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1697  sub _get_next_token ($) { Line 1968  sub _get_next_token ($) {
1968      }      }
1969        
1970          redo A;          redo A;
1971          } elsif ($self->{is_xml} and
1972                   $is_space->{$self->{nc}}) {
1973            
1974            $self->{ca}->{value} .= ' ';
1975            ## Stay in the state.
1976            
1977        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1978          $self->{line_prev} = $self->{line};
1979          $self->{column_prev} = $self->{column};
1980          $self->{column}++;
1981          $self->{nc}
1982              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1983        } else {
1984          $self->{set_nc}->($self);
1985        }
1986      
1987            redo A;
1988        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1989          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1990          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1991                        
1992            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1993    
1994              $self->{state} = DATA_STATE;
1995              $self->{s_kwd} = '';
1996              ## reconsume
1997    
1998              ## Discard the token.
1999              #return  ($self->{ct}); # start tag
2000    
2001              redo A;
2002          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2003            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2004            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1711  sub _get_next_token ($) { Line 2008  sub _get_next_token ($) {
2008              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2009                            
2010            }            }
2011    
2012              $self->{state} = DATA_STATE;
2013              $self->{s_kwd} = '';
2014              ## reconsume
2015    
2016              ## Discard the token.
2017              #return  ($self->{ct}); # end tag
2018    
2019              redo A;
2020            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2021              ## XML5: No parse error above; not defined yet.
2022              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2023              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2024              ## Reconsume.
2025    
2026              ## Discard the token.
2027              #return  ($self->{ct}); # ATTLIST
2028    
2029              redo A;
2030          } else {          } else {
2031            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2032          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
2033        } else {        } else {
2034                    ## XML5 [ATTLIST]: Not defined yet.
2035            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2036              
2037              ## XML5: Not a parse error.
2038              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2039            } else {
2040              
2041            }
2042          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2043          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2044                                q['&],                                qq['&<\x09\x0C\x20],
2045                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2046    
2047          ## Stay in the state          ## Stay in the state
# Line 1743  sub _get_next_token ($) { Line 2059  sub _get_next_token ($) {
2059          redo A;          redo A;
2060        }        }
2061      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2062          ## XML5: "Tag attribute value unquoted state".
2063    
2064        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2065                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2066          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;            
2067              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2068              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2069            } else {
2070              
2071              ## XML5: "Tag attribute name before state".
2072              $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2073            }
2074                    
2075      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2076        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1760  sub _get_next_token ($) { Line 2085  sub _get_next_token ($) {
2085          redo A;          redo A;
2086        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
2087                    
2088    
2089            ## XML5: Not defined yet.
2090    
2091          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
2092          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
2093          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1783  sub _get_next_token ($) { Line 2111  sub _get_next_token ($) {
2111          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2112                        
2113            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2114    
2115              $self->{state} = DATA_STATE;
2116              $self->{s_kwd} = '';
2117              
2118        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2119          $self->{line_prev} = $self->{line};
2120          $self->{column_prev} = $self->{column};
2121          $self->{column}++;
2122          $self->{nc}
2123              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2124        } else {
2125          $self->{set_nc}->($self);
2126        }
2127      
2128              return  ($self->{ct}); # start tag
2129              redo A;
2130          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2131            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2132            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1792  sub _get_next_token ($) { Line 2136  sub _get_next_token ($) {
2136              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2137                            
2138            }            }
2139          } else {  
2140            die "$0: $self->{ct}->{type}: Unknown token type";            $self->{state} = DATA_STATE;
2141          }            $self->{s_kwd} = '';
2142          $self->{state} = DATA_STATE;            
         $self->{s_kwd} = '';  
           
2143      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2144        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
2145        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 1808  sub _get_next_token ($) { Line 2150  sub _get_next_token ($) {
2150        $self->{set_nc}->($self);        $self->{set_nc}->($self);
2151      }      }
2152        
2153              return  ($self->{ct}); # end tag
2154          return  ($self->{ct}); # start tag or end tag            redo A;
2155            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2156          redo A;            push @{$self->{ct}->{attrdefs}}, $self->{ca};
2157              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2158              
2159        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2160          $self->{line_prev} = $self->{line};
2161          $self->{column_prev} = $self->{column};
2162          $self->{column}++;
2163          $self->{nc}
2164              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2165        } else {
2166          $self->{set_nc}->($self);
2167        }
2168      
2169              return  ($self->{ct}); # ATTLIST
2170              redo A;
2171            } else {
2172              die "$0: $self->{ct}->{type}: Unknown token type";
2173            }
2174        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');  
2175          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2176                        
2177              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2178            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2179    
2180              $self->{state} = DATA_STATE;
2181              $self->{s_kwd} = '';
2182              ## reconsume
2183    
2184              ## Discard the token.
2185              #return  ($self->{ct}); # start tag
2186              
2187              redo A;
2188          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2189              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2190            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2191            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
2192                            
# Line 1826  sub _get_next_token ($) { Line 2195  sub _get_next_token ($) {
2195              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2196                            
2197            }            }
2198    
2199              $self->{state} = DATA_STATE;
2200              $self->{s_kwd} = '';
2201              ## reconsume
2202    
2203              ## Discard the token.
2204              #return  ($self->{ct}); # end tag
2205    
2206              redo A;
2207            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2208              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2209              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2210              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2211              ## Reconsume.
2212    
2213              ## Discard the token.
2214              #return  ($self->{ct}); # ATTLIST
2215    
2216              redo A;
2217          } else {          } else {
2218            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2219          }          }
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
2220        } else {        } else {
2221          if ({          if ({
2222               0x0022 => 1, # "               0x0022 => 1, # "
2223               0x0027 => 1, # '               0x0027 => 1, # '
2224               0x003D => 1, # =               0x003D => 1, # =
2225                 0x003C => 1, # <
2226              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2227                        
2228              ## XML5: Not a parse error.
2229            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2230          } else {          } else {
2231                        
2232          }          }
2233          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2234          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2235                                q["'=& >],                                qq["'=& \x09\x0C>],
2236                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2237    
2238          ## Stay in the state          ## Stay in the state
# Line 1949  sub _get_next_token ($) { Line 2332  sub _get_next_token ($) {
2332          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2333          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2334          ## Reconsume.          ## Reconsume.
2335          return  ($self->{ct}); # start tag or end tag  
2336            ## Discard the token.
2337            #return  ($self->{ct}); # start tag or end tag
2338    
2339          redo A;          redo A;
2340        } else {        } else {
2341                    
# Line 1959  sub _get_next_token ($) { Line 2345  sub _get_next_token ($) {
2345          redo A;          redo A;
2346        }        }
2347      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2348          ## XML5: "Empty tag state".
2349    
2350        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2351          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
2352                        
# Line 2010  sub _get_next_token ($) { Line 2398  sub _get_next_token ($) {
2398          } else {          } else {
2399            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2400          }          }
2401            ## XML5: "Tag attribute name before state".
2402          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2403          $self->{s_kwd} = '';          $self->{s_kwd} = '';
2404          ## Reconsume.          ## Reconsume.
2405          return  ($self->{ct}); # start tag or end tag  
2406            ## Discard the token.
2407            #return  ($self->{ct}); # start tag or end tag
2408    
2409          redo A;          redo A;
2410        } else {        } else {
2411                    
# Line 2024  sub _get_next_token ($) { Line 2416  sub _get_next_token ($) {
2416          redo A;          redo A;
2417        }        }
2418      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2419        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2420    
2421        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
2422        ## consumes characters one-by-one basis.        ## consumes characters one-by-one basis.
2423                
2424        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2425                    if ($self->{in_subset}) {
2426          $self->{state} = DATA_STATE;            
2427          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2428            } else {
2429              
2430              $self->{state} = DATA_STATE;
2431              $self->{s_kwd} = '';
2432            }
2433                    
2434      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2435        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2048  sub _get_next_token ($) { Line 2445  sub _get_next_token ($) {
2445          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
2446          redo A;          redo A;
2447        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2448                    if ($self->{in_subset}) {
2449          $self->{state} = DATA_STATE;            
2450          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2451            } else {
2452              
2453              $self->{state} = DATA_STATE;
2454              $self->{s_kwd} = '';
2455            }
2456          ## reconsume          ## reconsume
2457    
2458          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2077  sub _get_next_token ($) { Line 2479  sub _get_next_token ($) {
2479          redo A;          redo A;
2480        }        }
2481      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2482        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
2483                
2484        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2485                    
# Line 2099  sub _get_next_token ($) { Line 2501  sub _get_next_token ($) {
2501          ## ASCII case-insensitive.          ## ASCII case-insensitive.
2502                    
2503          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
2504          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2505                    
2506      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2507        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2118  sub _get_next_token ($) { Line 2520  sub _get_next_token ($) {
2520                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2521                                                    
2522          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
2523          $self->{s_kwd} = '[';          $self->{kwd} = '[';
2524                    
2525      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2526        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2152  sub _get_next_token ($) { Line 2554  sub _get_next_token ($) {
2554                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2555                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
2556                                   };                                   };
2557          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2558                    
2559      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2560        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2188  sub _get_next_token ($) { Line 2590  sub _get_next_token ($) {
2590              0x0054, # T              0x0054, # T
2591              0x0059, # Y              0x0059, # Y
2592              0x0050, # P              0x0050, # P
2593            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2594            $self->{nc} == [            $self->{nc} == [
2595              undef,              undef,
2596              0x006F, # o              0x006F, # o
# Line 2196  sub _get_next_token ($) { Line 2598  sub _get_next_token ($) {
2598              0x0074, # t              0x0074, # t
2599              0x0079, # y              0x0079, # y
2600              0x0070, # p              0x0070, # p
2601            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2602                    
2603          ## Stay in the state.          ## Stay in the state.
2604          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2605                    
2606      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2607        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2212  sub _get_next_token ($) { Line 2614  sub _get_next_token ($) {
2614      }      }
2615        
2616          redo A;          redo A;
2617        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
2618                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2619                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2620                    if ($self->{is_xml} and
2621                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2622              
2623              ## XML5: case-sensitive.
2624              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2625                              text => 'DOCTYPE',
2626                              line => $self->{line_prev},
2627                              column => $self->{column_prev} - 5);
2628            } else {
2629              
2630            }
2631          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
2632          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
2633                                    quirks => 1,                                    quirks => 1,
# Line 2238  sub _get_next_token ($) { Line 2650  sub _get_next_token ($) {
2650                                    
2651          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2652                          line => $self->{line_prev},                          line => $self->{line_prev},
2653                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2654          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2655          ## Reconsume.          ## Reconsume.
2656          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2657                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2658                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2659                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2660                                   };                                   };
2661          redo A;          redo A;
2662        }        }
# Line 2255  sub _get_next_token ($) { Line 2667  sub _get_next_token ($) {
2667              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
2668              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
2669              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
2670            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
2671                    
2672          ## Stay in the state.          ## Stay in the state.
2673          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2674                    
2675      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2676        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2271  sub _get_next_token ($) { Line 2683  sub _get_next_token ($) {
2683      }      }
2684        
2685          redo A;          redo A;
2686        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
2687                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2688          if ($self->{is_xml} and          if ($self->{is_xml} and
2689              not $self->{tainted} and              not $self->{tainted} and
# Line 2306  sub _get_next_token ($) { Line 2718  sub _get_next_token ($) {
2718                    
2719          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2720                          line => $self->{line_prev},                          line => $self->{line_prev},
2721                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2722          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2723          ## Reconsume.          ## Reconsume.
2724          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2725                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2726                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2727                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2728                                   };                                   };
2729          redo A;          redo A;
2730        }        }
# Line 2333  sub _get_next_token ($) { Line 2745  sub _get_next_token ($) {
2745        
2746          redo A;          redo A;
2747        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2748          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2749          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2750          $self->{s_kwd} = '';            
2751              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2752            } else {
2753              
2754              $self->{state} = DATA_STATE;
2755              $self->{s_kwd} = '';
2756            }
2757                    
2758      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2759        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2353  sub _get_next_token ($) { Line 2770  sub _get_next_token ($) {
2770    
2771          redo A;          redo A;
2772        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2773          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2774          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2775          $self->{s_kwd} = '';            
2776              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2777            } else {
2778              
2779              $self->{state} = DATA_STATE;
2780              $self->{s_kwd} = '';
2781            }
2782          ## reconsume          ## reconsume
2783    
2784          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2397  sub _get_next_token ($) { Line 2819  sub _get_next_token ($) {
2819        
2820          redo A;          redo A;
2821        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2822          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2823          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2824          $self->{s_kwd} = '';            
2825              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2826            } else {
2827              
2828              $self->{state} = DATA_STATE;
2829              $self->{s_kwd} = '';
2830            }
2831                    
2832      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2833        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2417  sub _get_next_token ($) { Line 2844  sub _get_next_token ($) {
2844    
2845          redo A;          redo A;
2846        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2847          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2848          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2849          $self->{s_kwd} = '';            
2850              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2851            } else {
2852              
2853              $self->{state} = DATA_STATE;
2854              $self->{s_kwd} = '';
2855            }
2856          ## reconsume          ## reconsume
2857    
2858          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2445  sub _get_next_token ($) { Line 2877  sub _get_next_token ($) {
2877          redo A;          redo A;
2878        }        }
2879      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
2880          ## XML5: "Comment state" and "DOCTYPE comment state".
2881    
2882        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2883                    
2884          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
# Line 2461  sub _get_next_token ($) { Line 2895  sub _get_next_token ($) {
2895        
2896          redo A;          redo A;
2897        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2898          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2899          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2900          $self->{s_kwd} = '';            
2901              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2902            } else {
2903              
2904              $self->{state} = DATA_STATE;
2905              $self->{s_kwd} = '';
2906            }
2907          ## reconsume          ## reconsume
2908    
2909          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2492  sub _get_next_token ($) { Line 2931  sub _get_next_token ($) {
2931          redo A;          redo A;
2932        }        }
2933      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2934          ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2935    
2936        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2937                    
2938          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 2508  sub _get_next_token ($) { Line 2949  sub _get_next_token ($) {
2949        
2950          redo A;          redo A;
2951        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2952          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2953          $self->{s_kwd} = '';          if ($self->{in_subset}) {
2954          $self->{state} = DATA_STATE;            
2955          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2956            } else {
2957              
2958              $self->{state} = DATA_STATE;
2959              $self->{s_kwd} = '';
2960            }
2961          ## reconsume          ## reconsume
2962    
2963          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2535  sub _get_next_token ($) { Line 2980  sub _get_next_token ($) {
2980        
2981          redo A;          redo A;
2982        }        }
2983      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE or
2984                 $self->{state} == COMMENT_END_BANG_STATE) {
2985          ## XML5: "Comment end state" and "DOCTYPE comment end state".
2986          ## (No comment end bang state.)
2987    
2988        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2989                    if ($self->{in_subset}) {
2990          $self->{state} = DATA_STATE;            
2991          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2992            } else {
2993              
2994              $self->{state} = DATA_STATE;
2995              $self->{s_kwd} = '';
2996            }
2997                    
2998      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2999        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2556  sub _get_next_token ($) { Line 3010  sub _get_next_token ($) {
3010    
3011          redo A;          redo A;
3012        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
3013            if ($self->{state} == COMMENT_END_BANG_STATE) {
3014              
3015              $self->{ct}->{data} .= '--!'; # comment
3016              $self->{state} = COMMENT_END_DASH_STATE;
3017            } else {
3018              
3019              ## XML5: Not a parse error.
3020              $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
3021                              line => $self->{line_prev},
3022                              column => $self->{column_prev});
3023              $self->{ct}->{data} .= '-'; # comment
3024              ## Stay in the state
3025            }
3026                    
3027          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3028                          line => $self->{line_prev},        $self->{line_prev} = $self->{line};
3029                          column => $self->{column_prev});        $self->{column_prev} = $self->{column};
3030          $self->{ct}->{data} .= '-'; # comment        $self->{column}++;
3031          ## Stay in the state        $self->{nc}
3032              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3033        } else {
3034          $self->{set_nc}->($self);
3035        }
3036      
3037            redo A;
3038          } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3039                   $is_space->{$self->{nc}}) {
3040            
3041            $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end space'); # XXX error type
3042            $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3043            $self->{state} = COMMENT_END_SPACE_STATE;
3044                    
3045      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3046        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2574  sub _get_next_token ($) { Line 3053  sub _get_next_token ($) {
3053      }      }
3054        
3055          redo A;          redo A;
3056        } elsif ($self->{nc} == -1) {        } elsif ($self->{state} != COMMENT_END_BANG_STATE and
3057                   $self->{nc} == 0x0021) { # !
3058            
3059            $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type
3060            $self->{state} = COMMENT_END_BANG_STATE;
3061                    
3062        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3063          $self->{line_prev} = $self->{line};
3064          $self->{column_prev} = $self->{column};
3065          $self->{column}++;
3066          $self->{nc}
3067              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3068        } else {
3069          $self->{set_nc}->($self);
3070        }
3071      
3072            redo A;
3073          } elsif ($self->{nc} == -1) {
3074          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3075          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
3076          $self->{s_kwd} = '';            
3077          ## reconsume            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3078            } else {
3079              
3080              $self->{state} = DATA_STATE;
3081              $self->{s_kwd} = '';
3082            }
3083            ## Reconsume.
3084    
3085          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
3086    
3087          redo A;          redo A;
3088        } else {        } else {
3089                    
3090          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          if ($self->{state} == COMMENT_END_BANG_STATE) {
3091                          line => $self->{line_prev},            $self->{ct}->{data} .= '--!' . chr ($self->{nc}); # comment
3092                          column => $self->{column_prev});          } else {
3093          $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment            $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
3094            }
3095          $self->{state} = COMMENT_STATE;          $self->{state} = COMMENT_STATE;
3096                    
3097      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2604  sub _get_next_token ($) { Line 3106  sub _get_next_token ($) {
3106        
3107          redo A;          redo A;
3108        }        }
3109        } elsif ($self->{state} == COMMENT_END_SPACE_STATE) {
3110          ## XML5: Not exist.
3111    
3112          if ($self->{nc} == 0x003E) { # >
3113            if ($self->{in_subset}) {
3114              
3115              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3116            } else {
3117              
3118              $self->{state} = DATA_STATE;
3119              $self->{s_kwd} = '';
3120            }
3121            
3122        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3123          $self->{line_prev} = $self->{line};
3124          $self->{column_prev} = $self->{column};
3125          $self->{column}++;
3126          $self->{nc}
3127              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3128        } else {
3129          $self->{set_nc}->($self);
3130        }
3131      
3132    
3133            return  ($self->{ct}); # comment
3134    
3135            redo A;
3136          } elsif ($is_space->{$self->{nc}}) {
3137            
3138            $self->{ct}->{data} .= chr ($self->{nc}); # comment
3139            ## Stay in the state.
3140            
3141        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3142          $self->{line_prev} = $self->{line};
3143          $self->{column_prev} = $self->{column};
3144          $self->{column}++;
3145          $self->{nc}
3146              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3147        } else {
3148          $self->{set_nc}->($self);
3149        }
3150      
3151            redo A;
3152          } elsif ($self->{nc} == -1) {
3153            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
3154            if ($self->{in_subset}) {
3155              
3156              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3157            } else {
3158              
3159              $self->{state} = DATA_STATE;
3160              $self->{s_kwd} = '';
3161            }
3162            ## Reconsume.
3163    
3164            return  ($self->{ct}); # comment
3165    
3166            redo A;
3167          } else {
3168            
3169            $self->{ct}->{data} .= chr ($self->{nc}); # comment
3170            $self->{state} = COMMENT_STATE;
3171            
3172        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3173          $self->{line_prev} = $self->{line};
3174          $self->{column_prev} = $self->{column};
3175          $self->{column}++;
3176          $self->{nc}
3177              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3178        } else {
3179          $self->{set_nc}->($self);
3180        }
3181      
3182            redo A;
3183          }
3184      } elsif ($self->{state} == DOCTYPE_STATE) {      } elsif ($self->{state} == DOCTYPE_STATE) {
3185        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3186                    
# Line 2620  sub _get_next_token ($) { Line 3197  sub _get_next_token ($) {
3197      }      }
3198        
3199          redo A;          redo A;
3200          } elsif ($self->{nc} == -1) {
3201            
3202            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3203            $self->{ct}->{quirks} = 1;
3204    
3205            $self->{state} = DATA_STATE;
3206            ## Reconsume.
3207            return  ($self->{ct}); # DOCTYPE (quirks)
3208    
3209            redo A;
3210        } else {        } else {
3211                    
3212            ## XML5: Swith to the bogus comment state.
3213          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3214          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3215          ## reconsume          ## reconsume
3216          redo A;          redo A;
3217        }        }
3218      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3219          ## XML5: "DOCTYPE root name before state".
3220    
3221        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3222                    
3223          ## Stay in the state          ## Stay in the state
# Line 2645  sub _get_next_token ($) { Line 3235  sub _get_next_token ($) {
3235          redo A;          redo A;
3236        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3237                    
3238            ## XML5: No parse error.
3239          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3240          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3241          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 2663  sub _get_next_token ($) { Line 3254  sub _get_next_token ($) {
3254          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3255    
3256          redo A;          redo A;
3257          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3258            
3259            $self->{ct}->{name} # DOCTYPE
3260                = chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3261            delete $self->{ct}->{quirks};
3262            $self->{state} = DOCTYPE_NAME_STATE;
3263            
3264        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3265          $self->{line_prev} = $self->{line};
3266          $self->{column_prev} = $self->{column};
3267          $self->{column}++;
3268          $self->{nc}
3269              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3270        } else {
3271          $self->{set_nc}->($self);
3272        }
3273      
3274            redo A;
3275        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3276                    
3277          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
# Line 2673  sub _get_next_token ($) { Line 3282  sub _get_next_token ($) {
3282          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3283    
3284          redo A;          redo A;
3285          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3286            
3287            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3288            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3289            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3290            $self->{in_subset} = 1;
3291            
3292        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3293          $self->{line_prev} = $self->{line};
3294          $self->{column_prev} = $self->{column};
3295          $self->{column}++;
3296          $self->{nc}
3297              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3298        } else {
3299          $self->{set_nc}->($self);
3300        }
3301      
3302            return  ($self->{ct}); # DOCTYPE
3303            redo A;
3304        } else {        } else {
3305                    
3306          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 2692  sub _get_next_token ($) { Line 3320  sub _get_next_token ($) {
3320          redo A;          redo A;
3321        }        }
3322      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3323  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
3324    
3325          ## ISSUE: Redundant "First," in the spec.
3326    
3327        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3328                    
3329          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 2727  sub _get_next_token ($) { Line 3358  sub _get_next_token ($) {
3358          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3359    
3360          redo A;          redo A;
3361          } elsif (0x0041 <= $self->{nc} and $self->{nc} <= 0x005A) { # A..Z
3362            
3363            $self->{ct}->{name} # DOCTYPE
3364                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
3365            delete $self->{ct}->{quirks};
3366            ## Stay in the state.
3367            
3368        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3369          $self->{line_prev} = $self->{line};
3370          $self->{column_prev} = $self->{column};
3371          $self->{column}++;
3372          $self->{nc}
3373              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3374        } else {
3375          $self->{set_nc}->($self);
3376        }
3377      
3378            redo A;
3379        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3380                    
3381          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
# Line 2738  sub _get_next_token ($) { Line 3387  sub _get_next_token ($) {
3387          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3388    
3389          redo A;          redo A;
3390          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3391            
3392            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3393            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3394            $self->{in_subset} = 1;
3395            
3396        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3397          $self->{line_prev} = $self->{line};
3398          $self->{column_prev} = $self->{column};
3399          $self->{column}++;
3400          $self->{nc}
3401              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3402        } else {
3403          $self->{set_nc}->($self);
3404        }
3405      
3406            return  ($self->{ct}); # DOCTYPE
3407            redo A;
3408        } else {        } else {
3409                    
3410          $self->{ct}->{name}          $self->{ct}->{name} .= chr ($self->{nc}); # DOCTYPE
3411            .= chr ($self->{nc}); # DOCTYPE          ## Stay in the state.
         ## Stay in the state  
3412                    
3413      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3414        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2757  sub _get_next_token ($) { Line 3423  sub _get_next_token ($) {
3423          redo A;          redo A;
3424        }        }
3425      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3426          ## XML5: Corresponding to XML5's "DOCTYPE root name after
3427          ## state", but implemented differently.
3428    
3429        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3430                    
3431          ## Stay in the state          ## Stay in the state
# Line 2773  sub _get_next_token ($) { Line 3442  sub _get_next_token ($) {
3442        
3443          redo A;          redo A;
3444        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3445            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3446              
3447              $self->{state} = DATA_STATE;
3448              $self->{s_kwd} = '';
3449            } else {
3450              
3451              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3452              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3453            }
3454                    
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3455                    
3456      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3457        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2787  sub _get_next_token ($) { Line 3463  sub _get_next_token ($) {
3463        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3464      }      }
3465        
3466            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3467          redo A;          redo A;
3468        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3469            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3470              
3471              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3472              $self->{state} = DATA_STATE;
3473              $self->{s_kwd} = '';
3474              $self->{ct}->{quirks} = 1;
3475            } else {
3476              
3477              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3478              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3479            }
3480                    
3481          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          ## Reconsume.
3482          $self->{state} = DATA_STATE;          return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{s_kwd} = '';  
         ## reconsume  
   
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3483          redo A;          redo A;
3484        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
3485                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
3486            
3487          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
3488          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3489                    
3490      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3491        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2820  sub _get_next_token ($) { Line 3500  sub _get_next_token ($) {
3500          redo A;          redo A;
3501        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
3502                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
3503            
3504          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
3505          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3506                    
3507      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3508        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2834  sub _get_next_token ($) { Line 3515  sub _get_next_token ($) {
3515      }      }
3516        
3517          redo A;          redo A;
3518        } else {        } elsif ($self->{nc} == 0x0022 and # "
3519                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3520                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3521                    
3522          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');          $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3523          $self->{ct}->{quirks} = 1;          $self->{ct}->{value} = ''; # ENTITY
3524            
3525        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3526          $self->{line_prev} = $self->{line};
3527          $self->{column_prev} = $self->{column};
3528          $self->{column}++;
3529          $self->{nc}
3530              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3531        } else {
3532          $self->{set_nc}->($self);
3533        }
3534      
3535            redo A;
3536          } elsif ($self->{nc} == 0x0027 and # '
3537                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3538                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3539            
3540            $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3541            $self->{ct}->{value} = ''; # ENTITY
3542            
3543        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3544          $self->{line_prev} = $self->{line};
3545          $self->{column_prev} = $self->{column};
3546          $self->{column}++;
3547          $self->{nc}
3548              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3549        } else {
3550          $self->{set_nc}->($self);
3551        }
3552      
3553            redo A;
3554          } elsif ($self->{is_xml} and
3555                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3556                   $self->{nc} == 0x005B) { # [
3557            
3558            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3559            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3560            $self->{in_subset} = 1;
3561            
3562        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3563          $self->{line_prev} = $self->{line};
3564          $self->{column_prev} = $self->{column};
3565          $self->{column}++;
3566          $self->{nc}
3567              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3568        } else {
3569          $self->{set_nc}->($self);
3570        }
3571      
3572            return  ($self->{ct}); # DOCTYPE
3573            redo A;
3574          } else {
3575            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3576    
3577            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3578              
3579              $self->{ct}->{quirks} = 1;
3580              $self->{state} = BOGUS_DOCTYPE_STATE;
3581            } else {
3582              
3583              $self->{state} = BOGUS_MD_STATE;
3584            }
3585    
         $self->{state} = BOGUS_DOCTYPE_STATE;  
3586                    
3587      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3588        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2861  sub _get_next_token ($) { Line 3604  sub _get_next_token ($) {
3604              0x0042, # B              0x0042, # B
3605              0x004C, # L              0x004C, # L
3606              0x0049, # I              0x0049, # I
3607            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3608            $self->{nc} == [            $self->{nc} == [
3609              undef,              undef,
3610              0x0075, # u              0x0075, # u
3611              0x0062, # b              0x0062, # b
3612              0x006C, # l              0x006C, # l
3613              0x0069, # i              0x0069, # i
3614            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3615                    
3616          ## Stay in the state.          ## Stay in the state.
3617          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3618                    
3619      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3620        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2884  sub _get_next_token ($) { Line 3627  sub _get_next_token ($) {
3627      }      }
3628        
3629          redo A;          redo A;
3630        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3631                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
3632                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
3633                    if ($self->{is_xml} and
3634                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3635              
3636              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3637                              text => 'PUBLIC',
3638                              line => $self->{line_prev},
3639                              column => $self->{column_prev} - 4);
3640            } else {
3641              
3642            }
3643          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3644                    
3645      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2902  sub _get_next_token ($) { Line 3654  sub _get_next_token ($) {
3654        
3655          redo A;          redo A;
3656        } else {        } else {
3657                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3658                          line => $self->{line_prev},                          line => $self->{line_prev},
3659                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3660          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3661              
3662          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3663              $self->{state} = BOGUS_DOCTYPE_STATE;
3664            } else {
3665              
3666              $self->{state} = BOGUS_MD_STATE;
3667            }
3668          ## Reconsume.          ## Reconsume.
3669          redo A;          redo A;
3670        }        }
# Line 2920  sub _get_next_token ($) { Line 3676  sub _get_next_token ($) {
3676              0x0053, # S              0x0053, # S
3677              0x0054, # T              0x0054, # T
3678              0x0045, # E              0x0045, # E
3679            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3680            $self->{nc} == [            $self->{nc} == [
3681              undef,              undef,
3682              0x0079, # y              0x0079, # y
3683              0x0073, # s              0x0073, # s
3684              0x0074, # t              0x0074, # t
3685              0x0065, # e              0x0065, # e
3686            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3687                    
3688          ## Stay in the state.          ## Stay in the state.
3689          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3690                    
3691      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3692        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2943  sub _get_next_token ($) { Line 3699  sub _get_next_token ($) {
3699      }      }
3700        
3701          redo A;          redo A;
3702        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3703                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
3704                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
3705                    if ($self->{is_xml} and
3706                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3707              
3708              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3709                              text => 'SYSTEM',
3710                              line => $self->{line_prev},
3711                              column => $self->{column_prev} - 4);
3712            } else {
3713              
3714            }
3715          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3716                    
3717      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2961  sub _get_next_token ($) { Line 3726  sub _get_next_token ($) {
3726        
3727          redo A;          redo A;
3728        } else {        } else {
3729                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3730                          line => $self->{line_prev},                          line => $self->{line_prev},
3731                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3732          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3733              
3734          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3735              $self->{state} = BOGUS_DOCTYPE_STATE;
3736            } else {
3737              
3738              $self->{state} = BOGUS_MD_STATE;
3739            }
3740          ## Reconsume.          ## Reconsume.
3741          redo A;          redo A;
3742        }        }
# Line 3020  sub _get_next_token ($) { Line 3789  sub _get_next_token ($) {
3789        
3790          redo A;          redo A;
3791        } elsif ($self->{nc} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
           
3792          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3793            
3794          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3795          $self->{s_kwd} = '';            
3796              $self->{state} = DATA_STATE;
3797              $self->{s_kwd} = '';
3798              $self->{ct}->{quirks} = 1;
3799            } else {
3800              
3801              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3802            }
3803            
3804                    
3805      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3806        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3036  sub _get_next_token ($) { Line 3812  sub _get_next_token ($) {
3812        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3813      }      }
3814        
3815            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3816          redo A;          redo A;
3817        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3818            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3819              
3820              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3821              $self->{state} = DATA_STATE;
3822              $self->{s_kwd} = '';
3823              $self->{ct}->{quirks} = 1;
3824            } else {
3825              
3826              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3827              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3828            }
3829                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
3830          ## reconsume          ## reconsume
   
         $self->{ct}->{quirks} = 1;  
3831          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3832          redo A;          redo A;
3833        } else {        } elsif ($self->{is_xml} and
3834                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3835                   $self->{nc} == 0x005B) { # [
3836                    
3837            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3838            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3839            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3840            $self->{in_subset} = 1;
3841            
3842        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3843          $self->{line_prev} = $self->{line};
3844          $self->{column_prev} = $self->{column};
3845          $self->{column}++;
3846          $self->{nc}
3847              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3848        } else {
3849          $self->{set_nc}->($self);
3850        }
3851      
3852            return  ($self->{ct}); # DOCTYPE
3853            redo A;
3854          } else {
3855          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
         $self->{ct}->{quirks} = 1;  
3856    
3857          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3858              
3859              $self->{ct}->{quirks} = 1;
3860              $self->{state} = BOGUS_DOCTYPE_STATE;
3861            } else {
3862              
3863              $self->{state} = BOGUS_MD_STATE;
3864            }
3865    
3866                    
3867      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3868        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3089  sub _get_next_token ($) { Line 3893  sub _get_next_token ($) {
3893        
3894          redo A;          redo A;
3895        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3896          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3897    
3898          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3899          $self->{s_kwd} = '';            
3900              $self->{state} = DATA_STATE;
3901              $self->{s_kwd} = '';
3902              $self->{ct}->{quirks} = 1;
3903            } else {
3904              
3905              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3906            }
3907    
3908                    
3909      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3910        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3105  sub _get_next_token ($) { Line 3916  sub _get_next_token ($) {
3916        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3917      }      }
3918        
3919            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3920          redo A;          redo A;
3921        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3922          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3923    
3924          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3925          $self->{s_kwd} = '';            
3926          ## reconsume            $self->{state} = DATA_STATE;
3927              $self->{s_kwd} = '';
3928          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
3929            } else {
3930              
3931              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3932            }
3933            
3934            ## Reconsume.
3935          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3936          redo A;          redo A;
3937        } else {        } else {
3938                    
3939          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3940          $self->{read_until}->($self->{ct}->{pubid}, q[">],          $self->{read_until}->($self->{ct}->{pubid}, q[">],
3941                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3942    
# Line 3160  sub _get_next_token ($) { Line 3971  sub _get_next_token ($) {
3971        
3972          redo A;          redo A;
3973        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3974          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3975    
3976          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3977          $self->{s_kwd} = '';            
3978              $self->{state} = DATA_STATE;
3979              $self->{s_kwd} = '';
3980              $self->{ct}->{quirks} = 1;
3981            } else {
3982              
3983              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3984            }
3985    
3986                    
3987      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3988        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3176  sub _get_next_token ($) { Line 3994  sub _get_next_token ($) {
3994        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3995      }      }
3996        
3997            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3998          redo A;          redo A;
3999        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4000          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
4001    
4002          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4003          $self->{s_kwd} = '';            
4004              $self->{state} = DATA_STATE;
4005              $self->{s_kwd} = '';
4006              $self->{ct}->{quirks} = 1;
4007            } else {
4008              
4009              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4010            }
4011          
4012          ## reconsume          ## reconsume
4013            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4014          redo A;          redo A;
4015        } else {        } else {
4016                    
4017          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4018          $self->{read_until}->($self->{ct}->{pubid}, q['>],          $self->{read_until}->($self->{ct}->{pubid}, q['>],
4019                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
4020    
# Line 3232  sub _get_next_token ($) { Line 4050  sub _get_next_token ($) {
4050          redo A;          redo A;
4051        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
4052                    
4053          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4054          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
4055                    
4056      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3248  sub _get_next_token ($) { Line 4066  sub _get_next_token ($) {
4066          redo A;          redo A;
4067        } elsif ($self->{nc} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
4068                    
4069          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
4070          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
4071                    
4072      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3263  sub _get_next_token ($) { Line 4081  sub _get_next_token ($) {
4081        
4082          redo A;          redo A;
4083        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4084            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4085              if ($self->{is_xml}) {
4086                
4087                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4088              } else {
4089                
4090              }
4091              $self->{state} = DATA_STATE;
4092              $self->{s_kwd} = '';
4093            } else {
4094              if ($self->{ct}->{type} == NOTATION_TOKEN) {
4095                
4096              } else {
4097                
4098                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');            
4099              }
4100              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4101            }
4102                    
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
4103                    
4104      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4105        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3277  sub _get_next_token ($) { Line 4111  sub _get_next_token ($) {
4111        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4112      }      }
4113        
4114            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
4115          redo A;          redo A;
4116        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4117            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4118              
4119              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4120              
4121              $self->{state} = DATA_STATE;
4122              $self->{s_kwd} = '';
4123              $self->{ct}->{quirks} = 1;
4124            } else {
4125              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4126              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4127            }
4128                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
4129          ## reconsume          ## reconsume
4130            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4131          $self->{ct}->{quirks} = 1;          redo A;
4132          } elsif ($self->{is_xml} and
4133                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4134                   $self->{nc} == 0x005B) { # [
4135            
4136            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4137            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4138            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4139            $self->{in_subset} = 1;
4140            
4141        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4142          $self->{line_prev} = $self->{line};
4143          $self->{column_prev} = $self->{column};
4144          $self->{column}++;
4145          $self->{nc}
4146              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4147        } else {
4148          $self->{set_nc}->($self);
4149        }
4150      
4151          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
4152          redo A;          redo A;
4153        } else {        } else {
           
4154          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
         $self->{ct}->{quirks} = 1;  
4155    
4156          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4157              
4158              $self->{ct}->{quirks} = 1;
4159              $self->{state} = BOGUS_DOCTYPE_STATE;
4160            } else {
4161              
4162              $self->{state} = BOGUS_MD_STATE;
4163            }
4164    
4165                    
4166      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4167        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3361  sub _get_next_token ($) { Line 4224  sub _get_next_token ($) {
4224        
4225          redo A;          redo A;
4226        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
4227          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
4228                    
4229      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4230        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3377  sub _get_next_token ($) { Line 4237  sub _get_next_token ($) {
4237      }      }
4238        
4239    
4240          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4241          return  ($self->{ct}); # DOCTYPE            
4242              $self->{state} = DATA_STATE;
4243              $self->{s_kwd} = '';
4244              $self->{ct}->{quirks} = 1;
4245            } else {
4246              
4247              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4248            }
4249    
4250            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4251          redo A;          redo A;
4252        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4253            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4254              
4255              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4256              $self->{state} = DATA_STATE;
4257              $self->{s_kwd} = '';
4258              $self->{ct}->{quirks} = 1;
4259            } else {
4260              
4261              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4262              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4263            }
4264                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
         $self->{s_kwd} = '';  
4265          ## reconsume          ## reconsume
4266            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4267            redo A;
4268          } elsif ($self->{is_xml} and
4269                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4270                   $self->{nc} == 0x005B) { # [
4271            
4272            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4273    
4274          $self->{ct}->{quirks} = 1;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4275            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4276            $self->{in_subset} = 1;
4277            
4278        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4279          $self->{line_prev} = $self->{line};
4280          $self->{column_prev} = $self->{column};
4281          $self->{column}++;
4282          $self->{nc}
4283              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4284        } else {
4285          $self->{set_nc}->($self);
4286        }
4287      
4288          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
4289          redo A;          redo A;
4290        } else {        } else {
           
4291          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
         $self->{ct}->{quirks} = 1;  
4292    
4293          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4294                        
4295              $self->{ct}->{quirks} = 1;
4296              $self->{state} = BOGUS_DOCTYPE_STATE;
4297            } else {
4298              
4299              $self->{state} = BOGUS_MD_STATE;
4300            }
4301    
4302                    
4303      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4304        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3428  sub _get_next_token ($) { Line 4328  sub _get_next_token ($) {
4328      }      }
4329        
4330          redo A;          redo A;
4331        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
           
4332          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4333    
4334          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4335          $self->{s_kwd} = '';            
4336              $self->{state} = DATA_STATE;
4337              $self->{s_kwd} = '';
4338              $self->{ct}->{quirks} = 1;
4339            } else {
4340              
4341              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4342            }
4343            
4344                    
4345      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4346        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3445  sub _get_next_token ($) { Line 4352  sub _get_next_token ($) {
4352        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4353      }      }
4354        
4355            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4356          redo A;          redo A;
4357        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4358          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4359    
4360          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4361          $self->{s_kwd} = '';            
4362              $self->{state} = DATA_STATE;
4363              $self->{s_kwd} = '';
4364              $self->{ct}->{quirks} = 1;
4365            } else {
4366              
4367              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4368            }
4369            
4370          ## reconsume          ## reconsume
4371            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4372          redo A;          redo A;
4373        } else {        } else {
4374                    
4375          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4376          $self->{read_until}->($self->{ct}->{sysid}, q[">],          $self->{read_until}->($self->{ct}->{sysid}, q[">],
4377                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4378    
# Line 3499  sub _get_next_token ($) { Line 4406  sub _get_next_token ($) {
4406      }      }
4407        
4408          redo A;          redo A;
4409        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4410                    
4411          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4412    
# Line 3522  sub _get_next_token ($) { Line 4429  sub _get_next_token ($) {
4429    
4430          redo A;          redo A;
4431        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4432          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4433    
4434          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4435          $self->{s_kwd} = '';            
4436          ## reconsume            $self->{state} = DATA_STATE;
4437              $self->{s_kwd} = '';
4438          $self->{ct}->{quirks} = 1;            $self->{ct}->{quirks} = 1;
4439          return  ($self->{ct}); # DOCTYPE          } else {
4440              
4441              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4442            }
4443    
4444            ## reconsume
4445            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4446          redo A;          redo A;
4447        } else {        } else {
4448                    
4449          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4450          $self->{read_until}->($self->{ct}->{sysid}, q['>],          $self->{read_until}->($self->{ct}->{sysid}, q['>],
4451                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4452    
# Line 3556  sub _get_next_token ($) { Line 4466  sub _get_next_token ($) {
4466        }        }
4467      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4468        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4469                    if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4470          ## Stay in the state            
4471              $self->{state} = BEFORE_NDATA_STATE;
4472            } else {
4473              
4474              ## Stay in the state
4475            }
4476                    
4477      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4478        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3571  sub _get_next_token ($) { Line 4486  sub _get_next_token ($) {
4486        
4487          redo A;          redo A;
4488        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4489            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4490              
4491              $self->{state} = DATA_STATE;
4492              $self->{s_kwd} = '';
4493            } else {
4494              
4495              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4496            }
4497    
4498                    
4499          $self->{state} = DATA_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4500          $self->{s_kwd} = '';        $self->{line_prev} = $self->{line};
4501          $self->{column_prev} = $self->{column};
4502          $self->{column}++;
4503          $self->{nc}
4504              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4505        } else {
4506          $self->{set_nc}->($self);
4507        }
4508      
4509            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4510            redo A;
4511          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4512                   ($self->{nc} == 0x004E or # N
4513                    $self->{nc} == 0x006E)) { # n
4514            
4515            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4516            $self->{state} = NDATA_STATE;
4517            $self->{kwd} = chr $self->{nc};
4518                    
4519      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4520        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3585  sub _get_next_token ($) { Line 4526  sub _get_next_token ($) {
4526        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4527      }      }
4528        
4529            redo A;
4530          } elsif ($self->{nc} == -1) {
4531            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4532              
4533              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4534              $self->{state} = DATA_STATE;
4535              $self->{s_kwd} = '';
4536              $self->{ct}->{quirks} = 1;
4537            } else {
4538              
4539              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4540              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4541            }
4542    
4543            ## reconsume
4544            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4545            redo A;
4546          } elsif ($self->{is_xml} and
4547                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4548                   $self->{nc} == 0x005B) { # [
4549            
4550            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4551            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4552            $self->{in_subset} = 1;
4553            
4554        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4555          $self->{line_prev} = $self->{line};
4556          $self->{column_prev} = $self->{column};
4557          $self->{column}++;
4558          $self->{nc}
4559              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4560        } else {
4561          $self->{set_nc}->($self);
4562        }
4563      
4564          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4565            redo A;
4566          } else {
4567            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4568    
4569            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4570              
4571              #$self->{ct}->{quirks} = 1;
4572              $self->{state} = BOGUS_DOCTYPE_STATE;
4573            } else {
4574              
4575              $self->{state} = BOGUS_MD_STATE;
4576            }
4577    
4578            
4579        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4580          $self->{line_prev} = $self->{line};
4581          $self->{column_prev} = $self->{column};
4582          $self->{column}++;
4583          $self->{nc}
4584              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4585        } else {
4586          $self->{set_nc}->($self);
4587        }
4588      
4589            redo A;
4590          }
4591        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4592          if ($is_space->{$self->{nc}}) {
4593            
4594            ## Stay in the state.
4595            
4596        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4597          $self->{line_prev} = $self->{line};
4598          $self->{column_prev} = $self->{column};
4599          $self->{column}++;
4600          $self->{nc}
4601              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4602        } else {
4603          $self->{set_nc}->($self);
4604        }
4605      
4606            redo A;
4607          } elsif ($self->{nc} == 0x003E) { # >
4608            
4609            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4610            
4611        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4612          $self->{line_prev} = $self->{line};
4613          $self->{column_prev} = $self->{column};
4614          $self->{column}++;
4615          $self->{nc}
4616              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4617        } else {
4618          $self->{set_nc}->($self);
4619        }
4620      
4621            return  ($self->{ct}); # ENTITY
4622            redo A;
4623          } elsif ($self->{nc} == 0x004E or # N
4624                   $self->{nc} == 0x006E) { # n
4625            
4626            $self->{state} = NDATA_STATE;
4627            $self->{kwd} = chr $self->{nc};
4628            
4629        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4630          $self->{line_prev} = $self->{line};
4631          $self->{column_prev} = $self->{column};
4632          $self->{column}++;
4633          $self->{nc}
4634              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4635        } else {
4636          $self->{set_nc}->($self);
4637        }
4638      
4639          redo A;          redo A;
4640        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4641                    
4642          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4643          $self->{state} = DATA_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
         $self->{s_kwd} = '';  
4644          ## reconsume          ## reconsume
4645            return  ($self->{ct}); # ENTITY
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4646          redo A;          redo A;
4647        } else {        } else {
4648                    
4649          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4650          #$self->{ct}->{quirks} = 1;          $self->{state} = BOGUS_MD_STATE;
   
         $self->{state} = BOGUS_DOCTYPE_STATE;  
4651                    
4652      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4653        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3639  sub _get_next_token ($) { Line 4681  sub _get_next_token ($) {
4681          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4682    
4683          redo A;          redo A;
4684          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4685            
4686            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4687            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4688            $self->{in_subset} = 1;
4689            
4690        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4691          $self->{line_prev} = $self->{line};
4692          $self->{column_prev} = $self->{column};
4693          $self->{column}++;
4694          $self->{nc}
4695              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4696        } else {
4697          $self->{set_nc}->($self);
4698        }
4699      
4700            return  ($self->{ct}); # DOCTYPE
4701            redo A;
4702        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4703                    
4704          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
# Line 3651  sub _get_next_token ($) { Line 4711  sub _get_next_token ($) {
4711        } else {        } else {
4712                    
4713          my $s = '';          my $s = '';
4714          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
4715    
4716          ## Stay in the state          ## Stay in the state
4717                    
# Line 3671  sub _get_next_token ($) { Line 4731  sub _get_next_token ($) {
4731        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
4732        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4733        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
4734    
4735          ## XML5: "CDATA state".
4736                
4737        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4738                    
# Line 3697  sub _get_next_token ($) { Line 4759  sub _get_next_token ($) {
4759    
4760          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4761          $self->{s_kwd} = '';          $self->{s_kwd} = '';
4762                    ## Reconsume.
     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {  
       $self->{line_prev} = $self->{line};  
       $self->{column_prev} = $self->{column};  
       $self->{column}++;  
       $self->{nc}  
           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);  
     } else {  
       $self->{set_nc}->($self);  
     }  
     
4763          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
4764                        
4765            return  ($self->{ct}); # character            return  ($self->{ct}); # character
# Line 3740  sub _get_next_token ($) { Line 4792  sub _get_next_token ($) {
4792    
4793        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
4794      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4795          ## XML5: "CDATA bracket state".
4796    
4797        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4798                    
4799          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 3757  sub _get_next_token ($) { Line 4811  sub _get_next_token ($) {
4811          redo A;          redo A;
4812        } else {        } else {
4813                    
4814            ## XML5: If EOF, "]" is not appended and changed to the data state.
4815          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
4816          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4817          ## Reconsume.          ## Reconsume.
4818          redo A;          redo A;
4819        }        }
4820      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4821          ## XML5: "CDATA end state".
4822    
4823        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4824          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4825          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 3805  sub _get_next_token ($) { Line 4862  sub _get_next_token ($) {
4862                    
4863          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
4864          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
4865          ## Reconsume.          ## Reconsume. ## XML5: Emit.
4866          redo A;          redo A;
4867        }        }
4868      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 3814  sub _get_next_token ($) { Line 4871  sub _get_next_token ($) {
4871              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4872              $self->{entity_add} => 1,              $self->{entity_add} => 1,
4873            }->{$self->{nc}}) {            }->{$self->{nc}}) {
4874                    if ($self->{is_xml}) {
4875              
4876              $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4877                              line => $self->{line_prev},
4878                              column => $self->{column_prev}
4879                                  + ($self->{nc} == -1 ? 1 : 0));
4880            } else {
4881              
4882              ## No error
4883            }
4884          ## Don't consume          ## Don't consume
         ## No error  
4885          ## Return nothing.          ## Return nothing.
4886          #          #
4887        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
4888                    
4889          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
4890          $self->{s_kwd} = '#';          $self->{kwd} = '#';
4891                    
4892      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4893        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3835  sub _get_next_token ($) { Line 4900  sub _get_next_token ($) {
4900      }      }
4901        
4902          redo A;          redo A;
4903        } elsif ((0x0041 <= $self->{nc} and        } elsif ($self->{is_xml} or
4904                   (0x0041 <= $self->{nc} and
4905                  $self->{nc} <= 0x005A) or # A..Z                  $self->{nc} <= 0x005A) or # A..Z
4906                 (0x0061 <= $self->{nc} and                 (0x0061 <= $self->{nc} and
4907                  $self->{nc} <= 0x007A)) { # a..z                  $self->{nc} <= 0x007A)) { # a..z
4908                    
4909          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
4910          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
4911          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
4912          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
4913          $self->{entity__match} = 0;          $self->{entity__match} = 0;
4914                    
4915      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3889  sub _get_next_token ($) { Line 4955  sub _get_next_token ($) {
4955          redo A;          redo A;
4956        }        }
4957      } elsif ($self->{state} == ENTITY_HASH_STATE) {      } elsif ($self->{state} == ENTITY_HASH_STATE) {
4958        if ($self->{nc} == 0x0078 or # x        if ($self->{nc} == 0x0078) { # x
           $self->{nc} == 0x0058) { # X  
4959                    
4960          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
4961          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4962            
4963        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4964          $self->{line_prev} = $self->{line};
4965          $self->{column_prev} = $self->{column};
4966          $self->{column}++;
4967          $self->{nc}
4968              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4969        } else {
4970          $self->{set_nc}->($self);
4971        }
4972      
4973            redo A;
4974          } elsif ($self->{nc} == 0x0058) { # X
4975            
4976            if ($self->{is_xml}) {
4977              $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4978            }
4979            $self->{state} = HEXREF_X_STATE;
4980            $self->{kwd} .= chr $self->{nc};
4981                    
4982      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4983        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3910  sub _get_next_token ($) { Line 4994  sub _get_next_token ($) {
4994                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
4995                    
4996          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
4997          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
4998                    
4999      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5000        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3956  sub _get_next_token ($) { Line 5040  sub _get_next_token ($) {
5040        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
5041            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
5042                    
5043          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
5044          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
5045                    
5046          ## Stay in the state.          ## Stay in the state.
5047                    
# Line 3993  sub _get_next_token ($) { Line 5077  sub _get_next_token ($) {
5077          #          #
5078        }        }
5079    
5080        my $code = $self->{s_kwd};        my $code = $self->{kwd};
5081        my $l = $self->{line_prev};        my $l = $self->{line_prev};
5082        my $c = $self->{column_prev};        my $c = $self->{column_prev};
5083        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
5084              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5085              ($self->{is_xml} and $code == 0x0000)) {
5086                    
5087          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5088                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 4036  sub _get_next_token ($) { Line 5122  sub _get_next_token ($) {
5122          # 0..9, A..F, a..f          # 0..9, A..F, a..f
5123                    
5124          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
5125          $self->{s_kwd} = 0;          $self->{kwd} = 0;
5126          ## Reconsume.          ## Reconsume.
5127          redo A;          redo A;
5128        } else {        } else {
# Line 4054  sub _get_next_token ($) { Line 5140  sub _get_next_token ($) {
5140            $self->{s_kwd} = '';            $self->{s_kwd} = '';
5141            ## Reconsume.            ## Reconsume.
5142            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
5143                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
5144                      line => $self->{line_prev},                      line => $self->{line_prev},
5145                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
5146                     });                     });
5147            redo A;            redo A;
5148          } else {          } else {
5149                        
5150            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
5151            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
5152            $self->{s_kwd} = '';            $self->{s_kwd} = '';
5153            ## Reconsume.            ## Reconsume.
# Line 4072  sub _get_next_token ($) { Line 5158  sub _get_next_token ($) {
5158        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
5159          # 0..9          # 0..9
5160                    
5161          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
5162          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
5163          ## Stay in the state.          ## Stay in the state.
5164                    
5165      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4090  sub _get_next_token ($) { Line 5176  sub _get_next_token ($) {
5176        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
5177                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
5178                    
5179          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
5180          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
5181          ## Stay in the state.          ## Stay in the state.
5182                    
5183      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4108  sub _get_next_token ($) { Line 5194  sub _get_next_token ($) {
5194        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
5195                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
5196                    
5197          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
5198          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
5199          ## Stay in the state.          ## Stay in the state.
5200                    
5201      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4146  sub _get_next_token ($) { Line 5232  sub _get_next_token ($) {
5232          #          #
5233        }        }
5234    
5235        my $code = $self->{s_kwd};        my $code = $self->{kwd};
5236        my $l = $self->{line_prev};        my $l = $self->{line_prev};
5237        my $c = $self->{column_prev};        my $c = $self->{column_prev};
5238        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
5239              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5240              ($self->{is_xml} and $code == 0x0000)) {
5241                    
5242          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5243                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 4183  sub _get_next_token ($) { Line 5271  sub _get_next_token ($) {
5271          redo A;          redo A;
5272        }        }
5273      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
5274        if (length $self->{s_kwd} < 30 and        if ((0x0041 <= $self->{nc} and # a
5275            ## NOTE: Some number greater than the maximum length of entity name             $self->{nc} <= 0x005A) or # x
5276            ((0x0041 <= $self->{nc} and # a            (0x0061 <= $self->{nc} and # a
5277              $self->{nc} <= 0x005A) or # x             $self->{nc} <= 0x007A) or # z
5278             (0x0061 <= $self->{nc} and # a            (0x0030 <= $self->{nc} and # 0
5279              $self->{nc} <= 0x007A) or # z             $self->{nc} <= 0x0039) or # 9
5280             (0x0030 <= $self->{nc} and # 0            $self->{nc} == 0x003B or # ;
5281              $self->{nc} <= 0x0039) or # 9            ($self->{is_xml} and
5282             $self->{nc} == 0x003B)) { # ;             not ($is_space->{$self->{nc}} or
5283                    {
5284                      0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5285                      $self->{entity_add} => 1,
5286                    }->{$self->{nc}}))) {
5287          our $EntityChar;          our $EntityChar;
5288          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
5289          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}} or
5290                $self->{ge}->{$self->{kwd}}) {
5291            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
5292                            if (defined $self->{ge}->{$self->{kwd}}) {
5293              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};                if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5294                    
5295                    $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5296                  } else {
5297                    if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5298                      
5299                      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5300                                      value => $self->{kwd});
5301                    } else {
5302                      
5303                    }
5304                    $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5305                  }
5306                } else {
5307                  if ($self->{is_xml}) {
5308                    
5309                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5310                                    value => $self->{kwd},
5311                                    level => {
5312                                              'amp;' => $self->{level}->{warn},
5313                                              'quot;' => $self->{level}->{warn},
5314                                              'lt;' => $self->{level}->{warn},
5315                                              'gt;' => $self->{level}->{warn},
5316                                              'apos;' => $self->{level}->{warn},
5317                                             }->{$self->{kwd}} ||
5318                                             $self->{level}->{must});
5319                  } else {
5320                    
5321                  }
5322                  $self->{entity__value} = $EntityChar->{$self->{kwd}};
5323                }
5324              $self->{entity__match} = 1;              $self->{entity__match} = 1;
5325                            
5326      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4213  sub _get_next_token ($) { Line 5336  sub _get_next_token ($) {
5336              #              #
5337            } else {            } else {
5338                            
5339              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
5340              $self->{entity__match} = -1;              $self->{entity__match} = -1;
5341              ## Stay in the state.              ## Stay in the state.
5342                            
# Line 4261  sub _get_next_token ($) { Line 5384  sub _get_next_token ($) {
5384          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
5385              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
5386                        
5387            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
5388            #            #
5389          } else {          } else {
5390                        
# Line 4273  sub _get_next_token ($) { Line 5396  sub _get_next_token ($) {
5396                    
5397          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5398                          line => $self->{line_prev},                          line => $self->{line_prev},
5399                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
5400          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
5401          #          #
5402        }        }
5403        
# Line 4297  sub _get_next_token ($) { Line 5420  sub _get_next_token ($) {
5420                    data => $data,                    data => $data,
5421                    has_reference => $has_ref,                    has_reference => $has_ref,
5422                    line => $self->{line_prev},                    line => $self->{line_prev},
5423                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
5424                   });                   });
5425          redo A;          redo A;
5426        } else {        } else {
# Line 4313  sub _get_next_token ($) { Line 5436  sub _get_next_token ($) {
5436      ## XML-only states      ## XML-only states
5437    
5438      } elsif ($self->{state} == PI_STATE) {      } elsif ($self->{state} == PI_STATE) {
5439          ## XML5: "Pi state" and "DOCTYPE pi state".
5440    
5441        if ($is_space->{$self->{nc}} or        if ($is_space->{$self->{nc}} or
5442            $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"            $self->{nc} == 0x003F or # ?
5443            $self->{nc} == -1) {            $self->{nc} == -1) {
5444            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5445            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
5446            ## "DOCTYPE pi state": Parse error, switch to the "data
5447            ## state".
5448          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5449                          line => $self->{line_prev},                          line => $self->{line_prev},
5450                          column => $self->{column_prev}                          column => $self->{column_prev}
# Line 4330  sub _get_next_token ($) { Line 5459  sub _get_next_token ($) {
5459                        };                        };
5460          redo A;          redo A;
5461        } else {        } else {
5462            ## XML5: "DOCTYPE pi state": Stay in the state.
5463          $self->{ct} = {type => PI_TOKEN,          $self->{ct} = {type => PI_TOKEN,
5464                         target => chr $self->{nc},                         target => chr $self->{nc},
5465                         data => '',                         data => '',
# Line 4367  sub _get_next_token ($) { Line 5497  sub _get_next_token ($) {
5497          redo A;          redo A;
5498        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
5499          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5500          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
5501          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5502            } else {
5503              $self->{state} = DATA_STATE;
5504              $self->{s_kwd} = '';
5505            }
5506          ## Reconsume.          ## Reconsume.
5507          return  ($self->{ct}); # pi          return  ($self->{ct}); # pi
5508          redo A;          redo A;
# Line 4439  sub _get_next_token ($) { Line 5573  sub _get_next_token ($) {
5573          redo A;          redo A;
5574        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
5575          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5576          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
5577          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5578            } else {
5579              $self->{state} = DATA_STATE;
5580              $self->{s_kwd} = '';
5581            }
5582          ## Reprocess.          ## Reprocess.
5583          return  ($self->{ct}); # pi          return  ($self->{ct}); # pi
5584          redo A;          redo A;
# Line 4464  sub _get_next_token ($) { Line 5602  sub _get_next_token ($) {
5602          redo A;          redo A;
5603        }        }
5604      } elsif ($self->{state} == PI_AFTER_STATE) {      } elsif ($self->{state} == PI_AFTER_STATE) {
5605          ## XML5: Part of "Pi after state".
5606    
5607        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
5608          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
5609          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5610            } else {
5611              $self->{state} = DATA_STATE;
5612              $self->{s_kwd} = '';
5613            }
5614                    
5615      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5616        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4509  sub _get_next_token ($) { Line 5653  sub _get_next_token ($) {
5653          redo A;          redo A;
5654        }        }
5655      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5656        ## XML5: Same as "pi after state" in XML5        ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5657    
5658        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
5659          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
5660          $self->{s_kwd} = '';            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5661            } else {
5662              $self->{state} = DATA_STATE;
5663              $self->{s_kwd} = '';
5664            }
5665                    
5666      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5667        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 4547  sub _get_next_token ($) { Line 5696  sub _get_next_token ($) {
5696          ## Reprocess.          ## Reprocess.
5697          redo A;          redo A;
5698        }        }
5699    
5700        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5701          if ($self->{nc} == 0x003C) { # <
5702            $self->{state} = DOCTYPE_TAG_STATE;
5703            
5704        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5705          $self->{line_prev} = $self->{line};
5706          $self->{column_prev} = $self->{column};
5707          $self->{column}++;
5708          $self->{nc}
5709              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5710        } else {
5711          $self->{set_nc}->($self);
5712        }
5713      
5714            redo A;
5715          } elsif ($self->{nc} == 0x0025) { # %
5716            ## XML5: Not defined yet.
5717    
5718            ## TODO:
5719    
5720            if (not $self->{stop_processing} and
5721                not $self->{document}->xml_standalone) {
5722              $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5723                              level => $self->{level}->{info});
5724              $self->{stop_processing} = 1;
5725            }
5726    
5727            
5728        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5729          $self->{line_prev} = $self->{line};
5730          $self->{column_prev} = $self->{column};
5731          $self->{column}++;
5732          $self->{nc}
5733              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5734        } else {
5735          $self->{set_nc}->($self);
5736        }
5737      
5738            redo A;
5739          } elsif ($self->{nc} == 0x005D) { # ]
5740            delete $self->{in_subset};
5741            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5742            
5743        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5744          $self->{line_prev} = $self->{line};
5745          $self->{column_prev} = $self->{column};
5746          $self->{column}++;
5747          $self->{nc}
5748              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5749        } else {
5750          $self->{set_nc}->($self);
5751        }
5752      
5753            redo A;
5754          } elsif ($is_space->{$self->{nc}}) {
5755            ## Stay in the state.
5756            
5757        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5758          $self->{line_prev} = $self->{line};
5759          $self->{column_prev} = $self->{column};
5760          $self->{column}++;
5761          $self->{nc}
5762              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5763        } else {
5764          $self->{set_nc}->($self);
5765        }
5766      
5767            redo A;
5768          } elsif ($self->{nc} == -1) {
5769            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5770            delete $self->{in_subset};
5771            $self->{state} = DATA_STATE;
5772            $self->{s_kwd} = '';
5773            ## Reconsume.
5774            return  ({type => END_OF_DOCTYPE_TOKEN});
5775            redo A;
5776          } else {
5777            unless ($self->{internal_subset_tainted}) {
5778              ## XML5: No parse error.
5779              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5780              $self->{internal_subset_tainted} = 1;
5781            }
5782            ## Stay in the state.
5783            
5784        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5785          $self->{line_prev} = $self->{line};
5786          $self->{column_prev} = $self->{column};
5787          $self->{column}++;
5788          $self->{nc}
5789              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5790        } else {
5791          $self->{set_nc}->($self);
5792        }
5793      
5794            redo A;
5795          }
5796        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5797          if ($self->{nc} == 0x003E) { # >
5798            $self->{state} = DATA_STATE;
5799            $self->{s_kwd} = '';
5800                    
5801        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5802          $self->{line_prev} = $self->{line};
5803          $self->{column_prev} = $self->{column};
5804          $self->{column}++;
5805          $self->{nc}
5806              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5807        } else {
5808          $self->{set_nc}->($self);
5809        }
5810      
5811            return  ({type => END_OF_DOCTYPE_TOKEN});
5812            redo A;
5813          } elsif ($self->{nc} == -1) {
5814            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5815            $self->{state} = DATA_STATE;
5816            $self->{s_kwd} = '';
5817            ## Reconsume.
5818            return  ({type => END_OF_DOCTYPE_TOKEN});
5819            redo A;
5820          } else {
5821            ## XML5: No parse error and stay in the state.
5822            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5823    
5824            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5825            
5826        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5827          $self->{line_prev} = $self->{line};
5828          $self->{column_prev} = $self->{column};
5829          $self->{column}++;
5830          $self->{nc}
5831              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5832        } else {
5833          $self->{set_nc}->($self);
5834        }
5835      
5836            redo A;
5837          }
5838        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5839          if ($self->{nc} == 0x003E) { # >
5840            $self->{state} = DATA_STATE;
5841            $self->{s_kwd} = '';
5842            
5843        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5844          $self->{line_prev} = $self->{line};
5845          $self->{column_prev} = $self->{column};
5846          $self->{column}++;
5847          $self->{nc}
5848              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5849        } else {
5850          $self->{set_nc}->($self);
5851        }
5852      
5853            return  ({type => END_OF_DOCTYPE_TOKEN});
5854            redo A;
5855          } elsif ($self->{nc} == -1) {
5856            $self->{state} = DATA_STATE;
5857            $self->{s_kwd} = '';
5858            ## Reconsume.
5859            return  ({type => END_OF_DOCTYPE_TOKEN});
5860            redo A;
5861          } else {
5862            ## Stay in the state.
5863            
5864        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5865          $self->{line_prev} = $self->{line};
5866          $self->{column_prev} = $self->{column};
5867          $self->{column}++;
5868          $self->{nc}
5869              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5870        } else {
5871          $self->{set_nc}->($self);
5872        }
5873      
5874            redo A;
5875          }
5876        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5877          if ($self->{nc} == 0x0021) { # !
5878            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5879            
5880        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5881          $self->{line_prev} = $self->{line};
5882          $self->{column_prev} = $self->{column};
5883          $self->{column}++;
5884          $self->{nc}
5885              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5886        } else {
5887          $self->{set_nc}->($self);
5888        }
5889      
5890            redo A;
5891          } elsif ($self->{nc} == 0x003F) { # ?
5892            $self->{state} = PI_STATE;
5893            
5894        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5895          $self->{line_prev} = $self->{line};
5896          $self->{column_prev} = $self->{column};
5897          $self->{column}++;
5898          $self->{nc}
5899              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5900        } else {
5901          $self->{set_nc}->($self);
5902        }
5903      
5904            redo A;
5905          } elsif ($self->{nc} == -1) {
5906            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5907            $self->{state} = DATA_STATE;
5908            $self->{s_kwd} = '';
5909            ## Reconsume.
5910            redo A;
5911          } else {
5912            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5913                            line => $self->{line_prev},
5914                            column => $self->{column_prev});
5915            $self->{state} = BOGUS_COMMENT_STATE;
5916            $self->{ct} = {type => COMMENT_TOKEN,
5917                           data => '',
5918                          }; ## NOTE: Will be discarded.
5919            
5920        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5921          $self->{line_prev} = $self->{line};
5922          $self->{column_prev} = $self->{column};
5923          $self->{column}++;
5924          $self->{nc}
5925              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5926        } else {
5927          $self->{set_nc}->($self);
5928        }
5929      
5930            redo A;
5931          }
5932        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5933          ## XML5: "DOCTYPE markup declaration state".
5934          
5935          if ($self->{nc} == 0x002D) { # -
5936            $self->{state} = MD_HYPHEN_STATE;
5937            
5938        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5939          $self->{line_prev} = $self->{line};
5940          $self->{column_prev} = $self->{column};
5941          $self->{column}++;
5942          $self->{nc}
5943              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5944        } else {
5945          $self->{set_nc}->($self);
5946        }
5947      
5948            redo A;
5949          } elsif ($self->{nc} == 0x0045 or # E
5950                   $self->{nc} == 0x0065) { # e
5951            $self->{state} = MD_E_STATE;
5952            $self->{kwd} = chr $self->{nc};
5953            
5954        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5955          $self->{line_prev} = $self->{line};
5956          $self->{column_prev} = $self->{column};
5957          $self->{column}++;
5958          $self->{nc}
5959              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5960        } else {
5961          $self->{set_nc}->($self);
5962        }
5963      
5964            redo A;
5965          } elsif ($self->{nc} == 0x0041 or # A
5966                   $self->{nc} == 0x0061) { # a
5967            $self->{state} = MD_ATTLIST_STATE;
5968            $self->{kwd} = chr $self->{nc};
5969            
5970        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5971          $self->{line_prev} = $self->{line};
5972          $self->{column_prev} = $self->{column};
5973          $self->{column}++;
5974          $self->{nc}
5975              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5976        } else {
5977          $self->{set_nc}->($self);
5978        }
5979      
5980            redo A;
5981          } elsif ($self->{nc} == 0x004E or # N
5982                   $self->{nc} == 0x006E) { # n
5983            $self->{state} = MD_NOTATION_STATE;
5984            $self->{kwd} = chr $self->{nc};
5985            
5986        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5987          $self->{line_prev} = $self->{line};
5988          $self->{column_prev} = $self->{column};
5989          $self->{column}++;
5990          $self->{nc}
5991              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5992        } else {
5993          $self->{set_nc}->($self);
5994        }
5995      
5996            redo A;
5997          } else {
5998            #
5999          }
6000          
6001          ## XML5: No parse error.
6002          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6003                          line => $self->{line_prev},
6004                          column => $self->{column_prev} - 1);
6005          ## Reconsume.
6006          $self->{state} = BOGUS_COMMENT_STATE;
6007          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
6008          redo A;
6009        } elsif ($self->{state} == MD_E_STATE) {
6010          if ($self->{nc} == 0x004E or # N
6011              $self->{nc} == 0x006E) { # n
6012            $self->{state} = MD_ENTITY_STATE;
6013            $self->{kwd} .= chr $self->{nc};
6014            
6015        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6016          $self->{line_prev} = $self->{line};
6017          $self->{column_prev} = $self->{column};
6018          $self->{column}++;
6019          $self->{nc}
6020              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6021        } else {
6022          $self->{set_nc}->($self);
6023        }
6024      
6025            redo A;
6026          } elsif ($self->{nc} == 0x004C or # L
6027                   $self->{nc} == 0x006C) { # l
6028            ## XML5: <!ELEMENT> not supported.
6029            $self->{state} = MD_ELEMENT_STATE;
6030            $self->{kwd} .= chr $self->{nc};
6031            
6032        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6033          $self->{line_prev} = $self->{line};
6034          $self->{column_prev} = $self->{column};
6035          $self->{column}++;
6036          $self->{nc}
6037              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6038        } else {
6039          $self->{set_nc}->($self);
6040        }
6041      
6042            redo A;
6043          } else {
6044            ## XML5: No parse error.
6045            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6046                            line => $self->{line_prev},
6047                            column => $self->{column_prev} - 2
6048                                + 1 * ($self->{nc} == -1));
6049            ## Reconsume.
6050            $self->{state} = BOGUS_COMMENT_STATE;
6051            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6052            redo A;
6053          }
6054        } elsif ($self->{state} == MD_ENTITY_STATE) {
6055          if ($self->{nc} == [
6056                undef,
6057                undef,
6058                0x0054, # T
6059                0x0049, # I
6060                0x0054, # T
6061              ]->[length $self->{kwd}] or
6062              $self->{nc} == [
6063                undef,
6064                undef,
6065                0x0074, # t
6066                0x0069, # i
6067                0x0074, # t
6068              ]->[length $self->{kwd}]) {
6069            ## Stay in the state.
6070            $self->{kwd} .= chr $self->{nc};
6071            
6072        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6073          $self->{line_prev} = $self->{line};
6074          $self->{column_prev} = $self->{column};
6075          $self->{column}++;
6076          $self->{nc}
6077              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6078        } else {
6079          $self->{set_nc}->($self);
6080        }
6081      
6082            redo A;
6083          } elsif ((length $self->{kwd}) == 5 and
6084                   ($self->{nc} == 0x0059 or # Y
6085                    $self->{nc} == 0x0079)) { # y
6086            if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
6087              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6088                              text => 'ENTITY',
6089                              line => $self->{line_prev},
6090                              column => $self->{column_prev} - 4);
6091            }
6092            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
6093                           line => $self->{line_prev},
6094                           column => $self->{column_prev} - 6};
6095            $self->{state} = DOCTYPE_MD_STATE;
6096            
6097        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6098          $self->{line_prev} = $self->{line};
6099          $self->{column_prev} = $self->{column};
6100          $self->{column}++;
6101          $self->{nc}
6102              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6103        } else {
6104          $self->{set_nc}->($self);
6105        }
6106      
6107            redo A;
6108          } else {
6109            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6110                            line => $self->{line_prev},
6111                            column => $self->{column_prev} - 1
6112                                - (length $self->{kwd})
6113                                + 1 * ($self->{nc} == -1));
6114            $self->{state} = BOGUS_COMMENT_STATE;
6115            ## Reconsume.
6116            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6117            redo A;
6118          }
6119        } elsif ($self->{state} == MD_ELEMENT_STATE) {
6120          if ($self->{nc} == [
6121               undef,
6122               undef,
6123               0x0045, # E
6124               0x004D, # M
6125               0x0045, # E
6126               0x004E, # N
6127              ]->[length $self->{kwd}] or
6128              $self->{nc} == [
6129               undef,
6130               undef,
6131               0x0065, # e
6132               0x006D, # m
6133               0x0065, # e
6134               0x006E, # n
6135              ]->[length $self->{kwd}]) {
6136            ## Stay in the state.
6137            $self->{kwd} .= chr $self->{nc};
6138            
6139        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6140          $self->{line_prev} = $self->{line};
6141          $self->{column_prev} = $self->{column};
6142          $self->{column}++;
6143          $self->{nc}
6144              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6145        } else {
6146          $self->{set_nc}->($self);
6147        }
6148      
6149            redo A;
6150          } elsif ((length $self->{kwd}) == 6 and
6151                   ($self->{nc} == 0x0054 or # T
6152                    $self->{nc} == 0x0074)) { # t
6153            if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
6154              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6155                              text => 'ELEMENT',
6156                              line => $self->{line_prev},
6157                              column => $self->{column_prev} - 5);
6158            }
6159            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
6160                           line => $self->{line_prev},
6161                           column => $self->{column_prev} - 7};
6162            $self->{state} = DOCTYPE_MD_STATE;
6163            
6164        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6165          $self->{line_prev} = $self->{line};
6166          $self->{column_prev} = $self->{column};
6167          $self->{column}++;
6168          $self->{nc}
6169              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6170        } else {
6171          $self->{set_nc}->($self);
6172        }
6173      
6174            redo A;
6175          } else {
6176            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6177                            line => $self->{line_prev},
6178                            column => $self->{column_prev} - 1
6179                                - (length $self->{kwd})
6180                                + 1 * ($self->{nc} == -1));
6181            $self->{state} = BOGUS_COMMENT_STATE;
6182            ## Reconsume.
6183            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6184            redo A;
6185          }
6186        } elsif ($self->{state} == MD_ATTLIST_STATE) {
6187          if ($self->{nc} == [
6188               undef,
6189               0x0054, # T
6190               0x0054, # T
6191               0x004C, # L
6192               0x0049, # I
6193               0x0053, # S
6194              ]->[length $self->{kwd}] or
6195              $self->{nc} == [
6196               undef,
6197               0x0074, # t
6198               0x0074, # t
6199               0x006C, # l
6200               0x0069, # i
6201               0x0073, # s
6202              ]->[length $self->{kwd}]) {
6203            ## Stay in the state.
6204            $self->{kwd} .= chr $self->{nc};
6205            
6206        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6207          $self->{line_prev} = $self->{line};
6208          $self->{column_prev} = $self->{column};
6209          $self->{column}++;
6210          $self->{nc}
6211              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6212        } else {
6213          $self->{set_nc}->($self);
6214        }
6215      
6216            redo A;
6217          } elsif ((length $self->{kwd}) == 6 and
6218                   ($self->{nc} == 0x0054 or # T
6219                    $self->{nc} == 0x0074)) { # t
6220            if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
6221              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6222                              text => 'ATTLIST',
6223                              line => $self->{line_prev},
6224                              column => $self->{column_prev} - 5);
6225            }
6226            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
6227                           attrdefs => [],
6228                           line => $self->{line_prev},
6229                           column => $self->{column_prev} - 7};
6230            $self->{state} = DOCTYPE_MD_STATE;
6231            
6232        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6233          $self->{line_prev} = $self->{line};
6234          $self->{column_prev} = $self->{column};
6235          $self->{column}++;
6236          $self->{nc}
6237              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6238        } else {
6239          $self->{set_nc}->($self);
6240        }
6241      
6242            redo A;
6243          } else {
6244            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6245                            line => $self->{line_prev},
6246                            column => $self->{column_prev} - 1
6247                                 - (length $self->{kwd})
6248                                 + 1 * ($self->{nc} == -1));
6249            $self->{state} = BOGUS_COMMENT_STATE;
6250            ## Reconsume.
6251            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6252            redo A;
6253          }
6254        } elsif ($self->{state} == MD_NOTATION_STATE) {
6255          if ($self->{nc} == [
6256               undef,
6257               0x004F, # O
6258               0x0054, # T
6259               0x0041, # A
6260               0x0054, # T
6261               0x0049, # I
6262               0x004F, # O
6263              ]->[length $self->{kwd}] or
6264              $self->{nc} == [
6265               undef,
6266               0x006F, # o
6267               0x0074, # t
6268               0x0061, # a
6269               0x0074, # t
6270               0x0069, # i
6271               0x006F, # o
6272              ]->[length $self->{kwd}]) {
6273            ## Stay in the state.
6274            $self->{kwd} .= chr $self->{nc};
6275            
6276        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6277          $self->{line_prev} = $self->{line};
6278          $self->{column_prev} = $self->{column};
6279          $self->{column}++;
6280          $self->{nc}
6281              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6282        } else {
6283          $self->{set_nc}->($self);
6284        }
6285      
6286            redo A;
6287          } elsif ((length $self->{kwd}) == 7 and
6288                   ($self->{nc} == 0x004E or # N
6289                    $self->{nc} == 0x006E)) { # n
6290            if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
6291              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6292                              text => 'NOTATION',
6293                              line => $self->{line_prev},
6294                              column => $self->{column_prev} - 6);
6295            }
6296            $self->{ct} = {type => NOTATION_TOKEN, name => '',
6297                           line => $self->{line_prev},
6298                           column => $self->{column_prev} - 8};
6299            $self->{state} = DOCTYPE_MD_STATE;
6300            
6301        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6302          $self->{line_prev} = $self->{line};
6303          $self->{column_prev} = $self->{column};
6304          $self->{column}++;
6305          $self->{nc}
6306              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6307        } else {
6308          $self->{set_nc}->($self);
6309        }
6310      
6311            redo A;
6312          } else {
6313            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6314                            line => $self->{line_prev},
6315                            column => $self->{column_prev} - 1
6316                                - (length $self->{kwd})
6317                                + 1 * ($self->{nc} == -1));
6318            $self->{state} = BOGUS_COMMENT_STATE;
6319            ## Reconsume.
6320            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6321            redo A;
6322          }
6323        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
6324          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6325          ## "DOCTYPE NOTATION state".
6326    
6327          if ($is_space->{$self->{nc}}) {
6328            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6329            $self->{state} = BEFORE_MD_NAME_STATE;
6330            
6331        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6332          $self->{line_prev} = $self->{line};
6333          $self->{column_prev} = $self->{column};
6334          $self->{column}++;
6335          $self->{nc}
6336              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6337        } else {
6338          $self->{set_nc}->($self);
6339        }
6340      
6341            redo A;
6342          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6343                   $self->{nc} == 0x0025) { # %
6344            ## XML5: Switch to the "DOCTYPE bogus comment state".
6345            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6346            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6347            
6348        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6349          $self->{line_prev} = $self->{line};
6350          $self->{column_prev} = $self->{column};
6351          $self->{column}++;
6352          $self->{nc}
6353              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6354        } else {
6355          $self->{set_nc}->($self);
6356        }
6357      
6358            redo A;
6359          } elsif ($self->{nc} == -1) {
6360            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6361            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6362            ## Reconsume.
6363            redo A;
6364          } elsif ($self->{nc} == 0x003E) { # >
6365            ## XML5: Switch to the "DOCTYPE bogus comment state".
6366            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6367            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6368            
6369        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6370          $self->{line_prev} = $self->{line};
6371          $self->{column_prev} = $self->{column};
6372          $self->{column}++;
6373          $self->{nc}
6374              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6375        } else {
6376          $self->{set_nc}->($self);
6377        }
6378      
6379            redo A;
6380          } else {
6381            ## XML5: Switch to the "DOCTYPE bogus comment state".
6382            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6383            $self->{state} = BEFORE_MD_NAME_STATE;
6384            redo A;
6385          }
6386        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6387          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6388          ## before state", "DOCTYPE ATTLIST name before state".
6389    
6390          if ($is_space->{$self->{nc}}) {
6391            ## Stay in the state.
6392            
6393        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6394          $self->{line_prev} = $self->{line};
6395          $self->{column_prev} = $self->{column};
6396          $self->{column}++;
6397          $self->{nc}
6398              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6399        } else {
6400          $self->{set_nc}->($self);
6401        }
6402      
6403            redo A;
6404          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6405                   $self->{nc} == 0x0025) { # %
6406            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6407            
6408        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6409          $self->{line_prev} = $self->{line};
6410          $self->{column_prev} = $self->{column};
6411          $self->{column}++;
6412          $self->{nc}
6413              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6414        } else {
6415          $self->{set_nc}->($self);
6416        }
6417      
6418            redo A;
6419          } elsif ($self->{nc} == 0x003E) { # >
6420            ## XML5: Same as "Anything else".
6421            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6422            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6423            
6424        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6425          $self->{line_prev} = $self->{line};
6426          $self->{column_prev} = $self->{column};
6427          $self->{column}++;
6428          $self->{nc}
6429              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6430        } else {
6431          $self->{set_nc}->($self);
6432        }
6433      
6434            redo A;
6435          } elsif ($self->{nc} == -1) {
6436            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6437            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6438            ## Reconsume.
6439            redo A;
6440          } else {
6441            ## XML5: [ATTLIST] Not defined yet.
6442            $self->{ct}->{name} .= chr $self->{nc};
6443            $self->{state} = MD_NAME_STATE;
6444            
6445        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6446          $self->{line_prev} = $self->{line};
6447          $self->{column_prev} = $self->{column};
6448          $self->{column}++;
6449          $self->{nc}
6450              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6451        } else {
6452          $self->{set_nc}->($self);
6453        }
6454      
6455            redo A;
6456          }
6457        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6458          if ($is_space->{$self->{nc}}) {
6459            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6460            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6461            $self->{state} = BEFORE_MD_NAME_STATE;
6462            
6463        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6464          $self->{line_prev} = $self->{line};
6465          $self->{column_prev} = $self->{column};
6466          $self->{column}++;
6467          $self->{nc}
6468              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6469        } else {
6470          $self->{set_nc}->($self);
6471        }
6472      
6473            redo A;
6474          } elsif ($self->{nc} == 0x003E) { # >
6475            ## XML5: Same as "Anything else".
6476            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6477            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6478            
6479        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6480          $self->{line_prev} = $self->{line};
6481          $self->{column_prev} = $self->{column};
6482          $self->{column}++;
6483          $self->{nc}
6484              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6485        } else {
6486          $self->{set_nc}->($self);
6487        }
6488      
6489            redo A;
6490          } elsif ($self->{nc} == -1) {
6491            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6492            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6493            ## Reconsume.
6494            redo A;
6495          } else {
6496            ## XML5: No parse error.
6497            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6498            $self->{state} = BOGUS_COMMENT_STATE;
6499            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6500            ## Reconsume.
6501            redo A;
6502          }
6503        } elsif ($self->{state} == MD_NAME_STATE) {
6504          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6505          
6506          if ($is_space->{$self->{nc}}) {
6507            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6508              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6509            } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6510              $self->{state} = AFTER_ELEMENT_NAME_STATE;
6511            } else { # ENTITY/NOTATION
6512              $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6513            }
6514            
6515        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6516          $self->{line_prev} = $self->{line};
6517          $self->{column_prev} = $self->{column};
6518          $self->{column}++;
6519          $self->{nc}
6520              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6521        } else {
6522          $self->{set_nc}->($self);
6523        }
6524      
6525            redo A;
6526          } elsif ($self->{nc} == 0x003E) { # >
6527            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6528              #
6529            } else {
6530              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6531            }
6532            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6533            
6534        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6535          $self->{line_prev} = $self->{line};
6536          $self->{column_prev} = $self->{column};
6537          $self->{column}++;
6538          $self->{nc}
6539              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6540        } else {
6541          $self->{set_nc}->($self);
6542        }
6543      
6544            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6545            redo A;
6546          } elsif ($self->{nc} == -1) {
6547            ## XML5: [ATTLIST] No parse error.
6548            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6549            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6550            ## Reconsume.
6551            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6552            redo A;
6553          } else {
6554            ## XML5: [ATTLIST] Not defined yet.
6555            $self->{ct}->{name} .= chr $self->{nc};
6556            ## Stay in the state.
6557            
6558        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6559          $self->{line_prev} = $self->{line};
6560          $self->{column_prev} = $self->{column};
6561          $self->{column}++;
6562          $self->{nc}
6563              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6564        } else {
6565          $self->{set_nc}->($self);
6566        }
6567      
6568            redo A;
6569          }
6570        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6571          if ($is_space->{$self->{nc}}) {
6572            ## Stay in the state.
6573            
6574        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6575          $self->{line_prev} = $self->{line};
6576          $self->{column_prev} = $self->{column};
6577          $self->{column}++;
6578          $self->{nc}
6579              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6580        } else {
6581          $self->{set_nc}->($self);
6582        }
6583      
6584            redo A;
6585          } elsif ($self->{nc} == 0x003E) { # >
6586            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6587            
6588        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6589          $self->{line_prev} = $self->{line};
6590          $self->{column_prev} = $self->{column};
6591          $self->{column}++;
6592          $self->{nc}
6593              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6594        } else {
6595          $self->{set_nc}->($self);
6596        }
6597      
6598            return  ($self->{ct}); # ATTLIST
6599            redo A;
6600          } elsif ($self->{nc} == -1) {
6601            ## XML5: No parse error.
6602            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6603            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6604            return  ($self->{ct});
6605            redo A;
6606          } else {
6607            ## XML5: Not defined yet.
6608            $self->{ca} = {name => chr ($self->{nc}), # attrdef
6609                           tokens => [],
6610                           line => $self->{line}, column => $self->{column}};
6611            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6612            
6613        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6614          $self->{line_prev} = $self->{line};
6615          $self->{column_prev} = $self->{column};
6616          $self->{column}++;
6617          $self->{nc}
6618              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6619        } else {
6620          $self->{set_nc}->($self);
6621        }
6622      
6623            redo A;
6624          }
6625        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6626          if ($is_space->{$self->{nc}}) {
6627            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6628            
6629        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6630          $self->{line_prev} = $self->{line};
6631          $self->{column_prev} = $self->{column};
6632          $self->{column}++;
6633          $self->{nc}
6634              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6635        } else {
6636          $self->{set_nc}->($self);
6637        }
6638      
6639            redo A;
6640          } elsif ($self->{nc} == 0x003E) { # >
6641            ## XML5: Same as "anything else".
6642            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6643            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6644            
6645        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6646          $self->{line_prev} = $self->{line};
6647          $self->{column_prev} = $self->{column};
6648          $self->{column}++;
6649          $self->{nc}
6650              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6651        } else {
6652          $self->{set_nc}->($self);
6653        }
6654      
6655            return  ($self->{ct}); # ATTLIST
6656            redo A;
6657          } elsif ($self->{nc} == 0x0028) { # (
6658            ## XML5: Same as "anything else".
6659            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6660            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6661            
6662        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6663          $self->{line_prev} = $self->{line};
6664          $self->{column_prev} = $self->{column};
6665          $self->{column}++;
6666          $self->{nc}
6667              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6668        } else {
6669          $self->{set_nc}->($self);
6670        }
6671      
6672            redo A;
6673          } elsif ($self->{nc} == -1) {
6674            ## XML5: No parse error.
6675            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6676            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6677            
6678        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6679          $self->{line_prev} = $self->{line};
6680          $self->{column_prev} = $self->{column};
6681          $self->{column}++;
6682          $self->{nc}
6683              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6684        } else {
6685          $self->{set_nc}->($self);
6686        }
6687      
6688            return  ($self->{ct}); # ATTLIST
6689            redo A;
6690          } else {
6691            ## XML5: Not defined yet.
6692            $self->{ca}->{name} .= chr $self->{nc};
6693            ## Stay in the state.
6694            
6695        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6696          $self->{line_prev} = $self->{line};
6697          $self->{column_prev} = $self->{column};
6698          $self->{column}++;
6699          $self->{nc}
6700              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6701        } else {
6702          $self->{set_nc}->($self);
6703        }
6704      
6705            redo A;
6706          }
6707        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6708          if ($is_space->{$self->{nc}}) {
6709            ## Stay in the state.
6710            
6711        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6712          $self->{line_prev} = $self->{line};
6713          $self->{column_prev} = $self->{column};
6714          $self->{column}++;
6715          $self->{nc}
6716              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6717        } else {
6718          $self->{set_nc}->($self);
6719        }
6720      
6721            redo A;
6722          } elsif ($self->{nc} == 0x003E) { # >
6723            ## XML5: Same as "anything else".
6724            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6725            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6726            
6727        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6728          $self->{line_prev} = $self->{line};
6729          $self->{column_prev} = $self->{column};
6730          $self->{column}++;
6731          $self->{nc}
6732              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6733        } else {
6734          $self->{set_nc}->($self);
6735        }
6736      
6737            return  ($self->{ct}); # ATTLIST
6738            redo A;
6739          } elsif ($self->{nc} == 0x0028) { # (
6740            ## XML5: Same as "anything else".
6741            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6742            
6743        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6744          $self->{line_prev} = $self->{line};
6745          $self->{column_prev} = $self->{column};
6746          $self->{column}++;
6747          $self->{nc}
6748              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6749        } else {
6750          $self->{set_nc}->($self);
6751        }
6752      
6753            redo A;
6754          } elsif ($self->{nc} == -1) {
6755            ## XML5: No parse error.
6756            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6757            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6758            
6759        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6760          $self->{line_prev} = $self->{line};
6761          $self->{column_prev} = $self->{column};
6762          $self->{column}++;
6763          $self->{nc}
6764              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6765        } else {
6766          $self->{set_nc}->($self);
6767        }
6768      
6769            return  ($self->{ct});
6770            redo A;
6771          } else {
6772            ## XML5: Not defined yet.
6773            $self->{ca}->{type} = chr $self->{nc};
6774            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6775            
6776        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6777          $self->{line_prev} = $self->{line};
6778          $self->{column_prev} = $self->{column};
6779          $self->{column}++;
6780          $self->{nc}
6781              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6782        } else {
6783          $self->{set_nc}->($self);
6784        }
6785      
6786            redo A;
6787          }
6788        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6789          if ($is_space->{$self->{nc}}) {
6790            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6791            
6792        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6793          $self->{line_prev} = $self->{line};
6794          $self->{column_prev} = $self->{column};
6795          $self->{column}++;
6796          $self->{nc}
6797              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6798        } else {
6799          $self->{set_nc}->($self);
6800        }
6801      
6802            redo A;
6803          } elsif ($self->{nc} == 0x0023) { # #
6804            ## XML5: Same as "anything else".
6805            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6806            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6807            
6808        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6809          $self->{line_prev} = $self->{line};
6810          $self->{column_prev} = $self->{column};
6811          $self->{column}++;
6812          $self->{nc}
6813              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6814        } else {
6815          $self->{set_nc}->($self);
6816        }
6817      
6818            redo A;
6819          } elsif ($self->{nc} == 0x0022) { # "
6820            ## XML5: Same as "anything else".
6821            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6822            $self->{ca}->{value} = '';
6823            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6824            
6825        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6826          $self->{line_prev} = $self->{line};
6827          $self->{column_prev} = $self->{column};
6828          $self->{column}++;
6829          $self->{nc}
6830              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6831        } else {
6832          $self->{set_nc}->($self);
6833        }
6834      
6835            redo A;
6836          } elsif ($self->{nc} == 0x0027) { # '
6837            ## XML5: Same as "anything else".
6838            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6839            $self->{ca}->{value} = '';
6840            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6841            
6842        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6843          $self->{line_prev} = $self->{line};
6844          $self->{column_prev} = $self->{column};
6845          $self->{column}++;
6846          $self->{nc}
6847              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6848        } else {
6849          $self->{set_nc}->($self);
6850        }
6851      
6852            redo A;
6853          } elsif ($self->{nc} == 0x003E) { # >
6854            ## XML5: Same as "anything else".
6855            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6856            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6857            
6858        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6859          $self->{line_prev} = $self->{line};
6860          $self->{column_prev} = $self->{column};
6861          $self->{column}++;
6862          $self->{nc}
6863              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6864        } else {
6865          $self->{set_nc}->($self);
6866        }
6867      
6868            return  ($self->{ct}); # ATTLIST
6869            redo A;
6870          } elsif ($self->{nc} == 0x0028) { # (
6871            ## XML5: Same as "anything else".
6872            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6873            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6874            
6875        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6876          $self->{line_prev} = $self->{line};
6877          $self->{column_prev} = $self->{column};
6878          $self->{column}++;
6879          $self->{nc}
6880              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6881        } else {
6882          $self->{set_nc}->($self);
6883        }
6884      
6885            redo A;
6886          } elsif ($self->{nc} == -1) {
6887            ## XML5: No parse error.
6888            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6889            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6890            
6891        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6892          $self->{line_prev} = $self->{line};
6893          $self->{column_prev} = $self->{column};
6894          $self->{column}++;
6895          $self->{nc}
6896              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6897        } else {
6898          $self->{set_nc}->($self);
6899        }
6900      
6901            return  ($self->{ct});
6902            redo A;
6903          } else {
6904            ## XML5: Not defined yet.
6905            $self->{ca}->{type} .= chr $self->{nc};
6906            ## Stay in the state.
6907            
6908        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6909          $self->{line_prev} = $self->{line};
6910          $self->{column_prev} = $self->{column};
6911          $self->{column}++;
6912          $self->{nc}
6913              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6914        } else {
6915          $self->{set_nc}->($self);
6916        }
6917      
6918            redo A;
6919          }
6920        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6921          if ($is_space->{$self->{nc}}) {
6922            ## Stay in the state.
6923            
6924        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6925          $self->{line_prev} = $self->{line};
6926          $self->{column_prev} = $self->{column};
6927          $self->{column}++;
6928          $self->{nc}
6929              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6930        } else {
6931          $self->{set_nc}->($self);
6932        }
6933      
6934            redo A;
6935          } elsif ($self->{nc} == 0x0028) { # (
6936            ## XML5: Same as "anything else".
6937            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6938            
6939        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6940          $self->{line_prev} = $self->{line};
6941          $self->{column_prev} = $self->{column};
6942          $self->{column}++;
6943          $self->{nc}
6944              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6945        } else {
6946          $self->{set_nc}->($self);
6947        }
6948      
6949            redo A;
6950          } elsif ($self->{nc} == 0x0023) { # #
6951            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6952            
6953        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6954          $self->{line_prev} = $self->{line};
6955          $self->{column_prev} = $self->{column};
6956          $self->{column}++;
6957          $self->{nc}
6958              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6959        } else {
6960          $self->{set_nc}->($self);
6961        }
6962      
6963            redo A;
6964          } elsif ($self->{nc} == 0x0022) { # "
6965            ## XML5: Same as "anything else".
6966            $self->{ca}->{value} = '';
6967            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6968            
6969        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6970          $self->{line_prev} = $self->{line};
6971          $self->{column_prev} = $self->{column};
6972          $self->{column}++;
6973          $self->{nc}
6974              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6975        } else {
6976          $self->{set_nc}->($self);
6977        }
6978      
6979            redo A;
6980          } elsif ($self->{nc} == 0x0027) { # '
6981            ## XML5: Same as "anything else".
6982            $self->{ca}->{value} = '';
6983            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6984            
6985        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6986          $self->{line_prev} = $self->{line};
6987          $self->{column_prev} = $self->{column};
6988          $self->{column}++;
6989          $self->{nc}
6990              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6991        } else {
6992          $self->{set_nc}->($self);
6993        }
6994      
6995            redo A;
6996          } elsif ($self->{nc} == 0x003E) { # >
6997            ## XML5: Same as "anything else".
6998            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6999            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7000            
7001        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7002          $self->{line_prev} = $self->{line};
7003          $self->{column_prev} = $self->{column};
7004          $self->{column}++;
7005          $self->{nc}
7006              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7007        } else {
7008          $self->{set_nc}->($self);
7009        }
7010      
7011            return  ($self->{ct}); # ATTLIST
7012            redo A;
7013          } elsif ($self->{nc} == -1) {
7014            ## XML5: No parse error.
7015            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7016            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7017            
7018        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7019          $self->{line_prev} = $self->{line};
7020          $self->{column_prev} = $self->{column};
7021          $self->{column}++;
7022          $self->{nc}
7023              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7024        } else {
7025          $self->{set_nc}->($self);
7026        }
7027      
7028            return  ($self->{ct});
7029            redo A;
7030          } else {
7031            ## XML5: Switch to the "DOCTYPE bogus comment state".
7032            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7033            $self->{ca}->{value} = '';
7034            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7035            ## Reconsume.
7036            redo A;
7037          }
7038        } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
7039          if ($is_space->{$self->{nc}}) {
7040            ## Stay in the state.
7041            
7042        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7043          $self->{line_prev} = $self->{line};
7044          $self->{column_prev} = $self->{column};
7045          $self->{column}++;
7046          $self->{nc}
7047              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7048        } else {
7049          $self->{set_nc}->($self);
7050        }
7051      
7052            redo A;
7053          } elsif ($self->{nc} == 0x007C) { # |
7054            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7055            ## Stay in the state.
7056            
7057        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7058          $self->{line_prev} = $self->{line};
7059          $self->{column_prev} = $self->{column};
7060          $self->{column}++;
7061          $self->{nc}
7062              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7063        } else {
7064          $self->{set_nc}->($self);
7065        }
7066      
7067            redo A;
7068          } elsif ($self->{nc} == 0x0029) { # )
7069            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
7070            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7071            
7072        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7073          $self->{line_prev} = $self->{line};
7074          $self->{column_prev} = $self->{column};
7075          $self->{column}++;
7076          $self->{nc}
7077              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7078        } else {
7079          $self->{set_nc}->($self);
7080        }
7081      
7082            redo A;
7083          } elsif ($self->{nc} == 0x003E) { # >
7084            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7085            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7086            
7087        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7088          $self->{line_prev} = $self->{line};
7089          $self->{column_prev} = $self->{column};
7090          $self->{column}++;
7091          $self->{nc}
7092              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7093        } else {
7094          $self->{set_nc}->($self);
7095        }
7096      
7097            return  ($self->{ct}); # ATTLIST
7098            redo A;
7099          } elsif ($self->{nc} == -1) {
7100            ## XML5: No parse error.
7101            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7102            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7103            
7104        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7105          $self->{line_prev} = $self->{line};
7106          $self->{column_prev} = $self->{column};
7107          $self->{column}++;
7108          $self->{nc}
7109              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7110        } else {
7111          $self->{set_nc}->($self);
7112        }
7113      
7114            return  ($self->{ct});
7115            redo A;
7116          } else {
7117            push @{$self->{ca}->{tokens}}, chr $self->{nc};
7118            $self->{state} = ALLOWED_TOKEN_STATE;
7119            
7120        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7121          $self->{line_prev} = $self->{line};
7122          $self->{column_prev} = $self->{column};
7123          $self->{column}++;
7124          $self->{nc}
7125              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7126        } else {
7127          $self->{set_nc}->($self);
7128        }
7129      
7130            redo A;
7131          }
7132        } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
7133          if ($is_space->{$self->{nc}}) {
7134            $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
7135            
7136        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7137          $self->{line_prev} = $self->{line};
7138          $self->{column_prev} = $self->{column};
7139          $self->{column}++;
7140          $self->{nc}
7141              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7142        } else {
7143          $self->{set_nc}->($self);
7144        }
7145      
7146            redo A;
7147          } elsif ($self->{nc} == 0x007C) { # |
7148            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7149            
7150        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7151          $self->{line_prev} = $self->{line};
7152          $self->{column_prev} = $self->{column};
7153          $self->{column}++;
7154          $self->{nc}
7155              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7156        } else {
7157          $self->{set_nc}->($self);
7158        }
7159      
7160            redo A;
7161          } elsif ($self->{nc} == 0x0029) { # )
7162            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7163            
7164        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7165          $self->{line_prev} = $self->{line};
7166          $self->{column_prev} = $self->{column};
7167          $self->{column}++;
7168          $self->{nc}
7169              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7170        } else {
7171          $self->{set_nc}->($self);
7172        }
7173      
7174            redo A;
7175          } elsif ($self->{nc} == 0x003E) { # >
7176            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7177            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7178            
7179        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7180          $self->{line_prev} = $self->{line};
7181          $self->{column_prev} = $self->{column};
7182          $self->{column}++;
7183          $self->{nc}
7184              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7185        } else {
7186          $self->{set_nc}->($self);
7187        }
7188      
7189            return  ($self->{ct}); # ATTLIST
7190            redo A;
7191          } elsif ($self->{nc} == -1) {
7192            ## XML5: No parse error.
7193            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7194            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7195            
7196        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7197          $self->{line_prev} = $self->{line};
7198          $self->{column_prev} = $self->{column};
7199          $self->{column}++;
7200          $self->{nc}
7201              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7202        } else {
7203          $self->{set_nc}->($self);
7204        }
7205      
7206            return  ($self->{ct});
7207            redo A;
7208          } else {
7209            $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7210            ## Stay in the state.
7211            
7212        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7213          $self->{line_prev} = $self->{line};
7214          $self->{column_prev} = $self->{column};
7215          $self->{column}++;
7216          $self->{nc}
7217              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7218        } else {
7219          $self->{set_nc}->($self);
7220        }
7221      
7222            redo A;
7223          }
7224        } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7225          if ($is_space->{$self->{nc}}) {
7226            ## Stay in the state.
7227            
7228        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7229          $self->{line_prev} = $self->{line};
7230          $self->{column_prev} = $self->{column};
7231          $self->{column}++;
7232          $self->{nc}
7233              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7234        } else {
7235          $self->{set_nc}->($self);
7236        }
7237      
7238            redo A;
7239          } elsif ($self->{nc} == 0x007C) { # |
7240            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7241            
7242        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7243          $self->{line_prev} = $self->{line};
7244          $self->{column_prev} = $self->{column};
7245          $self->{column}++;
7246          $self->{nc}
7247              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7248        } else {
7249          $self->{set_nc}->($self);
7250        }
7251      
7252            redo A;
7253          } elsif ($self->{nc} == 0x0029) { # )
7254            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7255            
7256        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7257          $self->{line_prev} = $self->{line};
7258          $self->{column_prev} = $self->{column};
7259          $self->{column}++;
7260          $self->{nc}
7261              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7262        } else {
7263          $self->{set_nc}->($self);
7264        }
7265      
7266            redo A;
7267          } elsif ($self->{nc} == 0x003E) { # >
7268            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7269            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7270            
7271        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7272          $self->{line_prev} = $self->{line};
7273          $self->{column_prev} = $self->{column};
7274          $self->{column}++;
7275          $self->{nc}
7276              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7277        } else {
7278          $self->{set_nc}->($self);
7279        }
7280      
7281            return  ($self->{ct}); # ATTLIST
7282            redo A;
7283          } elsif ($self->{nc} == -1) {
7284            ## XML5: No parse error.
7285            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7286            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7287            
7288        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7289          $self->{line_prev} = $self->{line};
7290          $self->{column_prev} = $self->{column};
7291          $self->{column}++;
7292          $self->{nc}
7293              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7294        } else {
7295          $self->{set_nc}->($self);
7296        }
7297      
7298            return  ($self->{ct});
7299            redo A;
7300          } else {
7301            $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7302                            line => $self->{line_prev},
7303                            column => $self->{column_prev});
7304            $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7305            $self->{state} = ALLOWED_TOKEN_STATE;
7306            
7307        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7308          $self->{line_prev} = $self->{line};
7309          $self->{column_prev} = $self->{column};
7310          $self->{column}++;
7311          $self->{nc}
7312              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7313        } else {
7314          $self->{set_nc}->($self);
7315        }
7316      
7317            redo A;
7318          }
7319        } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7320          if ($is_space->{$self->{nc}}) {
7321            $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7322            
7323        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7324          $self->{line_prev} = $self->{line};
7325          $self->{column_prev} = $self->{column};
7326          $self->{column}++;
7327          $self->{nc}
7328              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7329        } else {
7330          $self->{set_nc}->($self);
7331        }
7332      
7333            redo A;
7334          } elsif ($self->{nc} == 0x0023) { # #
7335            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7336            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7337            
7338        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7339          $self->{line_prev} = $self->{line};
7340          $self->{column_prev} = $self->{column};
7341          $self->{column}++;
7342          $self->{nc}
7343              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7344        } else {
7345          $self->{set_nc}->($self);
7346        }
7347      
7348            redo A;
7349          } elsif ($self->{nc} == 0x0022) { # "
7350            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7351            $self->{ca}->{value} = '';
7352            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7353            
7354        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7355          $self->{line_prev} = $self->{line};
7356          $self->{column_prev} = $self->{column};
7357          $self->{column}++;
7358          $self->{nc}
7359              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7360        } else {
7361          $self->{set_nc}->($self);
7362        }
7363      
7364            redo A;
7365          } elsif ($self->{nc} == 0x0027) { # '
7366            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7367            $self->{ca}->{value} = '';
7368            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7369            
7370        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7371          $self->{line_prev} = $self->{line};
7372          $self->{column_prev} = $self->{column};
7373          $self->{column}++;
7374          $self->{nc}
7375              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7376        } else {
7377          $self->{set_nc}->($self);
7378        }
7379      
7380            redo A;
7381          } elsif ($self->{nc} == 0x003E) { # >
7382            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7383            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7384            
7385        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7386          $self->{line_prev} = $self->{line};
7387          $self->{column_prev} = $self->{column};
7388          $self->{column}++;
7389          $self->{nc}
7390              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7391        } else {
7392          $self->{set_nc}->($self);
7393        }
7394      
7395            return  ($self->{ct}); # ATTLIST
7396            redo A;
7397          } elsif ($self->{nc} == -1) {
7398            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7399            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7400            
7401        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7402          $self->{line_prev} = $self->{line};
7403          $self->{column_prev} = $self->{column};
7404          $self->{column}++;
7405          $self->{nc}
7406              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7407        } else {
7408          $self->{set_nc}->($self);
7409        }
7410      
7411            return  ($self->{ct});
7412            redo A;
7413          } else {
7414            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7415            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7416            ## Reconsume.
7417            redo A;
7418          }
7419        } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7420          if ($is_space->{$self->{nc}}) {
7421            ## Stay in the state.
7422            
7423        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7424          $self->{line_prev} = $self->{line};
7425          $self->{column_prev} = $self->{column};
7426          $self->{column}++;
7427          $self->{nc}
7428              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7429        } else {
7430          $self->{set_nc}->($self);
7431        }
7432      
7433            redo A;
7434          } elsif ($self->{nc} == 0x0023) { # #
7435            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7436            
7437        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7438          $self->{line_prev} = $self->{line};
7439          $self->{column_prev} = $self->{column};
7440          $self->{column}++;
7441          $self->{nc}
7442              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7443        } else {
7444          $self->{set_nc}->($self);
7445        }
7446      
7447            redo A;
7448          } elsif ($self->{nc} == 0x0022) { # "
7449            $self->{ca}->{value} = '';
7450            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7451            
7452        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7453          $self->{line_prev} = $self->{line};
7454          $self->{column_prev} = $self->{column};
7455          $self->{column}++;
7456          $self->{nc}
7457              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7458        } else {
7459          $self->{set_nc}->($self);
7460        }
7461      
7462            redo A;
7463          } elsif ($self->{nc} == 0x0027) { # '
7464            $self->{ca}->{value} = '';
7465            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7466            
7467        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7468          $self->{line_prev} = $self->{line};
7469          $self->{column_prev} = $self->{column};
7470          $self->{column}++;
7471          $self->{nc}
7472              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7473        } else {
7474          $self->{set_nc}->($self);
7475        }
7476      
7477            redo A;
7478          } elsif ($self->{nc} == 0x003E) { # >
7479            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7480            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7481            
7482        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7483          $self->{line_prev} = $self->{line};
7484          $self->{column_prev} = $self->{column};
7485          $self->{column}++;
7486          $self->{nc}
7487              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7488        } else {
7489          $self->{set_nc}->($self);
7490        }
7491      
7492            return  ($self->{ct}); # ATTLIST
7493            redo A;
7494          } elsif ($self->{nc} == -1) {
7495            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7496            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7497            
7498        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7499          $self->{line_prev} = $self->{line};
7500          $self->{column_prev} = $self->{column};
7501          $self->{column}++;
7502          $self->{nc}
7503              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7504        } else {
7505          $self->{set_nc}->($self);
7506        }
7507      
7508            return  ($self->{ct});
7509            redo A;
7510          } else {
7511            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7512            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7513            ## Reconsume.
7514            redo A;
7515          }
7516        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7517          if ($is_space->{$self->{nc}}) {
7518            ## XML5: No parse error.
7519            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7520            $self->{state} = BOGUS_MD_STATE;
7521            ## Reconsume.
7522            redo A;
7523          } elsif ($self->{nc} == 0x0022) { # "
7524            ## XML5: Same as "anything else".
7525            $self->{ca}->{value} = '';
7526            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7527            
7528        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7529          $self->{line_prev} = $self->{line};
7530          $self->{column_prev} = $self->{column};
7531          $self->{column}++;
7532          $self->{nc}
7533              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7534        } else {
7535          $self->{set_nc}->($self);
7536        }
7537      
7538            redo A;
7539          } elsif ($self->{nc} == 0x0027) { # '
7540            ## XML5: Same as "anything else".
7541            $self->{ca}->{value} = '';
7542            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7543            
7544        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7545          $self->{line_prev} = $self->{line};
7546          $self->{column_prev} = $self->{column};
7547          $self->{column}++;
7548          $self->{nc}
7549              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7550        } else {
7551          $self->{set_nc}->($self);
7552        }
7553      
7554            redo A;
7555          } elsif ($self->{nc} == 0x003E) { # >
7556            ## XML5: Same as "anything else".
7557            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7558            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7559            
7560        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7561          $self->{line_prev} = $self->{line};
7562          $self->{column_prev} = $self->{column};
7563          $self->{column}++;
7564          $self->{nc}
7565              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7566        } else {
7567          $self->{set_nc}->($self);
7568        }
7569      
7570            return  ($self->{ct}); # ATTLIST
7571            redo A;
7572          } elsif ($self->{nc} == -1) {
7573            ## XML5: No parse error.
7574            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7575            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7576            
7577        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7578          $self->{line_prev} = $self->{line};
7579          $self->{column_prev} = $self->{column};
7580          $self->{column}++;
7581          $self->{nc}
7582              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7583        } else {
7584          $self->{set_nc}->($self);
7585        }
7586      
7587            return  ($self->{ct});
7588            redo A;
7589          } else {
7590            $self->{ca}->{default} = chr $self->{nc};
7591            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7592            
7593        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7594          $self->{line_prev} = $self->{line};
7595          $self->{column_prev} = $self->{column};
7596          $self->{column}++;
7597          $self->{nc}
7598              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7599        } else {
7600          $self->{set_nc}->($self);
7601        }
7602      
7603            redo A;
7604          }
7605        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7606          if ($is_space->{$self->{nc}}) {
7607            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7608            
7609        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7610          $self->{line_prev} = $self->{line};
7611          $self->{column_prev} = $self->{column};
7612          $self->{column}++;
7613          $self->{nc}
7614              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7615        } else {
7616          $self->{set_nc}->($self);
7617        }
7618      
7619            redo A;
7620          } elsif ($self->{nc} == 0x0022) { # "
7621            ## XML5: Same as "anything else".
7622            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7623            $self->{ca}->{value} = '';
7624            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7625            
7626        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7627          $self->{line_prev} = $self->{line};
7628          $self->{column_prev} = $self->{column};
7629          $self->{column}++;
7630          $self->{nc}
7631              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7632        } else {
7633          $self->{set_nc}->($self);
7634        }
7635      
7636            redo A;
7637          } elsif ($self->{nc} == 0x0027) { # '
7638            ## XML5: Same as "anything else".
7639            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7640            $self->{ca}->{value} = '';
7641            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7642            
7643        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7644          $self->{line_prev} = $self->{line};
7645          $self->{column_prev} = $self->{column};
7646          $self->{column}++;
7647          $self->{nc}
7648              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7649        } else {
7650          $self->{set_nc}->($self);
7651        }
7652      
7653            redo A;
7654          } elsif ($self->{nc} == 0x003E) { # >
7655            ## XML5: Same as "anything else".
7656            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7657            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7658            
7659        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7660          $self->{line_prev} = $self->{line};
7661          $self->{column_prev} = $self->{column};
7662          $self->{column}++;
7663          $self->{nc}
7664              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7665        } else {
7666          $self->{set_nc}->($self);
7667        }
7668      
7669            return  ($self->{ct}); # ATTLIST
7670            redo A;
7671          } elsif ($self->{nc} == -1) {
7672            ## XML5: No parse error.
7673            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7674            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7675            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7676            
7677        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7678          $self->{line_prev} = $self->{line};
7679          $self->{column_prev} = $self->{column};
7680          $self->{column}++;
7681          $self->{nc}
7682              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7683        } else {
7684          $self->{set_nc}->($self);
7685        }
7686      
7687            return  ($self->{ct});
7688            redo A;
7689          } else {
7690            $self->{ca}->{default} .= chr $self->{nc};
7691            ## Stay in the state.
7692            
7693        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7694          $self->{line_prev} = $self->{line};
7695          $self->{column_prev} = $self->{column};
7696          $self->{column}++;
7697          $self->{nc}
7698              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7699        } else {
7700          $self->{set_nc}->($self);
7701        }
7702      
7703            redo A;
7704          }
7705        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7706          if ($is_space->{$self->{nc}}) {
7707            ## Stay in the state.
7708            
7709        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7710          $self->{line_prev} = $self->{line};
7711          $self->{column_prev} = $self->{column};
7712          $self->{column}++;
7713          $self->{nc}
7714              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7715        } else {
7716          $self->{set_nc}->($self);
7717        }
7718      
7719            redo A;
7720          } elsif ($self->{nc} == 0x0022) { # "
7721            $self->{ca}->{value} = '';
7722            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7723            
7724        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7725          $self->{line_prev} = $self->{line};
7726          $self->{column_prev} = $self->{column};
7727          $self->{column}++;
7728          $self->{nc}
7729              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7730        } else {
7731          $self->{set_nc}->($self);
7732        }
7733      
7734            redo A;
7735          } elsif ($self->{nc} == 0x0027) { # '
7736            $self->{ca}->{value} = '';
7737            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7738            
7739        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7740          $self->{line_prev} = $self->{line};
7741          $self->{column_prev} = $self->{column};
7742          $self->{column}++;
7743          $self->{nc}
7744              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7745        } else {
7746          $self->{set_nc}->($self);
7747        }
7748      
7749            redo A;
7750          } elsif ($self->{nc} == 0x003E) { # >
7751            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7752            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7753            
7754        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7755          $self->{line_prev} = $self->{line};
7756          $self->{column_prev} = $self->{column};
7757          $self->{column}++;
7758          $self->{nc}
7759              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7760        } else {
7761          $self->{set_nc}->($self);
7762        }
7763      
7764            return  ($self->{ct}); # ATTLIST
7765            redo A;
7766          } elsif ($self->{nc} == -1) {
7767            ## XML5: No parse error.
7768            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7769            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7770            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7771            
7772        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7773          $self->{line_prev} = $self->{line};
7774          $self->{column_prev} = $self->{column};
7775          $self->{column}++;
7776          $self->{nc}
7777              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7778        } else {
7779          $self->{set_nc}->($self);
7780        }
7781      
7782            return  ($self->{ct});
7783            redo A;
7784          } else {
7785            ## XML5: Not defined yet.
7786            if ($self->{ca}->{default} eq 'FIXED') {
7787              $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7788            } else {
7789              push @{$self->{ct}->{attrdefs}}, $self->{ca};
7790              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7791            }
7792            ## Reconsume.
7793            redo A;
7794          }
7795        } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7796          if ($is_space->{$self->{nc}} or
7797              $self->{nc} == -1 or
7798              $self->{nc} == 0x003E) { # >
7799            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7800            ## Reconsume.
7801            redo A;
7802          } else {
7803            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7804            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7805            ## Reconsume.
7806            redo A;
7807          }
7808        } elsif ($self->{state} == NDATA_STATE) {
7809          ## ASCII case-insensitive
7810          if ($self->{nc} == [
7811                undef,
7812                0x0044, # D
7813                0x0041, # A
7814                0x0054, # T
7815              ]->[length $self->{kwd}] or
7816              $self->{nc} == [
7817                undef,
7818                0x0064, # d
7819                0x0061, # a
7820                0x0074, # t
7821              ]->[length $self->{kwd}]) {
7822            
7823            ## Stay in the state.
7824            $self->{kwd} .= chr $self->{nc};
7825            
7826        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7827          $self->{line_prev} = $self->{line};
7828          $self->{column_prev} = $self->{column};
7829          $self->{column}++;
7830          $self->{nc}
7831              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7832        } else {
7833          $self->{set_nc}->($self);
7834        }
7835      
7836            redo A;
7837          } elsif ((length $self->{kwd}) == 4 and
7838                   ($self->{nc} == 0x0041 or # A
7839                    $self->{nc} == 0x0061)) { # a
7840            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7841              
7842              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7843                              text => 'NDATA',
7844                              line => $self->{line_prev},
7845                              column => $self->{column_prev} - 4);
7846            } else {
7847              
7848            }
7849            $self->{state} = AFTER_NDATA_STATE;
7850            
7851        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7852          $self->{line_prev} = $self->{line};
7853          $self->{column_prev} = $self->{column};
7854          $self->{column}++;
7855          $self->{nc}
7856              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7857        } else {
7858          $self->{set_nc}->($self);
7859        }
7860      
7861            redo A;
7862          } else {
7863            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7864                            line => $self->{line_prev},
7865                            column => $self->{column_prev} + 1
7866                                - length $self->{kwd});
7867            
7868            $self->{state} = BOGUS_MD_STATE;
7869            ## Reconsume.
7870            redo A;
7871          }
7872        } elsif ($self->{state} == AFTER_NDATA_STATE) {
7873          if ($is_space->{$self->{nc}}) {
7874            $self->{state} = BEFORE_NOTATION_NAME_STATE;
7875            
7876        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7877          $self->{line_prev} = $self->{line};
7878          $self->{column_prev} = $self->{column};
7879          $self->{column}++;
7880          $self->{nc}
7881              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7882        } else {
7883          $self->{set_nc}->($self);
7884        }
7885      
7886            redo A;
7887          } elsif ($self->{nc} == 0x003E) { # >
7888            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7889            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7890            
7891        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7892          $self->{line_prev} = $self->{line};
7893          $self->{column_prev} = $self->{column};
7894          $self->{column}++;
7895          $self->{nc}
7896              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7897        } else {
7898          $self->{set_nc}->($self);
7899        }
7900      
7901            return  ($self->{ct}); # ENTITY
7902            redo A;
7903          } elsif ($self->{nc} == -1) {
7904            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7905            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7906            
7907        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7908          $self->{line_prev} = $self->{line};
7909          $self->{column_prev} = $self->{column};
7910          $self->{column}++;
7911          $self->{nc}
7912              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7913        } else {
7914          $self->{set_nc}->($self);
7915        }
7916      
7917            return  ($self->{ct}); # ENTITY
7918            redo A;
7919          } else {
7920            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7921                            line => $self->{line_prev},
7922                            column => $self->{column_prev} + 1
7923                                - length $self->{kwd});
7924            $self->{state} = BOGUS_MD_STATE;
7925            ## Reconsume.
7926            redo A;
7927          }
7928        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7929          if ($is_space->{$self->{nc}}) {
7930            ## Stay in the state.
7931            
7932        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7933          $self->{line_prev} = $self->{line};
7934          $self->{column_prev} = $self->{column};
7935          $self->{column}++;
7936          $self->{nc}
7937              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7938        } else {
7939          $self->{set_nc}->($self);
7940        }
7941      
7942            redo A;
7943          } elsif ($self->{nc} == 0x003E) { # >
7944            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7945            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7946            
7947        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7948          $self->{line_prev} = $self->{line};
7949          $self->{column_prev} = $self->{column};
7950          $self->{column}++;
7951          $self->{nc}
7952              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7953        } else {
7954          $self->{set_nc}->($self);
7955        }
7956      
7957            return  ($self->{ct}); # ENTITY
7958            redo A;
7959          } elsif ($self->{nc} == -1) {
7960            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7961            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7962            
7963        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7964          $self->{line_prev} = $self->{line};
7965          $self->{column_prev} = $self->{column};
7966          $self->{column}++;
7967          $self->{nc}
7968              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7969        } else {
7970          $self->{set_nc}->($self);
7971        }
7972      
7973            return  ($self->{ct}); # ENTITY
7974            redo A;
7975          } else {
7976            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7977            $self->{state} = NOTATION_NAME_STATE;
7978            
7979        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7980          $self->{line_prev} = $self->{line};
7981          $self->{column_prev} = $self->{column};
7982          $self->{column}++;
7983          $self->{nc}
7984              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7985        } else {
7986          $self->{set_nc}->($self);
7987        }
7988      
7989            redo A;
7990          }
7991        } elsif ($self->{state} == NOTATION_NAME_STATE) {
7992          if ($is_space->{$self->{nc}}) {
7993            $self->{state} = AFTER_MD_DEF_STATE;
7994            
7995        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7996          $self->{line_prev} = $self->{line};
7997          $self->{column_prev} = $self->{column};
7998          $self->{column}++;
7999          $self->{nc}
8000              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8001        } else {
8002          $self->{set_nc}->($self);
8003        }
8004      
8005            redo A;
8006          } elsif ($self->{nc} == 0x003E) { # >
8007            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8008            
8009        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8010          $self->{line_prev} = $self->{line};
8011          $self->{column_prev} = $self->{column};
8012          $self->{column}++;
8013          $self->{nc}
8014              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8015        } else {
8016          $self->{set_nc}->($self);
8017        }
8018      
8019            return  ($self->{ct}); # ENTITY
8020            redo A;
8021          } elsif ($self->{nc} == -1) {
8022            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8023            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8024            
8025        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8026          $self->{line_prev} = $self->{line};
8027          $self->{column_prev} = $self->{column};
8028          $self->{column}++;
8029          $self->{nc}
8030              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8031        } else {
8032          $self->{set_nc}->($self);
8033        }
8034      
8035            return  ($self->{ct}); # ENTITY
8036            redo A;
8037          } else {
8038            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
8039            ## Stay in the state.
8040            
8041        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8042          $self->{line_prev} = $self->{line};
8043          $self->{column_prev} = $self->{column};
8044          $self->{column}++;
8045          $self->{nc}
8046              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8047        } else {
8048          $self->{set_nc}->($self);
8049        }
8050      
8051            redo A;
8052          }
8053        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
8054          if ($self->{nc} == 0x0022) { # "
8055            $self->{state} = AFTER_MD_DEF_STATE;
8056            
8057        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8058          $self->{line_prev} = $self->{line};
8059          $self->{column_prev} = $self->{column};
8060          $self->{column}++;
8061          $self->{nc}
8062              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8063        } else {
8064          $self->{set_nc}->($self);
8065        }
8066      
8067            redo A;
8068          } elsif ($self->{nc} == 0x0026) { # &
8069            $self->{prev_state} = $self->{state};
8070            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8071            $self->{entity_add} = 0x0022; # "
8072            
8073        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8074          $self->{line_prev} = $self->{line};
8075          $self->{column_prev} = $self->{column};
8076          $self->{column}++;
8077          $self->{nc}
8078              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8079        } else {
8080          $self->{set_nc}->($self);
8081        }
8082      
8083            redo A;
8084    ## TODO: Handle "%" (parameter entity reference) in the entity value; only "&" is handled here so far.
8085          } elsif ($self->{nc} == -1) {
8086            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8087            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8088            ## Reconsume.
8089            return  ($self->{ct}); # ENTITY
8090            redo A;
8091          } else {
8092            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8093            
8094        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8095          $self->{line_prev} = $self->{line};
8096          $self->{column_prev} = $self->{column};
8097          $self->{column}++;
8098          $self->{nc}
8099              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8100        } else {
8101          $self->{set_nc}->($self);
8102        }
8103      
8104            redo A;
8105          }
8106        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
8107          if ($self->{nc} == 0x0027) { # '
8108            $self->{state} = AFTER_MD_DEF_STATE;
8109            
8110        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8111          $self->{line_prev} = $self->{line};
8112          $self->{column_prev} = $self->{column};
8113          $self->{column}++;
8114          $self->{nc}
8115              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8116        } else {
8117          $self->{set_nc}->($self);
8118        }
8119      
8120            redo A;
8121          } elsif ($self->{nc} == 0x0026) { # &
8122            $self->{prev_state} = $self->{state};
8123            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
8124            $self->{entity_add} = 0x0027; # '
8125            
8126        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8127          $self->{line_prev} = $self->{line};
8128          $self->{column_prev} = $self->{column};
8129          $self->{column}++;
8130          $self->{nc}
8131              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8132        } else {
8133          $self->{set_nc}->($self);
8134        }
8135      
8136            redo A;
8137    ## TODO: Handle "%" (parameter entity reference) in the entity value; only "&" is handled here so far.
8138          } elsif ($self->{nc} == -1) {
8139            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
8140            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8141            ## Reconsume.
8142            return  ($self->{ct}); # ENTITY
8143            redo A;
8144          } else {
8145            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
8146            
8147        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8148          $self->{line_prev} = $self->{line};
8149          $self->{column_prev} = $self->{column};
8150          $self->{column}++;
8151          $self->{nc}
8152              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8153        } else {
8154          $self->{set_nc}->($self);
8155        }
8156      
8157            redo A;
8158          }
8159        } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
8160          if ($is_space->{$self->{nc}} or
8161              {
8162                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
8163                $self->{entity_add} => 1,
8164              }->{$self->{nc}}) {
8165            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
8166                            line => $self->{line_prev},
8167                            column => $self->{column_prev}
8168                                + ($self->{nc} == -1 ? 1 : 0));
8169            ## Don't consume
8170            ## Return nothing.
8171            #
8172          } elsif ($self->{nc} == 0x0023) { # #
8173            $self->{ca} = $self->{ct};
8174            $self->{state} = ENTITY_HASH_STATE;
8175            $self->{kwd} = '#';
8176            
8177        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8178          $self->{line_prev} = $self->{line};
8179          $self->{column_prev} = $self->{column};
8180          $self->{column}++;
8181          $self->{nc}
8182              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8183        } else {
8184          $self->{set_nc}->($self);
8185        }
8186      
8187            redo A;
8188          } else {
8189            #
8190          }
8191    
8192          $self->{ct}->{value} .= '&';
8193          $self->{state} = $self->{prev_state};
8194          ## Reconsume.
8195          redo A;
8196        } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
8197          if ($is_space->{$self->{nc}}) {
8198            $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
8199            
8200        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8201          $self->{line_prev} = $self->{line};
8202          $self->{column_prev} = $self->{column};
8203          $self->{column}++;
8204          $self->{nc}
8205              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8206        } else {
8207          $self->{set_nc}->($self);
8208        }
8209      
8210            redo A;
8211          } elsif ($self->{nc} == 0x0028) { # (
8212            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8213            $self->{ct}->{content} = ['('];
8214            $self->{group_depth} = 1;
8215            
8216        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8217          $self->{line_prev} = $self->{line};
8218          $self->{column_prev} = $self->{column};
8219          $self->{column}++;
8220          $self->{nc}
8221              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8222        } else {
8223          $self->{set_nc}->($self);
8224        }
8225      
8226            redo A;
8227          } elsif ($self->{nc} == 0x003E) { # >
8228            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8229            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8230            
8231        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8232          $self->{line_prev} = $self->{line};
8233          $self->{column_prev} = $self->{column};
8234          $self->{column}++;
8235          $self->{nc}
8236              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8237        } else {
8238          $self->{set_nc}->($self);
8239        }
8240      
8241            return  ($self->{ct}); # ELEMENT
8242            redo A;
8243          } elsif ($self->{nc} == -1) {
8244            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8245            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8246            
8247        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8248          $self->{line_prev} = $self->{line};
8249          $self->{column_prev} = $self->{column};
8250          $self->{column}++;
8251          $self->{nc}
8252              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8253        } else {
8254          $self->{set_nc}->($self);
8255        }
8256      
8257            return  ($self->{ct}); # ELEMENT
8258            redo A;
8259          } else {
8260            $self->{ct}->{content} = [chr $self->{nc}];
8261            $self->{state} = CONTENT_KEYWORD_STATE;
8262            
8263        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8264          $self->{line_prev} = $self->{line};
8265          $self->{column_prev} = $self->{column};
8266          $self->{column}++;
8267          $self->{nc}
8268              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8269        } else {
8270          $self->{set_nc}->($self);
8271        }
8272      
8273            redo A;
8274          }
8275        } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8276          if ($is_space->{$self->{nc}}) {
8277            $self->{state} = AFTER_MD_DEF_STATE;
8278            
8279        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8280          $self->{line_prev} = $self->{line};
8281          $self->{column_prev} = $self->{column};
8282          $self->{column}++;
8283          $self->{nc}
8284              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8285        } else {
8286          $self->{set_nc}->($self);
8287        }
8288      
8289            redo A;
8290          } elsif ($self->{nc} == 0x003E) { # >
8291            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8292            
8293        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8294          $self->{line_prev} = $self->{line};
8295          $self->{column_prev} = $self->{column};
8296          $self->{column}++;
8297          $self->{nc}
8298              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8299        } else {
8300          $self->{set_nc}->($self);
8301        }
8302      
8303            return  ($self->{ct}); # ELEMENT
8304            redo A;
8305          } elsif ($self->{nc} == -1) {
8306            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8307            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8308            
8309        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8310          $self->{line_prev} = $self->{line};
8311          $self->{column_prev} = $self->{column};
8312          $self->{column}++;
8313          $self->{nc}
8314              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8315        } else {
8316          $self->{set_nc}->($self);
8317        }
8318      
8319            return  ($self->{ct}); # ELEMENT
8320            redo A;
8321          } else {
8322            $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8323            ## Stay in the state.
8324            
8325        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8326          $self->{line_prev} = $self->{line};
8327          $self->{column_prev} = $self->{column};
8328          $self->{column}++;
8329          $self->{nc}
8330              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8331        } else {
8332          $self->{set_nc}->($self);
8333        }
8334      
8335            redo A;
8336          }
8337        } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8338          if ($is_space->{$self->{nc}}) {
8339            ## Stay in the state.
8340            
8341        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8342          $self->{line_prev} = $self->{line};
8343          $self->{column_prev} = $self->{column};
8344          $self->{column}++;
8345          $self->{nc}
8346              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8347        } else {
8348          $self->{set_nc}->($self);
8349        }
8350      
8351            redo A;
8352          } elsif ($self->{nc} == 0x0028) { # (
8353            $self->{group_depth}++;
8354            push @{$self->{ct}->{content}}, chr $self->{nc};
8355            ## Stay in the state.
8356            
8357        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8358          $self->{line_prev} = $self->{line};
8359          $self->{column_prev} = $self->{column};
8360          $self->{column}++;
8361          $self->{nc}
8362              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8363        } else {
8364          $self->{set_nc}->($self);
8365        }
8366      
8367            redo A;
8368          } elsif ($self->{nc} == 0x007C or # |
8369                   $self->{nc} == 0x002C) { # ,
8370            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8371            ## Stay in the state.
8372            
8373        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8374          $self->{line_prev} = $self->{line};
8375          $self->{column_prev} = $self->{column};
8376          $self->{column}++;
8377          $self->{nc}
8378              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8379        } else {
8380          $self->{set_nc}->($self);
8381        }
8382      
8383            redo A;
8384          } elsif ($self->{nc} == 0x0029) { # )
8385            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8386            push @{$self->{ct}->{content}}, chr $self->{nc};
8387            $self->{group_depth}--;
8388            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8389            
8390        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8391          $self->{line_prev} = $self->{line};
8392          $self->{column_prev} = $self->{column};
8393          $self->{column}++;
8394          $self->{nc}
8395              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8396        } else {
8397          $self->{set_nc}->($self);
8398        }
8399      
8400            redo A;
8401          } elsif ($self->{nc} == 0x003E) { # >
8402            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8403            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8404            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8405            
8406        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8407          $self->{line_prev} = $self->{line};
8408          $self->{column_prev} = $self->{column};
8409          $self->{column}++;
8410          $self->{nc}
8411              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8412        } else {
8413          $self->{set_nc}->($self);
8414        }
8415      
8416            return  ($self->{ct}); # ELEMENT
8417            redo A;
8418          } elsif ($self->{nc} == -1) {
8419            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8420            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8421            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8422            
8423        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8424          $self->{line_prev} = $self->{line};
8425          $self->{column_prev} = $self->{column};
8426          $self->{column}++;
8427          $self->{nc}
8428              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8429        } else {
8430          $self->{set_nc}->($self);
8431        }
8432      
8433            return  ($self->{ct}); # ELEMENT
8434            redo A;
8435          } else {
8436            push @{$self->{ct}->{content}}, chr $self->{nc};
8437            $self->{state} = CM_ELEMENT_NAME_STATE;
8438            
8439        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8440          $self->{line_prev} = $self->{line};
8441          $self->{column_prev} = $self->{column};
8442          $self->{column}++;
8443          $self->{nc}
8444              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8445        } else {
8446          $self->{set_nc}->($self);
8447        }
8448      
8449            redo A;
8450          }
8451        } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8452          if ($is_space->{$self->{nc}}) {
8453            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8454            
8455        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8456          $self->{line_prev} = $self->{line};
8457          $self->{column_prev} = $self->{column};
8458          $self->{column}++;
8459          $self->{nc}
8460              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8461        } else {
8462          $self->{set_nc}->($self);
8463        }
8464      
8465            redo A;
8466          } elsif ($self->{nc} == 0x002A or # *
8467                   $self->{nc} == 0x002B or # +
8468                   $self->{nc} == 0x003F) { # ?
8469            push @{$self->{ct}->{content}}, chr $self->{nc};
8470            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8471            
8472        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8473          $self->{line_prev} = $self->{line};
8474          $self->{column_prev} = $self->{column};
8475          $self->{column}++;
8476          $self->{nc}
8477              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8478        } else {
8479          $self->{set_nc}->($self);
8480        }
8481      
8482            redo A;
8483          } elsif ($self->{nc} == 0x007C or # |
8484                   $self->{nc} == 0x002C) { # ,
8485            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8486            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8487            
8488        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8489          $self->{line_prev} = $self->{line};
8490          $self->{column_prev} = $self->{column};
8491          $self->{column}++;
8492          $self->{nc}
8493              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8494        } else {
8495          $self->{set_nc}->($self);
8496        }
8497      
8498            redo A;
8499          } elsif ($self->{nc} == 0x0029) { # )
8500            $self->{group_depth}--;
8501            push @{$self->{ct}->{content}}, chr $self->{nc};
8502            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8503            
8504        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8505          $self->{line_prev} = $self->{line};
8506          $self->{column_prev} = $self->{column};
8507          $self->{column}++;
8508          $self->{nc}
8509              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8510        } else {
8511          $self->{set_nc}->($self);
8512        }
8513      
8514            redo A;
8515          } elsif ($self->{nc} == 0x003E) { # >
8516            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8517            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8518            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8519            
8520        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8521          $self->{line_prev} = $self->{line};
8522          $self->{column_prev} = $self->{column};
8523          $self->{column}++;
8524          $self->{nc}
8525              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8526        } else {
8527          $self->{set_nc}->($self);
8528        }
8529      
8530            return  ($self->{ct}); # ELEMENT
8531            redo A;
8532          } elsif ($self->{nc} == -1) {
8533            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8534            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8535            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8536            
8537        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8538          $self->{line_prev} = $self->{line};
8539          $self->{column_prev} = $self->{column};
8540          $self->{column}++;
8541          $self->{nc}
8542              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8543        } else {
8544          $self->{set_nc}->($self);
8545        }
8546      
8547            return  ($self->{ct}); # ELEMENT
8548            redo A;
8549          } else {
8550            $self->{ct}->{content}->[-1] .= chr $self->{nc};
8551            ## Stay in the state.
8552            
8553        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8554          $self->{line_prev} = $self->{line};
8555          $self->{column_prev} = $self->{column};
8556          $self->{column}++;
8557          $self->{nc}
8558              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8559        } else {
8560          $self->{set_nc}->($self);
8561        }
8562      
8563            redo A;
8564          }
8565        } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8566          if ($is_space->{$self->{nc}}) {
8567            ## Stay in the state.
8568            
8569        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8570          $self->{line_prev} = $self->{line};
8571          $self->{column_prev} = $self->{column};
8572          $self->{column}++;
8573          $self->{nc}
8574              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8575        } else {
8576          $self->{set_nc}->($self);
8577        }
8578      
8579            redo A;
8580          } elsif ($self->{nc} == 0x007C or # |
8581                   $self->{nc} == 0x002C) { # ,
8582            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8583            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8584            
8585        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8586          $self->{line_prev} = $self->{line};
8587          $self->{column_prev} = $self->{column};
8588          $self->{column}++;
8589          $self->{nc}
8590              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8591        } else {
8592          $self->{set_nc}->($self);
8593        }
8594      
8595            redo A;
8596          } elsif ($self->{nc} == 0x0029) { # )
8597            $self->{group_depth}--;
8598            push @{$self->{ct}->{content}}, chr $self->{nc};
8599            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8600            
8601        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8602          $self->{line_prev} = $self->{line};
8603          $self->{column_prev} = $self->{column};
8604          $self->{column}++;
8605          $self->{nc}
8606              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8607        } else {
8608          $self->{set_nc}->($self);
8609        }
8610      
8611            redo A;
8612          } elsif ($self->{nc} == 0x003E) { # >
8613            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8614            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8615            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8616            
8617        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8618          $self->{line_prev} = $self->{line};
8619          $self->{column_prev} = $self->{column};
8620          $self->{column}++;
8621          $self->{nc}
8622              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8623        } else {
8624          $self->{set_nc}->($self);
8625        }
8626      
8627            return  ($self->{ct}); # ELEMENT
8628            redo A;
8629          } elsif ($self->{nc} == -1) {
8630            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8631            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8632            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8633            
8634        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8635          $self->{line_prev} = $self->{line};
8636          $self->{column_prev} = $self->{column};
8637          $self->{column}++;
8638          $self->{nc}
8639              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8640        } else {
8641          $self->{set_nc}->($self);
8642        }
8643      
8644            return  ($self->{ct}); # ELEMENT
8645            redo A;
8646          } else {
8647            $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8648            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8649            $self->{state} = BOGUS_MD_STATE;
8650            
8651        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8652          $self->{line_prev} = $self->{line};
8653          $self->{column_prev} = $self->{column};
8654          $self->{column}++;
8655          $self->{nc}
8656              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8657        } else {
8658          $self->{set_nc}->($self);
8659        }
8660      
8661            redo A;
8662          }
8663        } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8664          if ($is_space->{$self->{nc}}) {
8665            if ($self->{group_depth}) {
8666              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8667            } else {
8668              $self->{state} = AFTER_MD_DEF_STATE;
8669            }
8670            
8671        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8672          $self->{line_prev} = $self->{line};
8673          $self->{column_prev} = $self->{column};
8674          $self->{column}++;
8675          $self->{nc}
8676              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8677        } else {
8678          $self->{set_nc}->($self);
8679        }
8680      
8681            redo A;
8682          } elsif ($self->{nc} == 0x002A or # *
8683                   $self->{nc} == 0x002B or # +
8684                   $self->{nc} == 0x003F) { # ?
8685            push @{$self->{ct}->{content}}, chr $self->{nc};
8686            if ($self->{group_depth}) {
8687              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8688            } else {
8689              $self->{state} = AFTER_MD_DEF_STATE;
8690            }
8691            
8692        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8693          $self->{line_prev} = $self->{line};
8694          $self->{column_prev} = $self->{column};
8695          $self->{column}++;
8696          $self->{nc}
8697              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8698        } else {
8699          $self->{set_nc}->($self);
8700        }
8701      
8702            redo A;
8703          } elsif ($self->{nc} == 0x0029) { # )
8704            if ($self->{group_depth}) {
8705              $self->{group_depth}--;
8706              push @{$self->{ct}->{content}}, chr $self->{nc};
8707              ## Stay in the state.
8708              
8709        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8710          $self->{line_prev} = $self->{line};
8711          $self->{column_prev} = $self->{column};
8712          $self->{column}++;
8713          $self->{nc}
8714              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8715        } else {
8716          $self->{set_nc}->($self);
8717        }
8718      
8719              redo A;
8720            } else {
8721              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8722              $self->{state} = BOGUS_MD_STATE;
8723              ## Reconsume.
8724              redo A;
8725            }
8726          } elsif ($self->{nc} == 0x003E) { # >
8727            if ($self->{group_depth}) {
8728              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8729              push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8730            }
8731            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8732            
8733        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8734          $self->{line_prev} = $self->{line};
8735          $self->{column_prev} = $self->{column};
8736          $self->{column}++;
8737          $self->{nc}
8738              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8739        } else {
8740          $self->{set_nc}->($self);
8741        }
8742      
8743            return  ($self->{ct}); # ELEMENT
8744            redo A;
8745          } elsif ($self->{nc} == -1) {
8746            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8747            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8748            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8749            
8750        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8751          $self->{line_prev} = $self->{line};
8752          $self->{column_prev} = $self->{column};
8753          $self->{column}++;
8754          $self->{nc}
8755              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8756        } else {
8757          $self->{set_nc}->($self);
8758        }
8759      
8760            return  ($self->{ct}); # ELEMENT
8761            redo A;
8762          } else {
8763            if ($self->{group_depth}) {
8764              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8765            } else {
8766              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8767              $self->{state} = BOGUS_MD_STATE;
8768            }
8769            ## Reconsume.
8770            redo A;
8771          }
8772        } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8773          if ($is_space->{$self->{nc}}) {
8774            ## Stay in the state.
8775            
8776        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8777          $self->{line_prev} = $self->{line};
8778          $self->{column_prev} = $self->{column};
8779          $self->{column}++;
8780          $self->{nc}
8781              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8782        } else {
8783          $self->{set_nc}->($self);
8784        }
8785      
8786            redo A;
8787          } elsif ($self->{nc} == 0x003E) { # >
8788            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8789            
8790        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8791          $self->{line_prev} = $self->{line};
8792          $self->{column_prev} = $self->{column};
8793          $self->{column}++;
8794          $self->{nc}
8795              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8796        } else {
8797          $self->{set_nc}->($self);
8798        }
8799      
8800            return  ($self->{ct}); # ENTITY/ELEMENT
8801            redo A;
8802          } elsif ($self->{nc} == -1) {
8803            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8804            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8805            
8806        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8807          $self->{line_prev} = $self->{line};
8808          $self->{column_prev} = $self->{column};
8809          $self->{column}++;
8810          $self->{nc}
8811              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8812        } else {
8813          $self->{set_nc}->($self);
8814        }
8815      
8816            return  ($self->{ct}); # ENTITY/ELEMENT
8817            redo A;
8818          } else {
8819            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8820            $self->{state} = BOGUS_MD_STATE;
8821            ## Reconsume.
8822            redo A;
8823          }
8824        } elsif ($self->{state} == BOGUS_MD_STATE) {
8825          if ($self->{nc} == 0x003E) { # >
8826            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8827            
8828        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8829          $self->{line_prev} = $self->{line};
8830          $self->{column_prev} = $self->{column};
8831          $self->{column}++;
8832          $self->{nc}
8833              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8834        } else {
8835          $self->{set_nc}->($self);
8836        }
8837      
8838            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8839            redo A;
8840          } elsif ($self->{nc} == -1) {
8841            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8842            ## Reconsume.
8843            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8844            redo A;
8845          } else {
8846            ## Stay in the state.
8847            
8848        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8849          $self->{line_prev} = $self->{line};
8850          $self->{column_prev} = $self->{column};
8851          $self->{column}++;
8852          $self->{nc}
8853              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8854        } else {
8855          $self->{set_nc}->($self);
8856        }
8857      
8858            redo A;
8859          }
8860      } else {      } else {
8861        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
8862      }      }
# Line 4558  sub _get_next_token ($) { Line 8867  sub _get_next_token ($) {
8867    
8868  1;  1;
8869  ## $Date$  ## $Date$
8870                                    

Legend:
Removed from v.1.9  
changed lines
  Added in v.1.34

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24