/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1 by wakaba, Tue Oct 14 02:27:58 2008 UTC revision 1.18 by wakaba, Sun Oct 19 06:14:57 2008 UTC
# Line 2  package Whatpm::HTML::Tokenizer; Line 2  package Whatpm::HTML::Tokenizer;
2  use strict;  use strict;
3  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5    BEGIN {
6      require Exporter;
7      push our @ISA, 'Exporter';
8    
9      our @EXPORT_OK = qw(
10        DOCTYPE_TOKEN
11        COMMENT_TOKEN
12        START_TAG_TOKEN
13        END_TAG_TOKEN
14        END_OF_FILE_TOKEN
15        CHARACTER_TOKEN
16        PI_TOKEN
17        ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24      );
25      
26      our %EXPORT_TAGS = (
27        token => [qw(
28          DOCTYPE_TOKEN
29          COMMENT_TOKEN
30          START_TAG_TOKEN
31          END_TAG_TOKEN
32          END_OF_FILE_TOKEN
33          CHARACTER_TOKEN
34          PI_TOKEN
35          ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42        )],
43      );
44    }
45    
46    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48    ## Token types
49    
50    sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51    sub COMMENT_TOKEN () { 2 }
52    sub START_TAG_TOKEN () { 3 }
53    sub END_TAG_TOKEN () { 4 }
54    sub END_OF_FILE_TOKEN () { 5 }
55    sub CHARACTER_TOKEN () { 6 }
56    sub PI_TOKEN () { 7 } ## NOTE: XML only.
57    sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65    ## XML5: XML5 has "empty tag token".  In this implementation, it is
66    ## represented as a start tag token with the $self->{self_closing} flag
67    ## set to true.
68    
69    ## XML5: XML5 has "short end tag token".  In this implementation, it
70    ## is represented as an end tag token with $token->{tag_name} set
71    ## to an empty string.
72    
73  package Whatpm::HTML;  package Whatpm::HTML;
74    
75    BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77  ## Content model flags  ## Content model flags
78    
79  sub CM_ENTITY () { 0b001 } # & markup in data  sub CM_ENTITY () { 0b001 } # & markup in data
# Line 72  sub HEXREF_HEX_STATE () { 48 } Line 142  sub HEXREF_HEX_STATE () { 48 }
142  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
143  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145  ## Token types  ## XML-only states
146    sub PI_STATE () { 51 }
147  sub DOCTYPE_TOKEN () { 1 }  sub PI_TARGET_STATE () { 52 }
148  sub COMMENT_TOKEN () { 2 }  sub PI_TARGET_AFTER_STATE () { 53 }
149  sub START_TAG_TOKEN () { 3 }  sub PI_DATA_STATE () { 54 }
150  sub END_TAG_TOKEN () { 4 }  sub PI_AFTER_STATE () { 55 }
151  sub END_OF_FILE_TOKEN () { 5 }  sub PI_DATA_AFTER_STATE () { 56 }
152  sub CHARACTER_TOKEN () { 6 }  sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155    sub DOCTYPE_TAG_STATE () { 60 }
156    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157    sub MD_ATTLIST_STATE () { 62 }
158    sub MD_E_STATE () { 63 }
159    sub MD_ELEMENT_STATE () { 64 }
160    sub MD_ENTITY_STATE () { 65 }
161    sub MD_NOTATION_STATE () { 66 }
162    sub DOCTYPE_MD_STATE () { 67 }
163    sub BEFORE_MD_NAME_STATE () { 68 }
164    sub MD_NAME_STATE () { 69 }
165    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171    sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172    sub ALLOWED_TOKEN_STATE () { 77 }
173    sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174    sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175    sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179    sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180    sub BEFORE_NDATA_STATE () { 85 }
181    sub NDATA_STATE () { 86 }
182    sub AFTER_NDATA_STATE () { 87 }
183    sub BEFORE_NOTATION_NAME_STATE () { 88 }
184    sub NOTATION_NAME_STATE () { 89 }
185    sub AFTER_NOTATION_NAME_STATE () { 90 }
186    sub BOGUS_MD_STATE () { 91 }
187    
188  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
189  ## list and descriptions)  ## list and descriptions)
# Line 142  sub _initialize_tokenizer ($) { Line 246  sub _initialize_tokenizer ($) {
246    #$self->{level}    #$self->{level}
247    #$self->{set_nc}    #$self->{set_nc}
248    #$self->{parse_error}    #$self->{parse_error}
249      #$self->{is_xml} (if XML)
250    
251    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
252    #$self->{s_kwd}; # state keyword - initialized when used    $self->{s_kwd} = ''; # Data state keyword
253      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
254    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
255    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
256    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 174  sub _initialize_tokenizer ($) { Line 280  sub _initialize_tokenizer ($) {
280    
281  ## A token has:  ## A token has:
282  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
283  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
284  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
285  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
286    ##   ->{target} (PI_TOKEN)
287  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
288  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
289  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 184  sub _initialize_tokenizer ($) { Line 291  sub _initialize_tokenizer ($) {
291  ##        ->{name}  ##        ->{name}
292  ##        ->{value}  ##        ->{value}
293  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
294  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
295    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
296    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
297    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
298    ##   ->{has_internal_subset} == 1 or 0 (DOCTYPE_TOKEN)
299    
300  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
301  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
302  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 204  my $is_space = { Line 316  my $is_space = {
316    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
317    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
318    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
319    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
320    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
321    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
322  };  };
# Line 328  sub _get_next_token ($) { Line 440  sub _get_next_token ($) {
440          }          }
441        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
442          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
443            $self->{s_kwd} .= '-';            if ($self->{s_kwd} eq '<!-') {
             
           if ($self->{s_kwd} eq '<!--') {  
444                            
445              $self->{escape} = 1; # unless $self->{escape};              $self->{escape} = 1; # unless $self->{escape};
446              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
447              #              #
448            } elsif ($self->{s_kwd} eq '---') {            } elsif ($self->{s_kwd} eq '-') {
449                            
450              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
451              #              #
452              } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
453                
454                $self->{s_kwd} .= '-';
455                #
456            } else {            } else {
457                            
458                $self->{s_kwd} = '-';
459              #              #
460            }            }
461          }          }
# Line 386  sub _get_next_token ($) { Line 501  sub _get_next_token ($) {
501            if ($self->{s_kwd} eq '--') {            if ($self->{s_kwd} eq '--') {
502                            
503              delete $self->{escape};              delete $self->{escape};
504                #
505            } else {            } else {
506                            
507                #
508            }            }
509            } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
510              
511              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
512                              line => $self->{line_prev},
513                              column => $self->{column_prev} - 1);
514              #
515          } else {          } else {
516                        
517              #
518          }          }
519                    
520          $self->{s_kwd} = '';          $self->{s_kwd} = '';
521          #          #
522          } elsif ($self->{nc} == 0x005D) { # ]
523            if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
524              
525              $self->{s_kwd} .= ']';
526            } elsif ($self->{s_kwd} eq ']]') {
527              
528              #
529            } else {
530              
531              $self->{s_kwd} = '';
532            }
533            #
534        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
535                    
536          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 412  sub _get_next_token ($) { Line 548  sub _get_next_token ($) {
548                     data => chr $self->{nc},                     data => chr $self->{nc},
549                     line => $self->{line}, column => $self->{column},                     line => $self->{line}, column => $self->{column},
550                    };                    };
551        if ($self->{read_until}->($token->{data}, q[-!<>&],        if ($self->{read_until}->($token->{data}, q{-!<>&\]},
552                                  length $token->{data})) {                                  length $token->{data})) {
553          $self->{s_kwd} = '';          $self->{s_kwd} = '';
554        }        }
555    
556        ## Stay in the data state.        ## Stay in the data state.
557        if ($self->{content_model} == PCDATA_CONTENT_MODEL) {        if (not $self->{is_xml} and
558              $self->{content_model} == PCDATA_CONTENT_MODEL) {
559                    
560          $self->{state} = PCDATA_STATE;          $self->{state} = PCDATA_STATE;
561        } else {        } else {
# Line 439  sub _get_next_token ($) { Line 576  sub _get_next_token ($) {
576        return  ($token);        return  ($token);
577        redo A;        redo A;
578      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
579          ## XML5: "tag state".
580    
581        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
582          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
583                        
# Line 457  sub _get_next_token ($) { Line 596  sub _get_next_token ($) {
596            redo A;            redo A;
597          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
598                        
599            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
600            #            #
601          } else {          } else {
602                        
603              $self->{s_kwd} = '';
604            #            #
605          }          }
606    
# Line 507  sub _get_next_token ($) { Line 647  sub _get_next_token ($) {
647                        
648            $self->{ct}            $self->{ct}
649              = {type => START_TAG_TOKEN,              = {type => START_TAG_TOKEN,
650                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
651                 line => $self->{line_prev},                 line => $self->{line_prev},
652                 column => $self->{column_prev}};                 column => $self->{column_prev}};
653            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
# Line 549  sub _get_next_token ($) { Line 689  sub _get_next_token ($) {
689                            line => $self->{line_prev},                            line => $self->{line_prev},
690                            column => $self->{column_prev});                            column => $self->{column_prev});
691            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
692              $self->{s_kwd} = '';
693                        
694      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
695        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 568  sub _get_next_token ($) { Line 709  sub _get_next_token ($) {
709    
710            redo A;            redo A;
711          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
712                        if ($self->{is_xml}) {
713            $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',              
714                            line => $self->{line_prev},              $self->{state} = PI_STATE;
715                            column => $self->{column_prev});              
716            $self->{state} = BOGUS_COMMENT_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
717            $self->{ct} = {type => COMMENT_TOKEN, data => '',        $self->{line_prev} = $self->{line};
718                                      line => $self->{line_prev},        $self->{column_prev} = $self->{column};
719                                      column => $self->{column_prev},        $self->{column}++;
720                                     };        $self->{nc}
721            ## $self->{nc} is intentionally left as is            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
722            redo A;      } else {
723          } else {        $self->{set_nc}->($self);
724        }
725      
726                redo A;
727              } else {
728                
729                $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
730                                line => $self->{line_prev},
731                                column => $self->{column_prev});
732                $self->{state} = BOGUS_COMMENT_STATE;
733                $self->{ct} = {type => COMMENT_TOKEN, data => '',
734                               line => $self->{line_prev},
735                               column => $self->{column_prev},
736                              };
737                ## $self->{nc} is intentionally left as is
738                redo A;
739              }
740            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
741                        
742            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
743                            line => $self->{line_prev},                            line => $self->{line_prev},
744                            column => $self->{column_prev});                            column => $self->{column_prev});
745            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
746              $self->{s_kwd} = '';
747            ## reconsume            ## reconsume
748    
749            return  ({type => CHARACTER_TOKEN, data => '<',            return  ({type => CHARACTER_TOKEN, data => '<',
# Line 593  sub _get_next_token ($) { Line 752  sub _get_next_token ($) {
752                     });                     });
753    
754            redo A;            redo A;
755            } else {
756              ## XML5: "<:" is a parse error.
757              
758              $self->{ct} = {type => START_TAG_TOKEN,
759                                        tag_name => chr ($self->{nc}),
760                                        line => $self->{line_prev},
761                                        column => $self->{column_prev}};
762              $self->{state} = TAG_NAME_STATE;
763              
764        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
765          $self->{line_prev} = $self->{line};
766          $self->{column_prev} = $self->{column};
767          $self->{column}++;
768          $self->{nc}
769              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
770        } else {
771          $self->{set_nc}->($self);
772        }
773      
774              redo A;
775          }          }
776        } else {        } else {
777          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 601  sub _get_next_token ($) { Line 780  sub _get_next_token ($) {
780        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
781        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
782    
783          ## XML5: "end tag state".
784    
785        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
786        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
787          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
788            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
789            $self->{s_kwd} = '';            $self->{kwd} = '';
790            ## Reconsume.            ## Reconsume.
791            redo A;            redo A;
792          } else {          } else {
# Line 613  sub _get_next_token ($) { Line 794  sub _get_next_token ($) {
794            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
795                        
796            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
797              $self->{s_kwd} = '';
798            ## Reconsume.            ## Reconsume.
799            return  ({type => CHARACTER_TOKEN, data => '</',            return  ({type => CHARACTER_TOKEN, data => '</',
800                      line => $l, column => $c,                      line => $l, column => $c,
# Line 626  sub _get_next_token ($) { Line 808  sub _get_next_token ($) {
808                    
809          $self->{ct}          $self->{ct}
810              = {type => END_TAG_TOKEN,              = {type => END_TAG_TOKEN,
811                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
812                 line => $l, column => $c};                 line => $l, column => $c};
813          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
814                    
# Line 661  sub _get_next_token ($) { Line 843  sub _get_next_token ($) {
843        
844          redo A;          redo A;
845        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
846          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
847                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
848                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
849          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
850                    $self->{s_kwd} = '';
851            if ($self->{is_xml}) {
852              
853              ## XML5: No parse error.
854              
855              ## NOTE: This parser raises a parse error, since it supports
856              ## XML1, not XML5.
857    
858              ## NOTE: A short end tag token.
859              my $ct = {type => END_TAG_TOKEN,
860                        tag_name => '',
861                        line => $self->{line_prev},
862                        column => $self->{column_prev} - 1,
863                       };
864              
865      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
866        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
867        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 677  sub _get_next_token ($) { Line 872  sub _get_next_token ($) {
872        $self->{set_nc}->($self);        $self->{set_nc}->($self);
873      }      }
874        
875              return  ($ct);
876            } else {
877              
878              
879        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
880          $self->{line_prev} = $self->{line};
881          $self->{column_prev} = $self->{column};
882          $self->{column}++;
883          $self->{nc}
884              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
885        } else {
886          $self->{set_nc}->($self);
887        }
888      
889            }
890          redo A;          redo A;
891        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
892                    
893          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
894            $self->{s_kwd} = '';
895          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
896          # reconsume          # reconsume
897    
# Line 689  sub _get_next_token ($) { Line 900  sub _get_next_token ($) {
900                   });                   });
901    
902          redo A;          redo A;
903        } else {        } elsif (not $self->{is_xml} or
904                   $is_space->{$self->{nc}}) {
905                    
906          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
907                            line => $self->{line_prev}, # "<" of "</"
908                            column => $self->{column_prev} - 1);
909          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
910          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
911                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 704  sub _get_next_token ($) { Line 918  sub _get_next_token ($) {
918          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
919          ## "bogus comment state" entry.          ## "bogus comment state" entry.
920          redo A;          redo A;
921          } else {
922            ## XML5: "</:" is a parse error.
923            
924            $self->{ct} = {type => END_TAG_TOKEN,
925                           tag_name => chr ($self->{nc}),
926                           line => $l, column => $c};
927            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
928            
929        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
930          $self->{line_prev} = $self->{line};
931          $self->{column_prev} = $self->{column};
932          $self->{column}++;
933          $self->{nc}
934              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
935        } else {
936          $self->{set_nc}->($self);
937        }
938      
939            redo A;
940        }        }
941      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
942        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
943        if (length $ch) {        if (length $ch) {
944          my $CH = $ch;          my $CH = $ch;
945          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 714  sub _get_next_token ($) { Line 947  sub _get_next_token ($) {
947          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
948                        
949            ## Stay in the state.            ## Stay in the state.
950            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
951                        
952      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
953        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 730  sub _get_next_token ($) { Line 963  sub _get_next_token ($) {
963          } else {          } else {
964                        
965            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
966              $self->{s_kwd} = '';
967            ## Reconsume.            ## Reconsume.
968            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
969                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
970                      line => $self->{line_prev},                      line => $self->{line_prev},
971                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
972                     });                     });
973            redo A;            redo A;
974          }          }
# Line 748  sub _get_next_token ($) { Line 982  sub _get_next_token ($) {
982                        
983            ## Reconsume.            ## Reconsume.
984            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
985              $self->{s_kwd} = '';
986            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
987                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
988                      line => $self->{line_prev},                      line => $self->{line_prev},
989                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
990                     });                     });
991            redo A;            redo A;
992          } else {          } else {
# Line 760  sub _get_next_token ($) { Line 995  sub _get_next_token ($) {
995                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
996                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
997                   line => $self->{line_prev},                   line => $self->{line_prev},
998                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
999            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
1000            ## Reconsume.            ## Reconsume.
1001            redo A;            redo A;
# Line 799  sub _get_next_token ($) { Line 1034  sub _get_next_token ($) {
1034            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1035          }          }
1036          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1037            $self->{s_kwd} = '';
1038                    
1039      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1040        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 817  sub _get_next_token ($) { Line 1053  sub _get_next_token ($) {
1053        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
1054                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1055                    
1056          $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);          $self->{ct}->{tag_name}
1057                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1058            # start tag or end tag            # start tag or end tag
1059          ## Stay in this state          ## Stay in this state
1060                    
# Line 850  sub _get_next_token ($) { Line 1087  sub _get_next_token ($) {
1087            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1088          }          }
1089          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1090            $self->{s_kwd} = '';
1091          # reconsume          # reconsume
1092    
1093          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 889  sub _get_next_token ($) { Line 1127  sub _get_next_token ($) {
1127          redo A;          redo A;
1128        }        }
1129      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1130          ## XML5: "Tag attribute name before state".
1131    
1132        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1133                    
1134          ## Stay in the state          ## Stay in the state
# Line 920  sub _get_next_token ($) { Line 1160  sub _get_next_token ($) {
1160            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1161          }          }
1162          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1163            $self->{s_kwd} = '';
1164                    
1165      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1166        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 939  sub _get_next_token ($) { Line 1180  sub _get_next_token ($) {
1180                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1181                    
1182          $self->{ca}          $self->{ca}
1183              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1184                 value => '',                 value => '',
1185                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1186          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 987  sub _get_next_token ($) { Line 1228  sub _get_next_token ($) {
1228            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1229          }          }
1230          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1231            $self->{s_kwd} = '';
1232          # reconsume          # reconsume
1233    
1234          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 999  sub _get_next_token ($) { Line 1241  sub _get_next_token ($) {
1241               0x003D => 1, # =               0x003D => 1, # =
1242              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1243                        
1244              ## XML5: Not a parse error.
1245            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1246          } else {          } else {
1247                        
1248              ## XML5: ":" raises a parse error and is ignored.
1249          }          }
1250          $self->{ca}          $self->{ca}
1251              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 1022  sub _get_next_token ($) { Line 1266  sub _get_next_token ($) {
1266          redo A;          redo A;
1267        }        }
1268      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1269          ## XML5: "Tag attribute name state".
1270    
1271        my $before_leave = sub {        my $before_leave = sub {
1272          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
1273              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 1032  sub _get_next_token ($) { Line 1278  sub _get_next_token ($) {
1278                        
1279            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
1280              = $self->{ca};              = $self->{ca};
1281              $self->{ca}->{index} = ++$self->{ct}->{last_index};
1282          }          }
1283        }; # $before_leave        }; # $before_leave
1284    
# Line 1068  sub _get_next_token ($) { Line 1315  sub _get_next_token ($) {
1315        
1316          redo A;          redo A;
1317        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1318            if ($self->{is_xml}) {
1319              
1320              ## XML5: Not a parse error.
1321              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1322            } else {
1323              
1324            }
1325    
1326          $before_leave->();          $before_leave->();
1327          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1328                        
# Line 1082  sub _get_next_token ($) { Line 1337  sub _get_next_token ($) {
1337            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1338          }          }
1339          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1340            $self->{s_kwd} = '';
1341                    
1342      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1343        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1100  sub _get_next_token ($) { Line 1356  sub _get_next_token ($) {
1356        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
1357                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1358                    
1359          $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);          $self->{ca}->{name}
1360                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1361          ## Stay in the state          ## Stay in the state
1362                    
1363      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1115  sub _get_next_token ($) { Line 1372  sub _get_next_token ($) {
1372        
1373          redo A;          redo A;
1374        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1375            if ($self->{is_xml}) {
1376              
1377              ## XML5: Not a parse error.
1378              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1379            } else {
1380              
1381            }
1382                    
1383          $before_leave->();          $before_leave->();
1384          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
# Line 1149  sub _get_next_token ($) { Line 1413  sub _get_next_token ($) {
1413            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1414          }          }
1415          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1416            $self->{s_kwd} = '';
1417          # reconsume          # reconsume
1418    
1419          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1158  sub _get_next_token ($) { Line 1423  sub _get_next_token ($) {
1423          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1424              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1425                        
1426              ## XML5: Not a parse error.
1427            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1428          } else {          } else {
1429                        
# Line 1178  sub _get_next_token ($) { Line 1444  sub _get_next_token ($) {
1444          redo A;          redo A;
1445        }        }
1446      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1447          ## XML5: "Tag attribute name after state".
1448          
1449        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1450                    
1451          ## Stay in the state          ## Stay in the state
# Line 1209  sub _get_next_token ($) { Line 1477  sub _get_next_token ($) {
1477        
1478          redo A;          redo A;
1479        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1480            if ($self->{is_xml}) {
1481              
1482              ## XML5: Not a parse error.
1483              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1484            } else {
1485              
1486            }
1487    
1488          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1489                        
1490            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1225  sub _get_next_token ($) { Line 1501  sub _get_next_token ($) {
1501            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1502          }          }
1503          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1504            $self->{s_kwd} = '';
1505                    
1506      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1507        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1244  sub _get_next_token ($) { Line 1521  sub _get_next_token ($) {
1521                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1522                    
1523          $self->{ca}          $self->{ca}
1524              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1525                 value => '',                 value => '',
1526                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1527          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 1261  sub _get_next_token ($) { Line 1538  sub _get_next_token ($) {
1538        
1539          redo A;          redo A;
1540        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1541            if ($self->{is_xml}) {
1542              
1543              ## XML5: Not a parse error.
1544              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1545            } else {
1546              
1547            }
1548                    
1549          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1550                    
# Line 1292  sub _get_next_token ($) { Line 1576  sub _get_next_token ($) {
1576          } else {          } else {
1577            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1578          }          }
1579            $self->{s_kwd} = '';
1580          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1581          # reconsume          # reconsume
1582    
# Line 1299  sub _get_next_token ($) { Line 1584  sub _get_next_token ($) {
1584    
1585          redo A;          redo A;
1586        } else {        } else {
1587            if ($self->{is_xml}) {
1588              
1589              ## XML5: Not a parse error.
1590              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1591            } else {
1592              
1593            }
1594    
1595          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1596              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1597                        
1598              ## XML5: Not a parse error.
1599            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1600          } else {          } else {
1601                        
# Line 1325  sub _get_next_token ($) { Line 1619  sub _get_next_token ($) {
1619          redo A;                  redo A;        
1620        }        }
1621      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1622          ## XML5: "Tag attribute value before state".
1623    
1624        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1625                    
1626          ## Stay in the state          ## Stay in the state
# Line 1393  sub _get_next_token ($) { Line 1689  sub _get_next_token ($) {
1689            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1690          }          }
1691          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1692            $self->{s_kwd} = '';
1693                    
1694      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1695        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1426  sub _get_next_token ($) { Line 1723  sub _get_next_token ($) {
1723            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1724          }          }
1725          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1726            $self->{s_kwd} = '';
1727          ## reconsume          ## reconsume
1728    
1729          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1434  sub _get_next_token ($) { Line 1732  sub _get_next_token ($) {
1732        } else {        } else {
1733          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D) { # =
1734                        
1735              ## XML5: Not a parse error.
1736            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1737            } elsif ($self->{is_xml}) {
1738              
1739              ## XML5: No parse error.
1740              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1741          } else {          } else {
1742                        
1743          }          }
# Line 1454  sub _get_next_token ($) { Line 1757  sub _get_next_token ($) {
1757          redo A;          redo A;
1758        }        }
1759      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1760          ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1761          ## ATTLIST attribute value double quoted state".
1762          
1763        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1764                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1765          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1766              ## XML5: "DOCTYPE ATTLIST name after state".
1767              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1768              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1769            } else {
1770              
1771              ## XML5: "Tag attribute name before state".
1772              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1773            }
1774                    
1775      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1776        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1471  sub _get_next_token ($) { Line 1785  sub _get_next_token ($) {
1785          redo A;          redo A;
1786        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1787                    
1788            ## XML5: Not defined yet.
1789    
1790          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1791          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1792          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1495  sub _get_next_token ($) { Line 1811  sub _get_next_token ($) {
1811          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1812                        
1813            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1814    
1815              $self->{state} = DATA_STATE;
1816              $self->{s_kwd} = '';
1817              ## reconsume
1818              return  ($self->{ct}); # start tag
1819              redo A;
1820          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1821            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1822            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1504  sub _get_next_token ($) { Line 1826  sub _get_next_token ($) {
1826              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1827                            
1828            }            }
1829    
1830              $self->{state} = DATA_STATE;
1831              $self->{s_kwd} = '';
1832              ## reconsume
1833              return  ($self->{ct}); # end tag
1834              redo A;
1835            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1836              ## XML5: No parse error above; not defined yet.
1837              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1838              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1839              ## Reconsume.
1840              return  ($self->{ct}); # ATTLIST
1841              redo A;
1842          } else {          } else {
1843            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1844          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
1845        } else {        } else {
1846                    ## XML5 [ATTLIST]: Not defined yet.
1847            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1848              
1849              ## XML5: Not a parse error.
1850              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1851            } else {
1852              
1853            }
1854          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1855          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1856                                q["&],                                q["&<],
1857                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1858    
1859          ## Stay in the state          ## Stay in the state
# Line 1535  sub _get_next_token ($) { Line 1871  sub _get_next_token ($) {
1871          redo A;          redo A;
1872        }        }
1873      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1874          ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1875          ## ATTLIST attribute value single quoted state".
1876    
1877        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1878                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1879          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1880              ## XML5: "DOCTYPE ATTLIST name after state".
1881              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1882              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1883            } else {
1884              
1885              ## XML5: "Before attribute name state" (sic).
1886              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1887            }
1888                    
1889      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1890        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1552  sub _get_next_token ($) { Line 1899  sub _get_next_token ($) {
1899          redo A;          redo A;
1900        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1901                    
1902            ## XML5: Not defined yet.
1903    
1904          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1905          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1906          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1576  sub _get_next_token ($) { Line 1925  sub _get_next_token ($) {
1925          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1926                        
1927            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1928    
1929              $self->{state} = DATA_STATE;
1930              $self->{s_kwd} = '';
1931              ## reconsume
1932              return  ($self->{ct}); # start tag
1933              redo A;
1934          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1935            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1936            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1585  sub _get_next_token ($) { Line 1940  sub _get_next_token ($) {
1940              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1941                            
1942            }            }
1943    
1944              $self->{state} = DATA_STATE;
1945              $self->{s_kwd} = '';
1946              ## reconsume
1947              return  ($self->{ct}); # end tag
1948              redo A;
1949            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1950              ## XML5: No parse error above; not defined yet.
1951              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1952              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1953              ## Reconsume.
1954              return  ($self->{ct}); # ATTLIST
1955              redo A;
1956          } else {          } else {
1957            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1958          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
1959        } else {        } else {
1960                    ## XML5 [ATTLIST]: Not defined yet.
1961            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1962              
1963              ## XML5: Not a parse error.
1964              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1965            } else {
1966              
1967            }
1968          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1969          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1970                                q['&],                                q['&<],
1971                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1972    
1973          ## Stay in the state          ## Stay in the state
# Line 1616  sub _get_next_token ($) { Line 1985  sub _get_next_token ($) {
1985          redo A;          redo A;
1986        }        }
1987      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1988          ## XML5: "Tag attribute value unquoted state".
1989    
1990        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1991                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1992          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;            
1993              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1994              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
1995            } else {
1996              
1997              ## XML5: "Tag attribute name before state".
1998              $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1999            }
2000                    
2001      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2002        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1633  sub _get_next_token ($) { Line 2011  sub _get_next_token ($) {
2011          redo A;          redo A;
2012        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
2013                    
2014    
2015            ## XML5: Not defined yet.
2016    
2017          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
2018          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
2019          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1656  sub _get_next_token ($) { Line 2037  sub _get_next_token ($) {
2037          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2038                        
2039            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2040    
2041              $self->{state} = DATA_STATE;
2042              $self->{s_kwd} = '';
2043              
2044        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2045          $self->{line_prev} = $self->{line};
2046          $self->{column_prev} = $self->{column};
2047          $self->{column}++;
2048          $self->{nc}
2049              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2050        } else {
2051          $self->{set_nc}->($self);
2052        }
2053      
2054              return  ($self->{ct}); # start tag
2055              redo A;
2056          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2057            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2058            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1665  sub _get_next_token ($) { Line 2062  sub _get_next_token ($) {
2062              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2063                            
2064            }            }
2065          } else {  
2066            die "$0: $self->{ct}->{type}: Unknown token type";            $self->{state} = DATA_STATE;
2067          }            $self->{s_kwd} = '';
2068          $self->{state} = DATA_STATE;            
           
2069      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2070        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
2071        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 1680  sub _get_next_token ($) { Line 2076  sub _get_next_token ($) {
2076        $self->{set_nc}->($self);        $self->{set_nc}->($self);
2077      }      }
2078        
2079              return  ($self->{ct}); # end tag
2080          return  ($self->{ct}); # start tag or end tag            redo A;
2081            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2082          redo A;            push @{$self->{ct}->{attrdefs}}, $self->{ca};
2083              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2084              
2085        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2086          $self->{line_prev} = $self->{line};
2087          $self->{column_prev} = $self->{column};
2088          $self->{column}++;
2089          $self->{nc}
2090              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2091        } else {
2092          $self->{set_nc}->($self);
2093        }
2094      
2095              return  ($self->{ct}); # ATTLIST
2096              redo A;
2097            } else {
2098              die "$0: $self->{ct}->{type}: Unknown token type";
2099            }
2100        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');  
2101          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2102                        
2103              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2104            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2105    
2106              $self->{state} = DATA_STATE;
2107              $self->{s_kwd} = '';
2108              ## reconsume
2109              return  ($self->{ct}); # start tag
2110              redo A;
2111          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2112              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2113            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2114            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
2115                            
# Line 1698  sub _get_next_token ($) { Line 2118  sub _get_next_token ($) {
2118              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2119                            
2120            }            }
2121    
2122              $self->{state} = DATA_STATE;
2123              $self->{s_kwd} = '';
2124              ## reconsume
2125              return  ($self->{ct}); # end tag
2126              redo A;
2127            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2128              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2129              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2130              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2131              ## Reconsume.
2132              return  ($self->{ct}); # ATTLIST
2133              redo A;
2134          } else {          } else {
2135            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2136          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
2137        } else {        } else {
2138          if ({          if ({
2139               0x0022 => 1, # "               0x0022 => 1, # "
# Line 1714  sub _get_next_token ($) { Line 2141  sub _get_next_token ($) {
2141               0x003D => 1, # =               0x003D => 1, # =
2142              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2143                        
2144              ## XML5: Not a parse error.
2145            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2146          } else {          } else {
2147                        
# Line 1770  sub _get_next_token ($) { Line 2198  sub _get_next_token ($) {
2198            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2199          }          }
2200          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2201            $self->{s_kwd} = '';
2202                    
2203      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2204        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1817  sub _get_next_token ($) { Line 2246  sub _get_next_token ($) {
2246            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2247          }          }
2248          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2249            $self->{s_kwd} = '';
2250          ## Reconsume.          ## Reconsume.
2251          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
2252          redo A;          redo A;
# Line 1828  sub _get_next_token ($) { Line 2258  sub _get_next_token ($) {
2258          redo A;          redo A;
2259        }        }
2260      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2261          ## XML5: "Empty tag state".
2262    
2263        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2264          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
2265                        
# Line 1847  sub _get_next_token ($) { Line 2279  sub _get_next_token ($) {
2279          }          }
2280    
2281          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2282            $self->{s_kwd} = '';
2283                    
2284      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2285        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1878  sub _get_next_token ($) { Line 2311  sub _get_next_token ($) {
2311          } else {          } else {
2312            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2313          }          }
2314            ## XML5: "Tag attribute name before state".
2315          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2316            $self->{s_kwd} = '';
2317          ## Reconsume.          ## Reconsume.
2318          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
2319          redo A;          redo A;
# Line 1891  sub _get_next_token ($) { Line 2326  sub _get_next_token ($) {
2326          redo A;          redo A;
2327        }        }
2328      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2329        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2330    
2331        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
2332        ## consumes characters one-by-one basis.        ## consumes characters one-by-one basis.
2333                
2334        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2335                    if ($self->{in_subset}) {
2336          $self->{state} = DATA_STATE;            
2337              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2338            } else {
2339              
2340              $self->{state} = DATA_STATE;
2341              $self->{s_kwd} = '';
2342            }
2343                    
2344      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2345        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1914  sub _get_next_token ($) { Line 2355  sub _get_next_token ($) {
2355          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
2356          redo A;          redo A;
2357        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2358                    if ($self->{in_subset}) {
2359          $self->{state} = DATA_STATE;            
2360              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2361            } else {
2362              
2363              $self->{state} = DATA_STATE;
2364              $self->{s_kwd} = '';
2365            }
2366          ## reconsume          ## reconsume
2367    
2368          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 1942  sub _get_next_token ($) { Line 2389  sub _get_next_token ($) {
2389          redo A;          redo A;
2390        }        }
2391      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2392        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
2393                
2394        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2395                    
# Line 1964  sub _get_next_token ($) { Line 2411  sub _get_next_token ($) {
2411          ## ASCII case-insensitive.          ## ASCII case-insensitive.
2412                    
2413          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
2414          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2415                    
2416      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2417        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1977  sub _get_next_token ($) { Line 2424  sub _get_next_token ($) {
2424      }      }
2425        
2426          redo A;          redo A;
2427        } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2428                 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and                   $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2429                    $self->{is_xml}) and
2430                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2431                                                    
2432          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
2433          $self->{s_kwd} = '[';          $self->{kwd} = '[';
2434                    
2435      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2436        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2016  sub _get_next_token ($) { Line 2464  sub _get_next_token ($) {
2464                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2465                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
2466                                   };                                   };
2467          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2468                    
2469      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2470        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2052  sub _get_next_token ($) { Line 2500  sub _get_next_token ($) {
2500              0x0054, # T              0x0054, # T
2501              0x0059, # Y              0x0059, # Y
2502              0x0050, # P              0x0050, # P
2503            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2504            $self->{nc} == [            $self->{nc} == [
2505              undef,              undef,
2506              0x006F, # o              0x006F, # o
# Line 2060  sub _get_next_token ($) { Line 2508  sub _get_next_token ($) {
2508              0x0074, # t              0x0074, # t
2509              0x0079, # y              0x0079, # y
2510              0x0070, # p              0x0070, # p
2511            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2512                    
2513          ## Stay in the state.          ## Stay in the state.
2514          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2515                    
2516      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2517        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2076  sub _get_next_token ($) { Line 2524  sub _get_next_token ($) {
2524      }      }
2525        
2526          redo A;          redo A;
2527        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
2528                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2529                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2530                    if ($self->{is_xml} and
2531                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2532              
2533              ## XML5: case-sensitive.
2534              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2535                              text => 'DOCTYPE',
2536                              line => $self->{line_prev},
2537                              column => $self->{column_prev} - 5);
2538            } else {
2539              
2540            }
2541          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
2542          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
2543                                    quirks => 1,                                    quirks => 1,
# Line 2102  sub _get_next_token ($) { Line 2560  sub _get_next_token ($) {
2560                                    
2561          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2562                          line => $self->{line_prev},                          line => $self->{line_prev},
2563                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2564          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2565          ## Reconsume.          ## Reconsume.
2566          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2567                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2568                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2569                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2570                                   };                                   };
2571          redo A;          redo A;
2572        }        }
# Line 2119  sub _get_next_token ($) { Line 2577  sub _get_next_token ($) {
2577              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
2578              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
2579              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
2580            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
2581                    
2582          ## Stay in the state.          ## Stay in the state.
2583          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2584                    
2585      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2586        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2135  sub _get_next_token ($) { Line 2593  sub _get_next_token ($) {
2593      }      }
2594        
2595          redo A;          redo A;
2596        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
2597                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2598                    if ($self->{is_xml} and
2599                not $self->{tainted} and
2600                @{$self->{open_elements} or []} == 0) {
2601              
2602              $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2603                              line => $self->{line_prev},
2604                              column => $self->{column_prev} - 7);
2605              $self->{tainted} = 1;
2606            } else {
2607              
2608            }
2609    
2610          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
2611                                    data => '',                                    data => '',
2612                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 2159  sub _get_next_token ($) { Line 2628  sub _get_next_token ($) {
2628                    
2629          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2630                          line => $self->{line_prev},                          line => $self->{line_prev},
2631                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2632          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2633          ## Reconsume.          ## Reconsume.
2634          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2635                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2636                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2637                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2638                                   };                                   };
2639          redo A;          redo A;
2640        }        }
# Line 2186  sub _get_next_token ($) { Line 2655  sub _get_next_token ($) {
2655        
2656          redo A;          redo A;
2657        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2658          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2659          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2660              
2661              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2662            } else {
2663              
2664              $self->{state} = DATA_STATE;
2665              $self->{s_kwd} = '';
2666            }
2667                    
2668      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2669        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2205  sub _get_next_token ($) { Line 2680  sub _get_next_token ($) {
2680    
2681          redo A;          redo A;
2682        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2683          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2684          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2685              
2686              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2687            } else {
2688              
2689              $self->{state} = DATA_STATE;
2690              $self->{s_kwd} = '';
2691            }
2692          ## reconsume          ## reconsume
2693    
2694          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2248  sub _get_next_token ($) { Line 2729  sub _get_next_token ($) {
2729        
2730          redo A;          redo A;
2731        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2732          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2733          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2734              
2735              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2736            } else {
2737              
2738              $self->{state} = DATA_STATE;
2739              $self->{s_kwd} = '';
2740            }
2741                    
2742      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2743        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2267  sub _get_next_token ($) { Line 2754  sub _get_next_token ($) {
2754    
2755          redo A;          redo A;
2756        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2757          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2758          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2759              
2760              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2761            } else {
2762              
2763              $self->{state} = DATA_STATE;
2764              $self->{s_kwd} = '';
2765            }
2766          ## reconsume          ## reconsume
2767    
2768          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2294  sub _get_next_token ($) { Line 2787  sub _get_next_token ($) {
2787          redo A;          redo A;
2788        }        }
2789      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
2790          ## XML5: "Comment state" and "DOCTYPE comment state".
2791    
2792        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2793                    
2794          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
# Line 2310  sub _get_next_token ($) { Line 2805  sub _get_next_token ($) {
2805        
2806          redo A;          redo A;
2807        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2808          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2809          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2810              
2811              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2812            } else {
2813              
2814              $self->{state} = DATA_STATE;
2815              $self->{s_kwd} = '';
2816            }
2817          ## reconsume          ## reconsume
2818    
2819          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2340  sub _get_next_token ($) { Line 2841  sub _get_next_token ($) {
2841          redo A;          redo A;
2842        }        }
2843      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2844          ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2845    
2846        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2847                    
2848          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 2356  sub _get_next_token ($) { Line 2859  sub _get_next_token ($) {
2859        
2860          redo A;          redo A;
2861        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2862          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2863          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2864              
2865              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2866            } else {
2867              
2868              $self->{state} = DATA_STATE;
2869              $self->{s_kwd} = '';
2870            }
2871          ## reconsume          ## reconsume
2872    
2873          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2382  sub _get_next_token ($) { Line 2891  sub _get_next_token ($) {
2891          redo A;          redo A;
2892        }        }
2893      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
2894          ## XML5: "Comment end state" and "DOCTYPE comment end state".
2895    
2896        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2897                    if ($self->{in_subset}) {
2898          $self->{state} = DATA_STATE;            
2899              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2900            } else {
2901              
2902              $self->{state} = DATA_STATE;
2903              $self->{s_kwd} = '';
2904            }
2905                    
2906      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2907        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2402  sub _get_next_token ($) { Line 2919  sub _get_next_token ($) {
2919          redo A;          redo A;
2920        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2921                    
2922            ## XML5: Not a parse error.
2923          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2924                          line => $self->{line_prev},                          line => $self->{line_prev},
2925                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2420  sub _get_next_token ($) { Line 2938  sub _get_next_token ($) {
2938        
2939          redo A;          redo A;
2940        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2941          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2942          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2943              
2944              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2945            } else {
2946              
2947              $self->{state} = DATA_STATE;
2948              $self->{s_kwd} = '';
2949            }
2950          ## reconsume          ## reconsume
2951    
2952          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2430  sub _get_next_token ($) { Line 2954  sub _get_next_token ($) {
2954          redo A;          redo A;
2955        } else {        } else {
2956                    
2957            ## XML5: Not a parse error.
2958          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2959                          line => $self->{line_prev},                          line => $self->{line_prev},
2960                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2466  sub _get_next_token ($) { Line 2991  sub _get_next_token ($) {
2991          redo A;          redo A;
2992        } else {        } else {
2993                    
2994            ## XML5: Unless EOF, switch to the bogus comment state.
2995          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2996          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2997          ## reconsume          ## reconsume
2998          redo A;          redo A;
2999        }        }
3000      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3001          ## XML5: "DOCTYPE root name before state".
3002    
3003        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3004                    
3005          ## Stay in the state          ## Stay in the state
# Line 2489  sub _get_next_token ($) { Line 3017  sub _get_next_token ($) {
3017          redo A;          redo A;
3018        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3019                    
3020            ## XML5: No parse error.
3021          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3022          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3023            $self->{s_kwd} = '';
3024                    
3025      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3026        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2510  sub _get_next_token ($) { Line 3040  sub _get_next_token ($) {
3040                    
3041          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3042          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3043            $self->{s_kwd} = '';
3044          ## reconsume          ## reconsume
3045    
3046          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3047    
3048          redo A;          redo A;
3049          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3050            
3051            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3052            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3053            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3054            $self->{in_subset} = 1;
3055            
3056        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3057          $self->{line_prev} = $self->{line};
3058          $self->{column_prev} = $self->{column};
3059          $self->{column}++;
3060          $self->{nc}
3061              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3062        } else {
3063          $self->{set_nc}->($self);
3064        }
3065      
3066            return  ($self->{ct}); # DOCTYPE
3067            redo A;
3068        } else {        } else {
3069                    
3070          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 2534  sub _get_next_token ($) { Line 3084  sub _get_next_token ($) {
3084          redo A;          redo A;
3085        }        }
3086      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3087  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
3088    
3089          ## ISSUE: Redundant "First," in the spec.
3090    
3091        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3092                    
3093          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 2553  sub _get_next_token ($) { Line 3106  sub _get_next_token ($) {
3106        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3107                    
3108          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3109            $self->{s_kwd} = '';
3110                    
3111      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3112        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2572  sub _get_next_token ($) { Line 3126  sub _get_next_token ($) {
3126                    
3127          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3128          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3129            $self->{s_kwd} = '';
3130          ## reconsume          ## reconsume
3131    
3132          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3133          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3134    
3135          redo A;          redo A;
3136          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3137            
3138            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3139            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3140            $self->{in_subset} = 1;
3141            
3142        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3143          $self->{line_prev} = $self->{line};
3144          $self->{column_prev} = $self->{column};
3145          $self->{column}++;
3146          $self->{nc}
3147              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3148        } else {
3149          $self->{set_nc}->($self);
3150        }
3151      
3152            return  ($self->{ct}); # DOCTYPE
3153            redo A;
3154        } else {        } else {
3155                    
3156          $self->{ct}->{name}          $self->{ct}->{name}
# Line 2597  sub _get_next_token ($) { Line 3170  sub _get_next_token ($) {
3170          redo A;          redo A;
3171        }        }
3172      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3173          ## XML5: Corresponding to XML5's "DOCTYPE root name after
3174          ## state", but implemented differently.
3175    
3176        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3177                    
3178          ## Stay in the state          ## Stay in the state
# Line 2613  sub _get_next_token ($) { Line 3189  sub _get_next_token ($) {
3189        
3190          redo A;          redo A;
3191        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3192            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3193              
3194              $self->{state} = DATA_STATE;
3195              $self->{s_kwd} = '';
3196            } else {
3197              
3198              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3199              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3200            }
3201                    
         $self->{state} = DATA_STATE;  
3202                    
3203      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3204        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2626  sub _get_next_token ($) { Line 3210  sub _get_next_token ($) {
3210        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3211      }      }
3212        
3213            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3214          redo A;          redo A;
3215        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3216            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3217              
3218              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3219              $self->{state} = DATA_STATE;
3220              $self->{s_kwd} = '';
3221              $self->{ct}->{quirks} = 1;
3222            } else {
3223              
3224              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3225              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3226            }
3227                    
3228          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          ## Reconsume.
3229          $self->{state} = DATA_STATE;          return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         ## reconsume  
   
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3230          redo A;          redo A;
3231        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
3232                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
3233            
3234          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
3235          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3236                    
3237      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3238        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2658  sub _get_next_token ($) { Line 3247  sub _get_next_token ($) {
3247          redo A;          redo A;
3248        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
3249                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
3250            
3251          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
3252          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3253                    
3254      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3255        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2672  sub _get_next_token ($) { Line 3262  sub _get_next_token ($) {
3262      }      }
3263        
3264          redo A;          redo A;
3265        } else {  ## TODO: " and ' for ENTITY
3266          } elsif ($self->{is_xml} and
3267                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3268                   $self->{nc} == 0x005B) { # [
3269                    
3270          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3271          $self->{ct}->{quirks} = 1;          $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3272            $self->{in_subset} = 1;
3273            
3274        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3275          $self->{line_prev} = $self->{line};
3276          $self->{column_prev} = $self->{column};
3277          $self->{column}++;
3278          $self->{nc}
3279              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3280        } else {
3281          $self->{set_nc}->($self);
3282        }
3283      
3284            return  ($self->{ct}); # DOCTYPE
3285            redo A;
3286          } else {
3287            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3288    
3289            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3290              
3291              $self->{ct}->{quirks} = 1;
3292              $self->{state} = BOGUS_DOCTYPE_STATE;
3293            } else {
3294              
3295              $self->{state} = BOGUS_MD_STATE;
3296            }
3297    
         $self->{state} = BOGUS_DOCTYPE_STATE;  
3298                    
3299      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3300        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2699  sub _get_next_token ($) { Line 3316  sub _get_next_token ($) {
3316              0x0042, # B              0x0042, # B
3317              0x004C, # L              0x004C, # L
3318              0x0049, # I              0x0049, # I
3319            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3320            $self->{nc} == [            $self->{nc} == [
3321              undef,              undef,
3322              0x0075, # u              0x0075, # u
3323              0x0062, # b              0x0062, # b
3324              0x006C, # l              0x006C, # l
3325              0x0069, # i              0x0069, # i
3326            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3327                    
3328          ## Stay in the state.          ## Stay in the state.
3329          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3330                    
3331      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3332        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2722  sub _get_next_token ($) { Line 3339  sub _get_next_token ($) {
3339      }      }
3340        
3341          redo A;          redo A;
3342        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3343                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
3344                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
3345                    if ($self->{is_xml} and
3346                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3347              
3348              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3349                              text => 'PUBLIC',
3350                              line => $self->{line_prev},
3351                              column => $self->{column_prev} - 4);
3352            } else {
3353              
3354            }
3355          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3356                    
3357      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2740  sub _get_next_token ($) { Line 3366  sub _get_next_token ($) {
3366        
3367          redo A;          redo A;
3368        } else {        } else {
3369                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3370                          line => $self->{line_prev},                          line => $self->{line_prev},
3371                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3372          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3373              
3374          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3375              $self->{state} = BOGUS_DOCTYPE_STATE;
3376            } else {
3377              
3378              $self->{state} = BOGUS_MD_STATE;
3379            }
3380          ## Reconsume.          ## Reconsume.
3381          redo A;          redo A;
3382        }        }
# Line 2758  sub _get_next_token ($) { Line 3388  sub _get_next_token ($) {
3388              0x0053, # S              0x0053, # S
3389              0x0054, # T              0x0054, # T
3390              0x0045, # E              0x0045, # E
3391            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3392            $self->{nc} == [            $self->{nc} == [
3393              undef,              undef,
3394              0x0079, # y              0x0079, # y
3395              0x0073, # s              0x0073, # s
3396              0x0074, # t              0x0074, # t
3397              0x0065, # e              0x0065, # e
3398            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3399                    
3400          ## Stay in the state.          ## Stay in the state.
3401          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3402                    
3403      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3404        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2781  sub _get_next_token ($) { Line 3411  sub _get_next_token ($) {
3411      }      }
3412        
3413          redo A;          redo A;
3414        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3415                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
3416                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
3417                    if ($self->{is_xml} and
3418                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3419              
3420              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3421                              text => 'SYSTEM',
3422                              line => $self->{line_prev},
3423                              column => $self->{column_prev} - 4);
3424            } else {
3425              
3426            }
3427          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3428                    
3429      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2799  sub _get_next_token ($) { Line 3438  sub _get_next_token ($) {
3438        
3439          redo A;          redo A;
3440        } else {        } else {
3441                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3442                          line => $self->{line_prev},                          line => $self->{line_prev},
3443                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3444          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3445              
3446          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3447              $self->{state} = BOGUS_DOCTYPE_STATE;
3448            } else {
3449              
3450              $self->{state} = BOGUS_MD_STATE;
3451            }
3452          ## Reconsume.          ## Reconsume.
3453          redo A;          redo A;
3454        }        }
# Line 2858  sub _get_next_token ($) { Line 3501  sub _get_next_token ($) {
3501        
3502          redo A;          redo A;
3503        } elsif ($self->{nc} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
           
3504          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3505            
3506          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3507              
3508              $self->{state} = DATA_STATE;
3509              $self->{s_kwd} = '';
3510              $self->{ct}->{quirks} = 1;
3511            } else {
3512              
3513              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3514            }
3515            
3516                    
3517      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3518        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2873  sub _get_next_token ($) { Line 3524  sub _get_next_token ($) {
3524        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3525      }      }
3526        
3527            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3528          redo A;          redo A;
3529        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3530            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3531              
3532              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3533              $self->{state} = DATA_STATE;
3534              $self->{s_kwd} = '';
3535              $self->{ct}->{quirks} = 1;
3536            } else {
3537              
3538              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3539              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3540            }
3541                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
3542          ## reconsume          ## reconsume
   
         $self->{ct}->{quirks} = 1;  
3543          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3544          redo A;          redo A;
3545        } else {        } elsif ($self->{is_xml} and
3546                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3547                   $self->{nc} == 0x005B) { # [
3548            
3549            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3550            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3551            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3552            $self->{in_subset} = 1;
3553                    
3554        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3555          $self->{line_prev} = $self->{line};
3556          $self->{column_prev} = $self->{column};
3557          $self->{column}++;
3558          $self->{nc}
3559              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3560        } else {
3561          $self->{set_nc}->($self);
3562        }
3563      
3564            return  ($self->{ct}); # DOCTYPE
3565            redo A;
3566          } else {
3567          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
         $self->{ct}->{quirks} = 1;  
3568    
3569          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3570              
3571              $self->{ct}->{quirks} = 1;
3572              $self->{state} = BOGUS_DOCTYPE_STATE;
3573            } else {
3574              
3575              $self->{state} = BOGUS_MD_STATE;
3576            }
3577    
3578                    
3579      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3580        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2925  sub _get_next_token ($) { Line 3605  sub _get_next_token ($) {
3605        
3606          redo A;          redo A;
3607        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3608          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3609    
3610          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3611              
3612              $self->{state} = DATA_STATE;
3613              $self->{s_kwd} = '';
3614              $self->{ct}->{quirks} = 1;
3615            } else {
3616              
3617              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3618            }
3619    
3620                    
3621      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3622        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2940  sub _get_next_token ($) { Line 3628  sub _get_next_token ($) {
3628        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3629      }      }
3630        
3631            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3632          redo A;          redo A;
3633        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3634          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3635    
3636          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3637          ## reconsume            
3638              $self->{state} = DATA_STATE;
3639          $self->{ct}->{quirks} = 1;            $self->{s_kwd} = '';
3640              $self->{ct}->{quirks} = 1;
3641            } else {
3642              
3643              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3644            }
3645            
3646            ## Reconsume.
3647          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3648          redo A;          redo A;
3649        } else {        } else {
3650                    
3651          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3652          $self->{read_until}->($self->{ct}->{pubid}, q[">],          $self->{read_until}->($self->{ct}->{pubid}, q[">],
3653                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3654    
# Line 2994  sub _get_next_token ($) { Line 3683  sub _get_next_token ($) {
3683        
3684          redo A;          redo A;
3685        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3686          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3687    
3688          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3689              
3690              $self->{state} = DATA_STATE;
3691              $self->{s_kwd} = '';
3692              $self->{ct}->{quirks} = 1;
3693            } else {
3694              
3695              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3696            }
3697    
3698                    
3699      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3700        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3009  sub _get_next_token ($) { Line 3706  sub _get_next_token ($) {
3706        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3707      }      }
3708        
3709            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3710          redo A;          redo A;
3711        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3712          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3713    
3714          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3715              
3716              $self->{state} = DATA_STATE;
3717              $self->{s_kwd} = '';
3718              $self->{ct}->{quirks} = 1;
3719            } else {
3720              
3721              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3722            }
3723          
3724          ## reconsume          ## reconsume
3725            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3726          redo A;          redo A;
3727        } else {        } else {
3728                    
3729          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3730          $self->{read_until}->($self->{ct}->{pubid}, q['>],          $self->{read_until}->($self->{ct}->{pubid}, q['>],
3731                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3732    
# Line 3064  sub _get_next_token ($) { Line 3762  sub _get_next_token ($) {
3762          redo A;          redo A;
3763        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
3764                    
3765          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3766          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3767                    
3768      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3080  sub _get_next_token ($) { Line 3778  sub _get_next_token ($) {
3778          redo A;          redo A;
3779        } elsif ($self->{nc} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
3780                    
3781          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3782          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3783                    
3784      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3095  sub _get_next_token ($) { Line 3793  sub _get_next_token ($) {
3793        
3794          redo A;          redo A;
3795        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3796            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3797              if ($self->{is_xml}) {
3798                
3799                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3800              } else {
3801                
3802              }
3803              $self->{state} = DATA_STATE;
3804              $self->{s_kwd} = '';
3805            } else {
3806              if ($self->{ct}->{type} == NOTATION_TOKEN) {
3807                
3808              } else {
3809                
3810                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');            
3811              }
3812              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3813            }
3814                    
         $self->{state} = DATA_STATE;  
3815                    
3816      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3817        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3108  sub _get_next_token ($) { Line 3823  sub _get_next_token ($) {
3823        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3824      }      }
3825        
3826            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3827          redo A;          redo A;
3828        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3829            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3830              
3831              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3832              
3833              $self->{state} = DATA_STATE;
3834              $self->{s_kwd} = '';
3835              $self->{ct}->{quirks} = 1;
3836            } else {
3837              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3838              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3839            }
3840                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
3841          ## reconsume          ## reconsume
3842            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3843          $self->{ct}->{quirks} = 1;          redo A;
3844          } elsif ($self->{is_xml} and
3845                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3846                   $self->{nc} == 0x005B) { # [
3847            
3848            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3849            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3850            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3851            $self->{in_subset} = 1;
3852            
3853        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3854          $self->{line_prev} = $self->{line};
3855          $self->{column_prev} = $self->{column};
3856          $self->{column}++;
3857          $self->{nc}
3858              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3859        } else {
3860          $self->{set_nc}->($self);
3861        }
3862      
3863          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3864          redo A;          redo A;
3865        } else {        } else {
           
3866          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
         $self->{ct}->{quirks} = 1;  
3867    
3868          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3869              
3870              $self->{ct}->{quirks} = 1;
3871              $self->{state} = BOGUS_DOCTYPE_STATE;
3872            } else {
3873              
3874              $self->{state} = BOGUS_MD_STATE;
3875            }
3876    
3877                    
3878      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3879        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3191  sub _get_next_token ($) { Line 3936  sub _get_next_token ($) {
3936        
3937          redo A;          redo A;
3938        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3939          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
         $self->{state} = DATA_STATE;  
3940                    
3941      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3942        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3206  sub _get_next_token ($) { Line 3949  sub _get_next_token ($) {
3949      }      }
3950        
3951    
3952          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3953          return  ($self->{ct}); # DOCTYPE            
3954              $self->{state} = DATA_STATE;
3955              $self->{s_kwd} = '';
3956              $self->{ct}->{quirks} = 1;
3957            } else {
3958              
3959              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3960            }
3961    
3962            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3963          redo A;          redo A;
3964        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3965            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3966              
3967              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3968              $self->{state} = DATA_STATE;
3969              $self->{s_kwd} = '';
3970              $self->{ct}->{quirks} = 1;
3971            } else {
3972              
3973              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3974              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3975            }
3976                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
3977          ## reconsume          ## reconsume
3978            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3979            redo A;
3980          } elsif ($self->{is_xml} and
3981                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3982                   $self->{nc} == 0x005B) { # [
3983            
3984            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3985    
3986          $self->{ct}->{quirks} = 1;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3987            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3988            $self->{in_subset} = 1;
3989            
3990        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3991          $self->{line_prev} = $self->{line};
3992          $self->{column_prev} = $self->{column};
3993          $self->{column}++;
3994          $self->{nc}
3995              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3996        } else {
3997          $self->{set_nc}->($self);
3998        }
3999      
4000          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
4001          redo A;          redo A;
4002        } else {        } else {
           
4003          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
         $self->{ct}->{quirks} = 1;  
4004    
4005          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4006                        
4007              $self->{ct}->{quirks} = 1;
4008              $self->{state} = BOGUS_DOCTYPE_STATE;
4009            } else {
4010              
4011              $self->{state} = BOGUS_MD_STATE;
4012            }
4013    
4014                    
4015      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4016        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3256  sub _get_next_token ($) { Line 4040  sub _get_next_token ($) {
4040      }      }
4041        
4042          redo A;          redo A;
4043        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
           
4044          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4045    
4046          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4047              
4048              $self->{state} = DATA_STATE;
4049              $self->{s_kwd} = '';
4050              $self->{ct}->{quirks} = 1;
4051            } else {
4052              
4053              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4054            }
4055            
4056                    
4057      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4058        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3272  sub _get_next_token ($) { Line 4064  sub _get_next_token ($) {
4064        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4065      }      }
4066        
4067            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4068          redo A;          redo A;
4069        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4070          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4071    
4072          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4073              
4074              $self->{state} = DATA_STATE;
4075              $self->{s_kwd} = '';
4076              $self->{ct}->{quirks} = 1;
4077            } else {
4078              
4079              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4080            }
4081            
4082          ## reconsume          ## reconsume
4083            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4084          redo A;          redo A;
4085        } else {        } else {
4086                    
4087          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4088          $self->{read_until}->($self->{ct}->{sysid}, q[">],          $self->{read_until}->($self->{ct}->{sysid}, q[">],
4089                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4090    
# Line 3325  sub _get_next_token ($) { Line 4118  sub _get_next_token ($) {
4118      }      }
4119        
4120          redo A;          redo A;
4121        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4122                    
4123          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4124    
4125          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4126            $self->{s_kwd} = '';
4127                    
4128      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4129        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3347  sub _get_next_token ($) { Line 4141  sub _get_next_token ($) {
4141    
4142          redo A;          redo A;
4143        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4144          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4145    
4146          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4147          ## reconsume            
4148              $self->{state} = DATA_STATE;
4149          $self->{ct}->{quirks} = 1;            $self->{s_kwd} = '';
4150          return  ($self->{ct}); # DOCTYPE            $self->{ct}->{quirks} = 1;
4151            } else {
4152              
4153              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4154            }
4155    
4156            ## reconsume
4157            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4158          redo A;          redo A;
4159        } else {        } else {
4160                    
4161          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4162          $self->{read_until}->($self->{ct}->{sysid}, q['>],          $self->{read_until}->($self->{ct}->{sysid}, q['>],
4163                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4164    
# Line 3380  sub _get_next_token ($) { Line 4178  sub _get_next_token ($) {
4178        }        }
4179      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4180        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4181                    if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4182          ## Stay in the state            
4183              $self->{state} = BEFORE_NDATA_STATE;
4184            } else {
4185              
4186              ## Stay in the state
4187            }
4188                    
4189      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4190        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3395  sub _get_next_token ($) { Line 4198  sub _get_next_token ($) {
4198        
4199          redo A;          redo A;
4200        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4201            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4202              
4203              $self->{state} = DATA_STATE;
4204              $self->{s_kwd} = '';
4205            } else {
4206              
4207              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4208            }
4209    
4210                    
4211          $self->{state} = DATA_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4212          $self->{line_prev} = $self->{line};
4213          $self->{column_prev} = $self->{column};
4214          $self->{column}++;
4215          $self->{nc}
4216              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4217        } else {
4218          $self->{set_nc}->($self);
4219        }
4220      
4221            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4222            redo A;
4223          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4224                   ($self->{nc} == 0x004E or # N
4225                    $self->{nc} == 0x006E)) { # n
4226            
4227            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4228            $self->{state} = NDATA_STATE;
4229            $self->{kwd} = chr $self->{nc};
4230                    
4231      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4232        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3408  sub _get_next_token ($) { Line 4238  sub _get_next_token ($) {
4238        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4239      }      }
4240        
4241            redo A;
4242          } elsif ($self->{nc} == -1) {
4243            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4244              
4245              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4246              $self->{state} = DATA_STATE;
4247              $self->{s_kwd} = '';
4248              $self->{ct}->{quirks} = 1;
4249            } else {
4250              
4251              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4252              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4253            }
4254    
4255            ## reconsume
4256            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4257            redo A;
4258          } elsif ($self->{is_xml} and
4259                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4260                   $self->{nc} == 0x005B) { # [
4261            
4262            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4263            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4264            $self->{in_subset} = 1;
4265            
4266        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4267          $self->{line_prev} = $self->{line};
4268          $self->{column_prev} = $self->{column};
4269          $self->{column}++;
4270          $self->{nc}
4271              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4272        } else {
4273          $self->{set_nc}->($self);
4274        }
4275      
4276          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4277            redo A;
4278          } else {
4279            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4280    
4281            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4282              
4283              #$self->{ct}->{quirks} = 1;
4284              $self->{state} = BOGUS_DOCTYPE_STATE;
4285            } else {
4286              
4287              $self->{state} = BOGUS_MD_STATE;
4288            }
4289    
4290            
4291        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4292          $self->{line_prev} = $self->{line};
4293          $self->{column_prev} = $self->{column};
4294          $self->{column}++;
4295          $self->{nc}
4296              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4297        } else {
4298          $self->{set_nc}->($self);
4299        }
4300      
4301            redo A;
4302          }
4303        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4304          if ($is_space->{$self->{nc}}) {
4305            
4306            ## Stay in the state.
4307            
4308        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4309          $self->{line_prev} = $self->{line};
4310          $self->{column_prev} = $self->{column};
4311          $self->{column}++;
4312          $self->{nc}
4313              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4314        } else {
4315          $self->{set_nc}->($self);
4316        }
4317      
4318            redo A;
4319          } elsif ($self->{nc} == 0x003E) { # >
4320            
4321            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4322            
4323        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4324          $self->{line_prev} = $self->{line};
4325          $self->{column_prev} = $self->{column};
4326          $self->{column}++;
4327          $self->{nc}
4328              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4329        } else {
4330          $self->{set_nc}->($self);
4331        }
4332      
4333            return  ($self->{ct}); # ENTITY
4334            redo A;
4335          } elsif ($self->{nc} == 0x004E or # N
4336                   $self->{nc} == 0x006E) { # n
4337            
4338            $self->{state} = NDATA_STATE;
4339            $self->{kwd} = chr $self->{nc};
4340            
4341        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4342          $self->{line_prev} = $self->{line};
4343          $self->{column_prev} = $self->{column};
4344          $self->{column}++;
4345          $self->{nc}
4346              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4347        } else {
4348          $self->{set_nc}->($self);
4349        }
4350      
4351          redo A;          redo A;
4352        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4353                    
4354          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4355          $self->{state} = DATA_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4356          ## reconsume          ## reconsume
4357            return  ($self->{ct}); # ENTITY
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4358          redo A;          redo A;
4359        } else {        } else {
4360                    
4361          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4362          #$self->{ct}->{quirks} = 1;          $self->{state} = BOGUS_MD_STATE;
   
         $self->{state} = BOGUS_DOCTYPE_STATE;  
4363                    
4364      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4365        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3445  sub _get_next_token ($) { Line 4377  sub _get_next_token ($) {
4377        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4378                    
4379          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4380            $self->{s_kwd} = '';
4381                    
4382      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4383        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3460  sub _get_next_token ($) { Line 4393  sub _get_next_token ($) {
4393          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4394    
4395          redo A;          redo A;
4396          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4397            
4398            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4399            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4400            $self->{in_subset} = 1;
4401            
4402        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4403          $self->{line_prev} = $self->{line};
4404          $self->{column_prev} = $self->{column};
4405          $self->{column}++;
4406          $self->{nc}
4407              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4408        } else {
4409          $self->{set_nc}->($self);
4410        }
4411      
4412            return  ($self->{ct}); # DOCTYPE
4413            redo A;
4414        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4415                    
4416          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4417            $self->{s_kwd} = '';
4418          ## reconsume          ## reconsume
4419    
4420          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
# Line 3471  sub _get_next_token ($) { Line 4423  sub _get_next_token ($) {
4423        } else {        } else {
4424                    
4425          my $s = '';          my $s = '';
4426          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
4427    
4428          ## Stay in the state          ## Stay in the state
4429                    
# Line 3491  sub _get_next_token ($) { Line 4443  sub _get_next_token ($) {
4443        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
4444        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4445        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
4446    
4447          ## XML5: "CDATA state".
4448                
4449        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4450                    
# Line 3508  sub _get_next_token ($) { Line 4462  sub _get_next_token ($) {
4462        
4463          redo A;          redo A;
4464        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4465            if ($self->{is_xml}) {
4466              
4467              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4468            } else {
4469              
4470            }
4471    
4472          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4473                    $self->{s_kwd} = '';
4474      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {          ## Reconsume.
       $self->{line_prev} = $self->{line};  
       $self->{column_prev} = $self->{column};  
       $self->{column}++;  
       $self->{nc}  
           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);  
     } else {  
       $self->{set_nc}->($self);  
     }  
     
4475          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
4476                        
4477            return  ($self->{ct}); # character            return  ($self->{ct}); # character
# Line 3552  sub _get_next_token ($) { Line 4504  sub _get_next_token ($) {
4504    
4505        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
4506      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4507          ## XML5: "CDATA bracket state".
4508    
4509        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4510                    
4511          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 3569  sub _get_next_token ($) { Line 4523  sub _get_next_token ($) {
4523          redo A;          redo A;
4524        } else {        } else {
4525                    
4526            ## XML5: If EOF, "]" is not appended and changed to the data state.
4527          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
4528          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4529          ## Reconsume.          ## Reconsume.
4530          redo A;          redo A;
4531        }        }
4532      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4533          ## XML5: "CDATA end state".
4534    
4535        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4536          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4537            $self->{s_kwd} = '';
4538                    
4539      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4540        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3616  sub _get_next_token ($) { Line 4574  sub _get_next_token ($) {
4574                    
4575          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
4576          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
4577          ## Reconsume.          ## Reconsume. ## XML5: Emit.
4578          redo A;          redo A;
4579        }        }
4580      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 3633  sub _get_next_token ($) { Line 4591  sub _get_next_token ($) {
4591        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
4592                    
4593          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
4594          $self->{s_kwd} = '#';          $self->{kwd} = '#';
4595                    
4596      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4597        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3653  sub _get_next_token ($) { Line 4611  sub _get_next_token ($) {
4611                    
4612          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
4613          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
4614          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
4615          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
4616          $self->{entity__match} = 0;          $self->{entity__match} = 0;
4617                    
4618      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3684  sub _get_next_token ($) { Line 4642  sub _get_next_token ($) {
4642        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4643                    
4644          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4645            $self->{s_kwd} = '';
4646          ## Reconsume.          ## Reconsume.
4647          return  ({type => CHARACTER_TOKEN, data => '&',          return  ({type => CHARACTER_TOKEN, data => '&',
4648                    line => $self->{line_prev},                    line => $self->{line_prev},
# Line 3694  sub _get_next_token ($) { Line 4653  sub _get_next_token ($) {
4653                    
4654          $self->{ca}->{value} .= '&';          $self->{ca}->{value} .= '&';
4655          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4656            $self->{s_kwd} = '';
4657          ## Reconsume.          ## Reconsume.
4658          redo A;          redo A;
4659        }        }
# Line 3702  sub _get_next_token ($) { Line 4662  sub _get_next_token ($) {
4662            $self->{nc} == 0x0058) { # X            $self->{nc} == 0x0058) { # X
4663                    
4664          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
4665          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4666                    
4667      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4668        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3719  sub _get_next_token ($) { Line 4679  sub _get_next_token ($) {
4679                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
4680                    
4681          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
4682          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
4683                    
4684      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4685        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3744  sub _get_next_token ($) { Line 4704  sub _get_next_token ($) {
4704          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
4705                        
4706            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4707              $self->{s_kwd} = '';
4708            ## Reconsume.            ## Reconsume.
4709            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4710                      data => '&#',                      data => '&#',
# Line 3755  sub _get_next_token ($) { Line 4716  sub _get_next_token ($) {
4716                        
4717            $self->{ca}->{value} .= '&#';            $self->{ca}->{value} .= '&#';
4718            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4719              $self->{s_kwd} = '';
4720            ## Reconsume.            ## Reconsume.
4721            redo A;            redo A;
4722          }          }
# Line 3763  sub _get_next_token ($) { Line 4725  sub _get_next_token ($) {
4725        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
4726            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
4727                    
4728          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
4729          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4730                    
4731          ## Stay in the state.          ## Stay in the state.
4732                    
# Line 3800  sub _get_next_token ($) { Line 4762  sub _get_next_token ($) {
4762          #          #
4763        }        }
4764    
4765        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4766        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4767        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4768        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 3820  sub _get_next_token ($) { Line 4782  sub _get_next_token ($) {
4782        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4783                    
4784          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4785            $self->{s_kwd} = '';
4786          ## Reconsume.          ## Reconsume.
4787          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4788                      has_reference => 1,
4789                    line => $l, column => $c,                    line => $l, column => $c,
4790                   });                   });
4791          redo A;          redo A;
# Line 3830  sub _get_next_token ($) { Line 4794  sub _get_next_token ($) {
4794          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
4795          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
4796          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4797            $self->{s_kwd} = '';
4798          ## Reconsume.          ## Reconsume.
4799          redo A;          redo A;
4800        }        }
# Line 3840  sub _get_next_token ($) { Line 4805  sub _get_next_token ($) {
4805          # 0..9, A..F, a..f          # 0..9, A..F, a..f
4806                    
4807          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
4808          $self->{s_kwd} = 0;          $self->{kwd} = 0;
4809          ## Reconsume.          ## Reconsume.
4810          redo A;          redo A;
4811        } else {        } else {
# Line 3855  sub _get_next_token ($) { Line 4820  sub _get_next_token ($) {
4820          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
4821                        
4822            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4823              $self->{s_kwd} = '';
4824            ## Reconsume.            ## Reconsume.
4825            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4826                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
4827                      line => $self->{line_prev},                      line => $self->{line_prev},
4828                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
4829                     });                     });
4830            redo A;            redo A;
4831          } else {          } else {
4832                        
4833            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
4834            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4835              $self->{s_kwd} = '';
4836            ## Reconsume.            ## Reconsume.
4837            redo A;            redo A;
4838          }          }
# Line 3874  sub _get_next_token ($) { Line 4841  sub _get_next_token ($) {
4841        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4842          # 0..9          # 0..9
4843                    
4844          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4845          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4846          ## Stay in the state.          ## Stay in the state.
4847                    
4848      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3892  sub _get_next_token ($) { Line 4859  sub _get_next_token ($) {
4859        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
4860                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
4861                    
4862          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4863          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
4864          ## Stay in the state.          ## Stay in the state.
4865                    
4866      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3910  sub _get_next_token ($) { Line 4877  sub _get_next_token ($) {
4877        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
4878                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
4879                    
4880          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4881          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
4882          ## Stay in the state.          ## Stay in the state.
4883                    
4884      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3948  sub _get_next_token ($) { Line 4915  sub _get_next_token ($) {
4915          #          #
4916        }        }
4917    
4918        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4919        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4920        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4921        if ($charref_map->{$code}) {        if ($charref_map->{$code}) {
# Line 3968  sub _get_next_token ($) { Line 4935  sub _get_next_token ($) {
4935        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4936                    
4937          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4938            $self->{s_kwd} = '';
4939          ## Reconsume.          ## Reconsume.
4940          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4941                      has_reference => 1,
4942                    line => $l, column => $c,                    line => $l, column => $c,
4943                   });                   });
4944          redo A;          redo A;
# Line 3978  sub _get_next_token ($) { Line 4947  sub _get_next_token ($) {
4947          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
4948          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
4949          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4950            $self->{s_kwd} = '';
4951          ## Reconsume.          ## Reconsume.
4952          redo A;          redo A;
4953        }        }
4954      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
4955        if (length $self->{s_kwd} < 30 and        if (length $self->{kwd} < 30 and
4956            ## NOTE: Some number greater than the maximum length of entity name            ## NOTE: Some number greater than the maximum length of entity name
4957            ((0x0041 <= $self->{nc} and # a            ((0x0041 <= $self->{nc} and # a
4958              $self->{nc} <= 0x005A) or # x              $self->{nc} <= 0x005A) or # x
# Line 3992  sub _get_next_token ($) { Line 4962  sub _get_next_token ($) {
4962              $self->{nc} <= 0x0039) or # 9              $self->{nc} <= 0x0039) or # 9
4963             $self->{nc} == 0x003B)) { # ;             $self->{nc} == 0x003B)) { # ;
4964          our $EntityChar;          our $EntityChar;
4965          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4966          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}}) {
4967            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
4968                            
4969              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
4970              $self->{entity__match} = 1;              $self->{entity__match} = 1;
4971                            
4972      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4012  sub _get_next_token ($) { Line 4982  sub _get_next_token ($) {
4982              #              #
4983            } else {            } else {
4984                            
4985              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
4986              $self->{entity__match} = -1;              $self->{entity__match} = -1;
4987              ## Stay in the state.              ## Stay in the state.
4988                            
# Line 4060  sub _get_next_token ($) { Line 5030  sub _get_next_token ($) {
5030          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
5031              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
5032                        
5033            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
5034            #            #
5035          } else {          } else {
5036                        
# Line 4072  sub _get_next_token ($) { Line 5042  sub _get_next_token ($) {
5042                    
5043          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5044                          line => $self->{line_prev},                          line => $self->{line_prev},
5045                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
5046          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
5047          #          #
5048        }        }
5049        
# Line 4090  sub _get_next_token ($) { Line 5060  sub _get_next_token ($) {
5060        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
5061                    
5062          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
5063            $self->{s_kwd} = '';
5064          ## Reconsume.          ## Reconsume.
5065          return  ({type => CHARACTER_TOKEN,          return  ({type => CHARACTER_TOKEN,
5066                    data => $data,                    data => $data,
5067                      has_reference => $has_ref,
5068                    line => $self->{line_prev},                    line => $self->{line_prev},
5069                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
5070                   });                   });
5071          redo A;          redo A;
5072        } else {        } else {
# Line 4102  sub _get_next_token ($) { Line 5074  sub _get_next_token ($) {
5074          $self->{ca}->{value} .= $data;          $self->{ca}->{value} .= $data;
5075          $self->{ca}->{has_reference} = 1 if $has_ref;          $self->{ca}->{has_reference} = 1 if $has_ref;
5076          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
5077            $self->{s_kwd} = '';
5078            ## Reconsume.
5079            redo A;
5080          }
5081    
5082        ## XML-only states
5083    
5084        } elsif ($self->{state} == PI_STATE) {
5085          ## XML5: "Pi state" and "DOCTYPE pi state".
5086    
5087          if ($is_space->{$self->{nc}} or
5088              $self->{nc} == 0x003F or # ?
5089              $self->{nc} == -1) {
5090            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5091            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
5092            ## "DOCTYPE pi state": Parse error, switch to the "data
5093            ## state".
5094            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5095                            line => $self->{line_prev},
5096                            column => $self->{column_prev}
5097                                - 1 * ($self->{nc} != -1));
5098            $self->{state} = BOGUS_COMMENT_STATE;
5099            ## Reconsume.
5100            $self->{ct} = {type => COMMENT_TOKEN,
5101                           data => '?',
5102                           line => $self->{line_prev},
5103                           column => $self->{column_prev}
5104                               - 1 * ($self->{nc} != -1),
5105                          };
5106            redo A;
5107          } else {
5108            ## XML5: "DOCTYPE pi state": Stay in the state.
5109            $self->{ct} = {type => PI_TOKEN,
5110                           target => chr $self->{nc},
5111                           data => '',
5112                           line => $self->{line_prev},
5113                           column => $self->{column_prev} - 1,
5114                          };
5115            $self->{state} = PI_TARGET_STATE;
5116            
5117        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5118          $self->{line_prev} = $self->{line};
5119          $self->{column_prev} = $self->{column};
5120          $self->{column}++;
5121          $self->{nc}
5122              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5123        } else {
5124          $self->{set_nc}->($self);
5125        }
5126      
5127            redo A;
5128          }
5129        } elsif ($self->{state} == PI_TARGET_STATE) {
5130          if ($is_space->{$self->{nc}}) {
5131            $self->{state} = PI_TARGET_AFTER_STATE;
5132            
5133        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5134          $self->{line_prev} = $self->{line};
5135          $self->{column_prev} = $self->{column};
5136          $self->{column}++;
5137          $self->{nc}
5138              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5139        } else {
5140          $self->{set_nc}->($self);
5141        }
5142      
5143            redo A;
5144          } elsif ($self->{nc} == -1) {
5145            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5146            if ($self->{in_subset}) {
5147              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5148            } else {
5149              $self->{state} = DATA_STATE;
5150              $self->{s_kwd} = '';
5151            }
5152            ## Reconsume.
5153            return  ($self->{ct}); # pi
5154            redo A;
5155          } elsif ($self->{nc} == 0x003F) { # ?
5156            $self->{state} = PI_AFTER_STATE;
5157            
5158        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5159          $self->{line_prev} = $self->{line};
5160          $self->{column_prev} = $self->{column};
5161          $self->{column}++;
5162          $self->{nc}
5163              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5164        } else {
5165          $self->{set_nc}->($self);
5166        }
5167      
5168            redo A;
5169          } else {
5170            ## XML5: typo ("tag name" -> "target")
5171            $self->{ct}->{target} .= chr $self->{nc}; # pi
5172            
5173        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5174          $self->{line_prev} = $self->{line};
5175          $self->{column_prev} = $self->{column};
5176          $self->{column}++;
5177          $self->{nc}
5178              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5179        } else {
5180          $self->{set_nc}->($self);
5181        }
5182      
5183            redo A;
5184          }
5185        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5186          if ($is_space->{$self->{nc}}) {
5187            ## Stay in the state.
5188            
5189        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5190          $self->{line_prev} = $self->{line};
5191          $self->{column_prev} = $self->{column};
5192          $self->{column}++;
5193          $self->{nc}
5194              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5195        } else {
5196          $self->{set_nc}->($self);
5197        }
5198      
5199            redo A;
5200          } else {
5201            $self->{state} = PI_DATA_STATE;
5202            ## Reprocess.
5203            redo A;
5204          }
5205        } elsif ($self->{state} == PI_DATA_STATE) {
5206          if ($self->{nc} == 0x003F) { # ?
5207            $self->{state} = PI_DATA_AFTER_STATE;
5208            
5209        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5210          $self->{line_prev} = $self->{line};
5211          $self->{column_prev} = $self->{column};
5212          $self->{column}++;
5213          $self->{nc}
5214              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5215        } else {
5216          $self->{set_nc}->($self);
5217        }
5218      
5219            redo A;
5220          } elsif ($self->{nc} == -1) {
5221            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5222            if ($self->{in_subset}) {
5223              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5224            } else {
5225              $self->{state} = DATA_STATE;
5226              $self->{s_kwd} = '';
5227            }
5228            ## Reprocess.
5229            return  ($self->{ct}); # pi
5230            redo A;
5231          } else {
5232            $self->{ct}->{data} .= chr $self->{nc}; # pi
5233            $self->{read_until}->($self->{ct}->{data}, q[?],
5234                                  length $self->{ct}->{data});
5235            ## Stay in the state.
5236            
5237        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5238          $self->{line_prev} = $self->{line};
5239          $self->{column_prev} = $self->{column};
5240          $self->{column}++;
5241          $self->{nc}
5242              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5243        } else {
5244          $self->{set_nc}->($self);
5245        }
5246      
5247            ## Reprocess.
5248            redo A;
5249          }
5250        } elsif ($self->{state} == PI_AFTER_STATE) {
5251          ## XML5: Part of "Pi after state".
5252    
5253          if ($self->{nc} == 0x003E) { # >
5254            if ($self->{in_subset}) {
5255              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5256            } else {
5257              $self->{state} = DATA_STATE;
5258              $self->{s_kwd} = '';
5259            }
5260            
5261        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5262          $self->{line_prev} = $self->{line};
5263          $self->{column_prev} = $self->{column};
5264          $self->{column}++;
5265          $self->{nc}
5266              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5267        } else {
5268          $self->{set_nc}->($self);
5269        }
5270      
5271            return  ($self->{ct}); # pi
5272            redo A;
5273          } elsif ($self->{nc} == 0x003F) { # ?
5274            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5275                            line => $self->{line_prev},
5276                            column => $self->{column_prev}); ## XML5: no error
5277            $self->{ct}->{data} .= '?';
5278            $self->{state} = PI_DATA_AFTER_STATE;
5279            
5280        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5281          $self->{line_prev} = $self->{line};
5282          $self->{column_prev} = $self->{column};
5283          $self->{column}++;
5284          $self->{nc}
5285              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5286        } else {
5287          $self->{set_nc}->($self);
5288        }
5289      
5290            redo A;
5291          } else {
5292            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5293                            line => $self->{line_prev},
5294                            column => $self->{column_prev}
5295                                + 1 * ($self->{nc} == -1)); ## XML5: no error
5296            $self->{ct}->{data} .= '?'; ## XML5: not appended
5297            $self->{state} = PI_DATA_STATE;
5298            ## Reprocess.
5299            redo A;
5300          }
5301        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5302          ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5303    
5304          if ($self->{nc} == 0x003E) { # >
5305            if ($self->{in_subset}) {
5306              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5307            } else {
5308              $self->{state} = DATA_STATE;
5309              $self->{s_kwd} = '';
5310            }
5311            
5312        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5313          $self->{line_prev} = $self->{line};
5314          $self->{column_prev} = $self->{column};
5315          $self->{column}++;
5316          $self->{nc}
5317              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5318        } else {
5319          $self->{set_nc}->($self);
5320        }
5321      
5322            return  ($self->{ct}); # pi
5323            redo A;
5324          } elsif ($self->{nc} == 0x003F) { # ?
5325            $self->{ct}->{data} .= '?';
5326            ## Stay in the state.
5327            
5328        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5329          $self->{line_prev} = $self->{line};
5330          $self->{column_prev} = $self->{column};
5331          $self->{column}++;
5332          $self->{nc}
5333              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5334        } else {
5335          $self->{set_nc}->($self);
5336        }
5337      
5338            redo A;
5339          } else {
5340            $self->{ct}->{data} .= '?'; ## XML5: not appended
5341            $self->{state} = PI_DATA_STATE;
5342            ## Reprocess.
5343            redo A;
5344          }
5345    
5346        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5347          if ($self->{nc} == 0x003C) { # <
5348            $self->{state} = DOCTYPE_TAG_STATE;
5349            
5350        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5351          $self->{line_prev} = $self->{line};
5352          $self->{column_prev} = $self->{column};
5353          $self->{column}++;
5354          $self->{nc}
5355              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5356        } else {
5357          $self->{set_nc}->($self);
5358        }
5359      
5360            redo A;
5361          } elsif ($self->{nc} == 0x0025) { # %
5362            ## XML5: Not defined yet.
5363    
5364            ## TODO:
5365            
5366        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5367          $self->{line_prev} = $self->{line};
5368          $self->{column_prev} = $self->{column};
5369          $self->{column}++;
5370          $self->{nc}
5371              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5372        } else {
5373          $self->{set_nc}->($self);
5374        }
5375      
5376            redo A;
5377          } elsif ($self->{nc} == 0x005D) { # ]
5378            delete $self->{in_subset};
5379            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5380            
5381        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5382          $self->{line_prev} = $self->{line};
5383          $self->{column_prev} = $self->{column};
5384          $self->{column}++;
5385          $self->{nc}
5386              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5387        } else {
5388          $self->{set_nc}->($self);
5389        }
5390      
5391            redo A;
5392          } elsif ($is_space->{$self->{nc}}) {
5393            ## Stay in the state.
5394            
5395        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5396          $self->{line_prev} = $self->{line};
5397          $self->{column_prev} = $self->{column};
5398          $self->{column}++;
5399          $self->{nc}
5400              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5401        } else {
5402          $self->{set_nc}->($self);
5403        }
5404      
5405            redo A;
5406          } elsif ($self->{nc} == -1) {
5407            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5408            delete $self->{in_subset};
5409            $self->{state} = DATA_STATE;
5410            $self->{s_kwd} = '';
5411            ## Reconsume.
5412            return  ({type => END_OF_DOCTYPE_TOKEN});
5413            redo A;
5414          } else {
5415            unless ($self->{internal_subset_tainted}) {
5416              ## XML5: No parse error.
5417              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5418              $self->{internal_subset_tainted} = 1;
5419            }
5420            ## Stay in the state.
5421            
5422        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5423          $self->{line_prev} = $self->{line};
5424          $self->{column_prev} = $self->{column};
5425          $self->{column}++;
5426          $self->{nc}
5427              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5428        } else {
5429          $self->{set_nc}->($self);
5430        }
5431      
5432            redo A;
5433          }
5434        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5435          if ($self->{nc} == 0x003E) { # >
5436            $self->{state} = DATA_STATE;
5437            $self->{s_kwd} = '';
5438            
5439        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5440          $self->{line_prev} = $self->{line};
5441          $self->{column_prev} = $self->{column};
5442          $self->{column}++;
5443          $self->{nc}
5444              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5445        } else {
5446          $self->{set_nc}->($self);
5447        }
5448      
5449            return  ({type => END_OF_DOCTYPE_TOKEN});
5450            redo A;
5451          } elsif ($self->{nc} == -1) {
5452            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5453            $self->{state} = DATA_STATE;
5454            $self->{s_kwd} = '';
5455            ## Reconsume.
5456            return  ({type => END_OF_DOCTYPE_TOKEN});
5457            redo A;
5458          } else {
5459            ## XML5: No parse error and stay in the state.
5460            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5461    
5462            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5463            
5464        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5465          $self->{line_prev} = $self->{line};
5466          $self->{column_prev} = $self->{column};
5467          $self->{column}++;
5468          $self->{nc}
5469              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5470        } else {
5471          $self->{set_nc}->($self);
5472        }
5473      
5474            redo A;
5475          }
5476        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
5477          if ($self->{nc} == 0x003E) { # >
5478            $self->{state} = DATA_STATE;
5479            $self->{s_kwd} = '';
5480            
5481        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5482          $self->{line_prev} = $self->{line};
5483          $self->{column_prev} = $self->{column};
5484          $self->{column}++;
5485          $self->{nc}
5486              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5487        } else {
5488          $self->{set_nc}->($self);
5489        }
5490      
5491            return  ({type => END_OF_DOCTYPE_TOKEN});
5492            redo A;
5493          } elsif ($self->{nc} == -1) {
5494            $self->{state} = DATA_STATE;
5495            $self->{s_kwd} = '';
5496            ## Reconsume.
5497            return  ({type => END_OF_DOCTYPE_TOKEN});
5498            redo A;
5499          } else {
5500            ## Stay in the state.
5501            
5502        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5503          $self->{line_prev} = $self->{line};
5504          $self->{column_prev} = $self->{column};
5505          $self->{column}++;
5506          $self->{nc}
5507              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5508        } else {
5509          $self->{set_nc}->($self);
5510        }
5511      
5512            redo A;
5513          }
5514        } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5515          if ($self->{nc} == 0x0021) { # !
5516            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5517            
5518        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5519          $self->{line_prev} = $self->{line};
5520          $self->{column_prev} = $self->{column};
5521          $self->{column}++;
5522          $self->{nc}
5523              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5524        } else {
5525          $self->{set_nc}->($self);
5526        }
5527      
5528            redo A;
5529          } elsif ($self->{nc} == 0x003F) { # ?
5530            $self->{state} = PI_STATE;
5531            
5532        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5533          $self->{line_prev} = $self->{line};
5534          $self->{column_prev} = $self->{column};
5535          $self->{column}++;
5536          $self->{nc}
5537              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5538        } else {
5539          $self->{set_nc}->($self);
5540        }
5541      
5542            redo A;
5543          } elsif ($self->{nc} == -1) {
5544            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5545            $self->{state} = DATA_STATE;
5546            $self->{s_kwd} = '';
5547            ## Reconsume.
5548            redo A;
5549          } else {
5550            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5551                            line => $self->{line_prev},
5552                            column => $self->{column_prev});
5553            $self->{state} = BOGUS_COMMENT_STATE;
5554            $self->{ct} = {type => COMMENT_TOKEN,
5555                           data => '',
5556                          }; ## NOTE: Will be discarded.
5557            
5558        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5559          $self->{line_prev} = $self->{line};
5560          $self->{column_prev} = $self->{column};
5561          $self->{column}++;
5562          $self->{nc}
5563              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5564        } else {
5565          $self->{set_nc}->($self);
5566        }
5567      
5568            redo A;
5569          }
5570        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5571          ## XML5: "DOCTYPE markup declaration state".
5572          
5573          if ($self->{nc} == 0x002D) { # -
5574            $self->{state} = MD_HYPHEN_STATE;
5575            
5576        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5577          $self->{line_prev} = $self->{line};
5578          $self->{column_prev} = $self->{column};
5579          $self->{column}++;
5580          $self->{nc}
5581              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5582        } else {
5583          $self->{set_nc}->($self);
5584        }
5585      
5586            redo A;
5587          } elsif ($self->{nc} == 0x0045 or # E
5588                   $self->{nc} == 0x0065) { # e
5589            $self->{state} = MD_E_STATE;
5590            $self->{kwd} = chr $self->{nc};
5591            
5592        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5593          $self->{line_prev} = $self->{line};
5594          $self->{column_prev} = $self->{column};
5595          $self->{column}++;
5596          $self->{nc}
5597              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5598        } else {
5599          $self->{set_nc}->($self);
5600        }
5601      
5602            redo A;
5603          } elsif ($self->{nc} == 0x0041 or # A
5604                   $self->{nc} == 0x0061) { # a
5605            $self->{state} = MD_ATTLIST_STATE;
5606            $self->{kwd} = chr $self->{nc};
5607            
5608        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5609          $self->{line_prev} = $self->{line};
5610          $self->{column_prev} = $self->{column};
5611          $self->{column}++;
5612          $self->{nc}
5613              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5614        } else {
5615          $self->{set_nc}->($self);
5616        }
5617      
5618            redo A;
5619          } elsif ($self->{nc} == 0x004E or # N
5620                   $self->{nc} == 0x006E) { # n
5621            $self->{state} = MD_NOTATION_STATE;
5622            $self->{kwd} = chr $self->{nc};
5623            
5624        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5625          $self->{line_prev} = $self->{line};
5626          $self->{column_prev} = $self->{column};
5627          $self->{column}++;
5628          $self->{nc}
5629              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5630        } else {
5631          $self->{set_nc}->($self);
5632        }
5633      
5634            redo A;
5635          } else {
5636            #
5637          }
5638          
5639          ## XML5: No parse error.
5640          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5641                          line => $self->{line_prev},
5642                          column => $self->{column_prev} - 1);
5643          ## Reconsume.
5644          $self->{state} = BOGUS_COMMENT_STATE;
5645          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5646          redo A;
5647        } elsif ($self->{state} == MD_E_STATE) {
5648          if ($self->{nc} == 0x004E or # N
5649              $self->{nc} == 0x006E) { # n
5650            $self->{state} = MD_ENTITY_STATE;
5651            $self->{kwd} .= chr $self->{nc};
5652            
5653        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5654          $self->{line_prev} = $self->{line};
5655          $self->{column_prev} = $self->{column};
5656          $self->{column}++;
5657          $self->{nc}
5658              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5659        } else {
5660          $self->{set_nc}->($self);
5661        }
5662      
5663            redo A;
5664          } elsif ($self->{nc} == 0x004C or # L
5665                   $self->{nc} == 0x006C) { # l
5666            ## XML5: <!ELEMENT> not supported.
5667            $self->{state} = MD_ELEMENT_STATE;
5668            $self->{kwd} .= chr $self->{nc};
5669            
5670        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5671          $self->{line_prev} = $self->{line};
5672          $self->{column_prev} = $self->{column};
5673          $self->{column}++;
5674          $self->{nc}
5675              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5676        } else {
5677          $self->{set_nc}->($self);
5678        }
5679      
5680            redo A;
5681          } else {
5682            ## XML5: No parse error.
5683            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5684                            line => $self->{line_prev},
5685                            column => $self->{column_prev} - 2
5686                                + 1 * ($self->{nc} == -1));
5687            ## Reconsume.
5688            $self->{state} = BOGUS_COMMENT_STATE;
5689            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5690            redo A;
5691          }
5692        } elsif ($self->{state} == MD_ENTITY_STATE) {
5693          if ($self->{nc} == [
5694                undef,
5695                undef,
5696                0x0054, # T
5697                0x0049, # I
5698                0x0054, # T
5699              ]->[length $self->{kwd}] or
5700              $self->{nc} == [
5701                undef,
5702                undef,
5703                0x0074, # t
5704                0x0069, # i
5705                0x0074, # t
5706              ]->[length $self->{kwd}]) {
5707            ## Stay in the state.
5708            $self->{kwd} .= chr $self->{nc};
5709            
5710        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5711          $self->{line_prev} = $self->{line};
5712          $self->{column_prev} = $self->{column};
5713          $self->{column}++;
5714          $self->{nc}
5715              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5716        } else {
5717          $self->{set_nc}->($self);
5718        }
5719      
5720            redo A;
5721          } elsif ((length $self->{kwd}) == 5 and
5722                   ($self->{nc} == 0x0059 or # Y
5723                    $self->{nc} == 0x0079)) { # y
5724            if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) {
5725              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5726                              text => 'ENTITY',
5727                              line => $self->{line_prev},
5728                              column => $self->{column_prev} - 4);
5729            }
5730            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '',
5731                           line => $self->{line_prev},
5732                           column => $self->{column_prev} - 6};
5733            $self->{state} = DOCTYPE_MD_STATE;
5734            
5735        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5736          $self->{line_prev} = $self->{line};
5737          $self->{column_prev} = $self->{column};
5738          $self->{column}++;
5739          $self->{nc}
5740              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5741        } else {
5742          $self->{set_nc}->($self);
5743        }
5744      
5745            redo A;
5746          } else {
5747            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5748                            line => $self->{line_prev},
5749                            column => $self->{column_prev} - 1
5750                                - (length $self->{kwd})
5751                                + 1 * ($self->{nc} == -1));
5752            $self->{state} = BOGUS_COMMENT_STATE;
5753            ## Reconsume.
5754            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5755            redo A;
5756          }
5757        } elsif ($self->{state} == MD_ELEMENT_STATE) {
5758          if ($self->{nc} == [
5759               undef,
5760               undef,
5761               0x0045, # E
5762               0x004D, # M
5763               0x0045, # E
5764               0x004E, # N
5765              ]->[length $self->{kwd}] or
5766              $self->{nc} == [
5767               undef,
5768               undef,
5769               0x0065, # e
5770               0x006D, # m
5771               0x0065, # e
5772               0x006E, # n
5773              ]->[length $self->{kwd}]) {
5774            ## Stay in the state.
5775            $self->{kwd} .= chr $self->{nc};
5776            
5777        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5778          $self->{line_prev} = $self->{line};
5779          $self->{column_prev} = $self->{column};
5780          $self->{column}++;
5781          $self->{nc}
5782              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5783        } else {
5784          $self->{set_nc}->($self);
5785        }
5786      
5787            redo A;
5788          } elsif ((length $self->{kwd}) == 6 and
5789                   ($self->{nc} == 0x0054 or # T
5790                    $self->{nc} == 0x0074)) { # t
5791            if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) {
5792              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5793                              text => 'ELEMENT',
5794                              line => $self->{line_prev},
5795                              column => $self->{column_prev} - 5);
5796            }
5797            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5798                           line => $self->{line_prev},
5799                           column => $self->{column_prev} - 6};
5800            $self->{state} = DOCTYPE_MD_STATE;
5801            
5802        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5803          $self->{line_prev} = $self->{line};
5804          $self->{column_prev} = $self->{column};
5805          $self->{column}++;
5806          $self->{nc}
5807              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5808        } else {
5809          $self->{set_nc}->($self);
5810        }
5811      
5812            redo A;
5813          } else {
5814            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5815                            line => $self->{line_prev},
5816                            column => $self->{column_prev} - 1
5817                                - (length $self->{kwd})
5818                                + 1 * ($self->{nc} == -1));
5819            $self->{state} = BOGUS_COMMENT_STATE;
5820            ## Reconsume.
5821            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5822            redo A;
5823          }
5824        } elsif ($self->{state} == MD_ATTLIST_STATE) {
5825          if ($self->{nc} == [
5826               undef,
5827               0x0054, # T
5828               0x0054, # T
5829               0x004C, # L
5830               0x0049, # I
5831               0x0053, # S
5832              ]->[length $self->{kwd}] or
5833              $self->{nc} == [
5834               undef,
5835               0x0074, # t
5836               0x0074, # t
5837               0x006C, # l
5838               0x0069, # i
5839               0x0073, # s
5840              ]->[length $self->{kwd}]) {
5841            ## Stay in the state.
5842            $self->{kwd} .= chr $self->{nc};
5843            
5844        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5845          $self->{line_prev} = $self->{line};
5846          $self->{column_prev} = $self->{column};
5847          $self->{column}++;
5848          $self->{nc}
5849              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5850        } else {
5851          $self->{set_nc}->($self);
5852        }
5853      
5854            redo A;
5855          } elsif ((length $self->{kwd}) == 6 and
5856                   ($self->{nc} == 0x0054 or # T
5857                    $self->{nc} == 0x0074)) { # t
5858            if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) {
5859              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5860                              text => 'ATTLIST',
5861                              line => $self->{line_prev},
5862                              column => $self->{column_prev} - 5);
5863            }
5864            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
5865                           attrdefs => [],
5866                           line => $self->{line_prev},
5867                           column => $self->{column_prev} - 6};
5868            $self->{state} = DOCTYPE_MD_STATE;
5869            
5870        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5871          $self->{line_prev} = $self->{line};
5872          $self->{column_prev} = $self->{column};
5873          $self->{column}++;
5874          $self->{nc}
5875              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5876        } else {
5877          $self->{set_nc}->($self);
5878        }
5879      
5880            redo A;
5881          } else {
5882            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5883                            line => $self->{line_prev},
5884                            column => $self->{column_prev} - 1
5885                                 - (length $self->{kwd})
5886                                 + 1 * ($self->{nc} == -1));
5887            $self->{state} = BOGUS_COMMENT_STATE;
5888            ## Reconsume.
5889            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5890            redo A;
5891          }
5892        } elsif ($self->{state} == MD_NOTATION_STATE) {
5893          if ($self->{nc} == [
5894               undef,
5895               0x004F, # O
5896               0x0054, # T
5897               0x0041, # A
5898               0x0054, # T
5899               0x0049, # I
5900               0x004F, # O
5901              ]->[length $self->{kwd}] or
5902              $self->{nc} == [
5903               undef,
5904               0x006F, # o
5905               0x0074, # t
5906               0x0061, # a
5907               0x0074, # t
5908               0x0069, # i
5909               0x006F, # o
5910              ]->[length $self->{kwd}]) {
5911            ## Stay in the state.
5912            $self->{kwd} .= chr $self->{nc};
5913            
5914        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5915          $self->{line_prev} = $self->{line};
5916          $self->{column_prev} = $self->{column};
5917          $self->{column}++;
5918          $self->{nc}
5919              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5920        } else {
5921          $self->{set_nc}->($self);
5922        }
5923      
5924            redo A;
5925          } elsif ((length $self->{kwd}) == 7 and
5926                   ($self->{nc} == 0x004E or # N
5927                    $self->{nc} == 0x006E)) { # n
5928            if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) {
5929              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5930                              text => 'NOTATION',
5931                              line => $self->{line_prev},
5932                              column => $self->{column_prev} - 6);
5933            }
5934            $self->{ct} = {type => NOTATION_TOKEN, name => '',
5935                           line => $self->{line_prev},
5936                           column => $self->{column_prev} - 6};
5937            $self->{state} = DOCTYPE_MD_STATE;
5938            
5939        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5940          $self->{line_prev} = $self->{line};
5941          $self->{column_prev} = $self->{column};
5942          $self->{column}++;
5943          $self->{nc}
5944              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5945        } else {
5946          $self->{set_nc}->($self);
5947        }
5948      
5949            redo A;
5950          } else {
5951            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5952                            line => $self->{line_prev},
5953                            column => $self->{column_prev} - 1
5954                                - (length $self->{kwd})
5955                                + 1 * ($self->{nc} == -1));
5956            $self->{state} = BOGUS_COMMENT_STATE;
5957            ## Reconsume.
5958            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5959            redo A;
5960          }
5961        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
5962          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
5963          ## "DOCTYPE NOTATION state".
5964    
5965          if ($is_space->{$self->{nc}}) {
5966            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
5967            $self->{state} = BEFORE_MD_NAME_STATE;
5968            
5969        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5970          $self->{line_prev} = $self->{line};
5971          $self->{column_prev} = $self->{column};
5972          $self->{column}++;
5973          $self->{nc}
5974              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5975        } else {
5976          $self->{set_nc}->($self);
5977        }
5978      
5979            redo A;
5980          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
5981                   $self->{nc} == 0x0025) { # %
5982            ## XML5: Switch to the "DOCTYPE bogus comment state".
5983            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
5984            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
5985            
5986        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5987          $self->{line_prev} = $self->{line};
5988          $self->{column_prev} = $self->{column};
5989          $self->{column}++;
5990          $self->{nc}
5991              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5992        } else {
5993          $self->{set_nc}->($self);
5994        }
5995      
5996            redo A;
5997          } elsif ($self->{nc} == -1) {
5998            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
5999            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6000            ## Reconsume.
6001            redo A;
6002          } elsif ($self->{nc} == 0x003E) { # >
6003            ## XML5: Switch to the "DOCTYPE bogus comment state".
6004            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6005            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6006            
6007        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6008          $self->{line_prev} = $self->{line};
6009          $self->{column_prev} = $self->{column};
6010          $self->{column}++;
6011          $self->{nc}
6012              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6013        } else {
6014          $self->{set_nc}->($self);
6015        }
6016      
6017            redo A;
6018          } else {
6019            ## XML5: Switch to the "DOCTYPE bogus comment state".
6020            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6021            $self->{state} = BEFORE_MD_NAME_STATE;
6022            redo A;
6023          }
6024        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
6025          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6026          ## before state", "DOCTYPE ATTLIST name before state".
6027    
6028          if ($is_space->{$self->{nc}}) {
6029            ## Stay in the state.
6030            
6031        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6032          $self->{line_prev} = $self->{line};
6033          $self->{column_prev} = $self->{column};
6034          $self->{column}++;
6035          $self->{nc}
6036              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6037        } else {
6038          $self->{set_nc}->($self);
6039        }
6040      
6041            redo A;
6042          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6043                   $self->{nc} == 0x0025) { # %
6044            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
6045            
6046        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6047          $self->{line_prev} = $self->{line};
6048          $self->{column_prev} = $self->{column};
6049          $self->{column}++;
6050          $self->{nc}
6051              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6052        } else {
6053          $self->{set_nc}->($self);
6054        }
6055      
6056            redo A;
6057          } elsif ($self->{nc} == 0x003E) { # >
6058            ## XML5: Same as "Anything else".
6059            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6060            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6061            
6062        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6063          $self->{line_prev} = $self->{line};
6064          $self->{column_prev} = $self->{column};
6065          $self->{column}++;
6066          $self->{nc}
6067              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6068        } else {
6069          $self->{set_nc}->($self);
6070        }
6071      
6072            redo A;
6073          } elsif ($self->{nc} == -1) {
6074            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6075            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6076            ## Reconsume.
6077            redo A;
6078          } else {
6079            ## XML5: [ATTLIST] Not defined yet.
6080            $self->{ct}->{name} .= chr $self->{nc};
6081            $self->{state} = MD_NAME_STATE;
6082            
6083        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6084          $self->{line_prev} = $self->{line};
6085          $self->{column_prev} = $self->{column};
6086          $self->{column}++;
6087          $self->{nc}
6088              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6089        } else {
6090          $self->{set_nc}->($self);
6091        }
6092      
6093            redo A;
6094          }
6095        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
6096          if ($is_space->{$self->{nc}}) {
6097            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6098            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
6099            $self->{state} = BEFORE_MD_NAME_STATE;
6100            
6101        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6102          $self->{line_prev} = $self->{line};
6103          $self->{column_prev} = $self->{column};
6104          $self->{column}++;
6105          $self->{nc}
6106              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6107        } else {
6108          $self->{set_nc}->($self);
6109        }
6110      
6111            redo A;
6112          } elsif ($self->{nc} == 0x003E) { # >
6113            ## XML5: Same as "Anything else".
6114            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6115            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6116            
6117        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6118          $self->{line_prev} = $self->{line};
6119          $self->{column_prev} = $self->{column};
6120          $self->{column}++;
6121          $self->{nc}
6122              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6123        } else {
6124          $self->{set_nc}->($self);
6125        }
6126      
6127            redo A;
6128          } elsif ($self->{nc} == -1) {
6129            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6130            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6131            ## Reconsume.
6132            redo A;
6133          } else {
6134            ## XML5: No parse error.
6135            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6136            $self->{state} = BOGUS_COMMENT_STATE;
6137            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6138            ## Reconsume.
6139            redo A;
6140          }
6141        } elsif ($self->{state} == MD_NAME_STATE) {
6142          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6143          
6144          if ($is_space->{$self->{nc}}) {
6145            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6146              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6147            } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6148              ## TODO: ...
6149              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6150            } else { # ENTITY/NOTATION
6151              $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6152            }
6153            
6154        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6155          $self->{line_prev} = $self->{line};
6156          $self->{column_prev} = $self->{column};
6157          $self->{column}++;
6158          $self->{nc}
6159              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6160        } else {
6161          $self->{set_nc}->($self);
6162        }
6163      
6164            redo A;
6165          } elsif ($self->{nc} == 0x003E) { # >
6166            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6167              #
6168            } else {
6169              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6170            }
6171            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6172            
6173        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6174          $self->{line_prev} = $self->{line};
6175          $self->{column_prev} = $self->{column};
6176          $self->{column}++;
6177          $self->{nc}
6178              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6179        } else {
6180          $self->{set_nc}->($self);
6181        }
6182      
6183            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6184            redo A;
6185          } elsif ($self->{nc} == -1) {
6186            ## XML5: [ATTLIST] No parse error.
6187            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6188            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6189            ## Reconsume.
6190            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6191            redo A;
6192          } else {
6193            ## XML5: [ATTLIST] Not defined yet.
6194            $self->{ct}->{name} .= chr $self->{nc};
6195            ## Stay in the state.
6196            
6197        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6198          $self->{line_prev} = $self->{line};
6199          $self->{column_prev} = $self->{column};
6200          $self->{column}++;
6201          $self->{nc}
6202              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6203        } else {
6204          $self->{set_nc}->($self);
6205        }
6206      
6207            redo A;
6208          }
6209        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6210          if ($is_space->{$self->{nc}}) {
6211            ## Stay in the state.
6212            
6213        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6214          $self->{line_prev} = $self->{line};
6215          $self->{column_prev} = $self->{column};
6216          $self->{column}++;
6217          $self->{nc}
6218              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6219        } else {
6220          $self->{set_nc}->($self);
6221        }
6222      
6223            redo A;
6224          } elsif ($self->{nc} == 0x003E) { # >
6225            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6226            
6227        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6228          $self->{line_prev} = $self->{line};
6229          $self->{column_prev} = $self->{column};
6230          $self->{column}++;
6231          $self->{nc}
6232              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6233        } else {
6234          $self->{set_nc}->($self);
6235        }
6236      
6237            return  ($self->{ct}); # ATTLIST
6238            redo A;
6239          } elsif ($self->{nc} == -1) {
6240            ## XML5: No parse error.
6241            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6242            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6243            return  ($self->{ct});
6244            redo A;
6245          } else {
6246            ## XML5: Not defined yet.
6247            $self->{ca} = {name => chr ($self->{nc}), # attrdef
6248                           tokens => [],
6249                           line => $self->{line}, column => $self->{column}};
6250            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6251            
6252        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6253          $self->{line_prev} = $self->{line};
6254          $self->{column_prev} = $self->{column};
6255          $self->{column}++;
6256          $self->{nc}
6257              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6258        } else {
6259          $self->{set_nc}->($self);
6260        }
6261      
6262            redo A;
6263          }
6264        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6265          if ($is_space->{$self->{nc}}) {
6266            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6267            
6268        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6269          $self->{line_prev} = $self->{line};
6270          $self->{column_prev} = $self->{column};
6271          $self->{column}++;
6272          $self->{nc}
6273              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6274        } else {
6275          $self->{set_nc}->($self);
6276        }
6277      
6278            redo A;
6279          } elsif ($self->{nc} == 0x003E) { # >
6280            ## XML5: Same as "anything else".
6281            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6282            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6283            
6284        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6285          $self->{line_prev} = $self->{line};
6286          $self->{column_prev} = $self->{column};
6287          $self->{column}++;
6288          $self->{nc}
6289              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6290        } else {
6291          $self->{set_nc}->($self);
6292        }
6293      
6294            return  ($self->{ct}); # ATTLIST
6295            redo A;
6296          } elsif ($self->{nc} == 0x0028) { # (
6297            ## XML5: Same as "anything else".
6298            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6299            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6300            
6301        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6302          $self->{line_prev} = $self->{line};
6303          $self->{column_prev} = $self->{column};
6304          $self->{column}++;
6305          $self->{nc}
6306              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6307        } else {
6308          $self->{set_nc}->($self);
6309        }
6310      
6311            redo A;
6312          } elsif ($self->{nc} == -1) {
6313            ## XML5: No parse error.
6314            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6315            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6316            
6317        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6318          $self->{line_prev} = $self->{line};
6319          $self->{column_prev} = $self->{column};
6320          $self->{column}++;
6321          $self->{nc}
6322              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6323        } else {
6324          $self->{set_nc}->($self);
6325        }
6326      
6327            return  ($self->{ct}); # ATTLIST
6328            redo A;
6329          } else {
6330            ## XML5: Not defined yet.
6331            $self->{ca}->{name} .= chr $self->{nc};
6332            ## Stay in the state.
6333            
6334        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6335          $self->{line_prev} = $self->{line};
6336          $self->{column_prev} = $self->{column};
6337          $self->{column}++;
6338          $self->{nc}
6339              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6340        } else {
6341          $self->{set_nc}->($self);
6342        }
6343      
6344            redo A;
6345          }
6346        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6347          if ($is_space->{$self->{nc}}) {
6348            ## Stay in the state.
6349            
6350        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6351          $self->{line_prev} = $self->{line};
6352          $self->{column_prev} = $self->{column};
6353          $self->{column}++;
6354          $self->{nc}
6355              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6356        } else {
6357          $self->{set_nc}->($self);
6358        }
6359      
6360            redo A;
6361          } elsif ($self->{nc} == 0x003E) { # >
6362            ## XML5: Same as "anything else".
6363            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6364            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6365            
6366        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6367          $self->{line_prev} = $self->{line};
6368          $self->{column_prev} = $self->{column};
6369          $self->{column}++;
6370          $self->{nc}
6371              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6372        } else {
6373          $self->{set_nc}->($self);
6374        }
6375      
6376            return  ($self->{ct}); # ATTLIST
6377            redo A;
6378          } elsif ($self->{nc} == 0x0028) { # (
6379            ## XML5: Same as "anything else".
6380            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6381            
6382        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6383          $self->{line_prev} = $self->{line};
6384          $self->{column_prev} = $self->{column};
6385          $self->{column}++;
6386          $self->{nc}
6387              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6388        } else {
6389          $self->{set_nc}->($self);
6390        }
6391      
6392            redo A;
6393          } elsif ($self->{nc} == -1) {
6394            ## XML5: No parse error.
6395            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6396            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6397            
6398        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6399          $self->{line_prev} = $self->{line};
6400          $self->{column_prev} = $self->{column};
6401          $self->{column}++;
6402          $self->{nc}
6403              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6404        } else {
6405          $self->{set_nc}->($self);
6406        }
6407      
6408            return  ($self->{ct});
6409            redo A;
6410          } else {
6411            ## XML5: Not defined yet.
6412            $self->{ca}->{type} = chr $self->{nc};
6413            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6414            
6415        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6416          $self->{line_prev} = $self->{line};
6417          $self->{column_prev} = $self->{column};
6418          $self->{column}++;
6419          $self->{nc}
6420              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6421        } else {
6422          $self->{set_nc}->($self);
6423        }
6424      
6425            redo A;
6426          }
6427        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6428          if ($is_space->{$self->{nc}}) {
6429            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6430            
6431        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6432          $self->{line_prev} = $self->{line};
6433          $self->{column_prev} = $self->{column};
6434          $self->{column}++;
6435          $self->{nc}
6436              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6437        } else {
6438          $self->{set_nc}->($self);
6439        }
6440      
6441            redo A;
6442          } elsif ($self->{nc} == 0x0023) { # #
6443            ## XML5: Same as "anything else".
6444            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6445            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6446            
6447        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6448          $self->{line_prev} = $self->{line};
6449          $self->{column_prev} = $self->{column};
6450          $self->{column}++;
6451          $self->{nc}
6452              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6453        } else {
6454          $self->{set_nc}->($self);
6455        }
6456      
6457            redo A;
6458          } elsif ($self->{nc} == 0x0022) { # "
6459            ## XML5: Same as "anything else".
6460            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6461            $self->{ca}->{value} = '';
6462            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6463            
6464        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6465          $self->{line_prev} = $self->{line};
6466          $self->{column_prev} = $self->{column};
6467          $self->{column}++;
6468          $self->{nc}
6469              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6470        } else {
6471          $self->{set_nc}->($self);
6472        }
6473      
6474            redo A;
6475          } elsif ($self->{nc} == 0x0027) { # '
6476            ## XML5: Same as "anything else".
6477            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6478            $self->{ca}->{value} = '';
6479            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6480            
6481        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6482          $self->{line_prev} = $self->{line};
6483          $self->{column_prev} = $self->{column};
6484          $self->{column}++;
6485          $self->{nc}
6486              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6487        } else {
6488          $self->{set_nc}->($self);
6489        }
6490      
6491            redo A;
6492          } elsif ($self->{nc} == 0x003E) { # >
6493            ## XML5: Same as "anything else".
6494            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6495            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6496            
6497        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6498          $self->{line_prev} = $self->{line};
6499          $self->{column_prev} = $self->{column};
6500          $self->{column}++;
6501          $self->{nc}
6502              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6503        } else {
6504          $self->{set_nc}->($self);
6505        }
6506      
6507            return  ($self->{ct}); # ATTLIST
6508            redo A;
6509          } elsif ($self->{nc} == 0x0028) { # (
6510            ## XML5: Same as "anything else".
6511            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6512            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6513            
6514        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6515          $self->{line_prev} = $self->{line};
6516          $self->{column_prev} = $self->{column};
6517          $self->{column}++;
6518          $self->{nc}
6519              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6520        } else {
6521          $self->{set_nc}->($self);
6522        }
6523      
6524            redo A;
6525          } elsif ($self->{nc} == -1) {
6526            ## XML5: No parse error.
6527            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6528            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6529            
6530        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6531          $self->{line_prev} = $self->{line};
6532          $self->{column_prev} = $self->{column};
6533          $self->{column}++;
6534          $self->{nc}
6535              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6536        } else {
6537          $self->{set_nc}->($self);
6538        }
6539      
6540            return  ($self->{ct});
6541            redo A;
6542          } else {
6543            ## XML5: Not defined yet.
6544            $self->{ca}->{type} .= chr $self->{nc};
6545            ## Stay in the state.
6546            
6547        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6548          $self->{line_prev} = $self->{line};
6549          $self->{column_prev} = $self->{column};
6550          $self->{column}++;
6551          $self->{nc}
6552              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6553        } else {
6554          $self->{set_nc}->($self);
6555        }
6556      
6557            redo A;
6558          }
6559        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6560          if ($is_space->{$self->{nc}}) {
6561            ## Stay in the state.
6562            
6563        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6564          $self->{line_prev} = $self->{line};
6565          $self->{column_prev} = $self->{column};
6566          $self->{column}++;
6567          $self->{nc}
6568              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6569        } else {
6570          $self->{set_nc}->($self);
6571        }
6572      
6573            redo A;
6574          } elsif ($self->{nc} == 0x0028) { # (
6575            ## XML5: Same as "anything else".
6576            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6577            
6578        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6579          $self->{line_prev} = $self->{line};
6580          $self->{column_prev} = $self->{column};
6581          $self->{column}++;
6582          $self->{nc}
6583              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6584        } else {
6585          $self->{set_nc}->($self);
6586        }
6587      
6588            redo A;
6589          } elsif ($self->{nc} == 0x0023) { # #
6590            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6591            
6592        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6593          $self->{line_prev} = $self->{line};
6594          $self->{column_prev} = $self->{column};
6595          $self->{column}++;
6596          $self->{nc}
6597              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6598        } else {
6599          $self->{set_nc}->($self);
6600        }
6601      
6602            redo A;
6603          } elsif ($self->{nc} == 0x0022) { # "
6604            ## XML5: Same as "anything else".
6605            $self->{ca}->{value} = '';
6606            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6607            
6608        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6609          $self->{line_prev} = $self->{line};
6610          $self->{column_prev} = $self->{column};
6611          $self->{column}++;
6612          $self->{nc}
6613              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6614        } else {
6615          $self->{set_nc}->($self);
6616        }
6617      
6618            redo A;
6619          } elsif ($self->{nc} == 0x0027) { # '
6620            ## XML5: Same as "anything else".
6621            $self->{ca}->{value} = '';
6622            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6623            
6624        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6625          $self->{line_prev} = $self->{line};
6626          $self->{column_prev} = $self->{column};
6627          $self->{column}++;
6628          $self->{nc}
6629              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6630        } else {
6631          $self->{set_nc}->($self);
6632        }
6633      
6634            redo A;
6635          } elsif ($self->{nc} == 0x003E) { # >
6636            ## XML5: Same as "anything else".
6637            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6638            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6639            
6640        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6641          $self->{line_prev} = $self->{line};
6642          $self->{column_prev} = $self->{column};
6643          $self->{column}++;
6644          $self->{nc}
6645              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6646        } else {
6647          $self->{set_nc}->($self);
6648        }
6649      
6650            return  ($self->{ct}); # ATTLIST
6651            redo A;
6652          } elsif ($self->{nc} == -1) {
6653            ## XML5: No parse error.
6654            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6655            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6656            
6657        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6658          $self->{line_prev} = $self->{line};
6659          $self->{column_prev} = $self->{column};
6660          $self->{column}++;
6661          $self->{nc}
6662              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6663        } else {
6664          $self->{set_nc}->($self);
6665        }
6666      
6667            return  ($self->{ct});
6668            redo A;
6669          } else {
6670            ## XML5: Switch to the "DOCTYPE bogus comment state".
6671            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6672            $self->{ca}->{value} = '';
6673            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6674            ## Reconsume.
6675            redo A;
6676          }
6677        } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6678          if ($is_space->{$self->{nc}}) {
6679            ## Stay in the state.
6680            
6681        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6682          $self->{line_prev} = $self->{line};
6683          $self->{column_prev} = $self->{column};
6684          $self->{column}++;
6685          $self->{nc}
6686              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6687        } else {
6688          $self->{set_nc}->($self);
6689        }
6690      
6691            redo A;
6692          } elsif ($self->{nc} == 0x007C) { # |
6693            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6694            ## Stay in the state.
6695            
6696        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6697          $self->{line_prev} = $self->{line};
6698          $self->{column_prev} = $self->{column};
6699          $self->{column}++;
6700          $self->{nc}
6701              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6702        } else {
6703          $self->{set_nc}->($self);
6704        }
6705      
6706            redo A;
6707          } elsif ($self->{nc} == 0x0029) { # )
6708            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6709            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6710            
6711        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6712          $self->{line_prev} = $self->{line};
6713          $self->{column_prev} = $self->{column};
6714          $self->{column}++;
6715          $self->{nc}
6716              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6717        } else {
6718          $self->{set_nc}->($self);
6719        }
6720      
6721            redo A;
6722          } elsif ($self->{nc} == 0x003E) { # >
6723            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6724            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6725            
6726        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6727          $self->{line_prev} = $self->{line};
6728          $self->{column_prev} = $self->{column};
6729          $self->{column}++;
6730          $self->{nc}
6731              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6732        } else {
6733          $self->{set_nc}->($self);
6734        }
6735      
6736            return  ($self->{ct}); # ATTLIST
6737            redo A;
6738          } elsif ($self->{nc} == -1) {
6739            ## XML5: No parse error.
6740            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6741            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6742            
6743        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6744          $self->{line_prev} = $self->{line};
6745          $self->{column_prev} = $self->{column};
6746          $self->{column}++;
6747          $self->{nc}
6748              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6749        } else {
6750          $self->{set_nc}->($self);
6751        }
6752      
6753            return  ($self->{ct});
6754            redo A;
6755          } else {
6756            push @{$self->{ca}->{tokens}}, chr $self->{nc};
6757            $self->{state} = ALLOWED_TOKEN_STATE;
6758            
6759        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6760          $self->{line_prev} = $self->{line};
6761          $self->{column_prev} = $self->{column};
6762          $self->{column}++;
6763          $self->{nc}
6764              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6765        } else {
6766          $self->{set_nc}->($self);
6767        }
6768      
6769            redo A;
6770          }
6771        } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6772          if ($is_space->{$self->{nc}}) {
6773            $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6774            
6775        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6776          $self->{line_prev} = $self->{line};
6777          $self->{column_prev} = $self->{column};
6778          $self->{column}++;
6779          $self->{nc}
6780              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6781        } else {
6782          $self->{set_nc}->($self);
6783        }
6784      
6785            redo A;
6786          } elsif ($self->{nc} == 0x007C) { # |
6787            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6788            
6789        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6790          $self->{line_prev} = $self->{line};
6791          $self->{column_prev} = $self->{column};
6792          $self->{column}++;
6793          $self->{nc}
6794              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6795        } else {
6796          $self->{set_nc}->($self);
6797        }
6798      
6799            redo A;
6800          } elsif ($self->{nc} == 0x0029) { # )
6801            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6802            
6803        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6804          $self->{line_prev} = $self->{line};
6805          $self->{column_prev} = $self->{column};
6806          $self->{column}++;
6807          $self->{nc}
6808              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6809        } else {
6810          $self->{set_nc}->($self);
6811        }
6812      
6813            redo A;
6814          } elsif ($self->{nc} == 0x003E) { # >
6815            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6816            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6817            
6818        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6819          $self->{line_prev} = $self->{line};
6820          $self->{column_prev} = $self->{column};
6821          $self->{column}++;
6822          $self->{nc}
6823              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6824        } else {
6825          $self->{set_nc}->($self);
6826        }
6827      
6828            return  ($self->{ct}); # ATTLIST
6829            redo A;
6830          } elsif ($self->{nc} == -1) {
6831            ## XML5: No parse error.
6832            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6833            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6834            
6835        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6836          $self->{line_prev} = $self->{line};
6837          $self->{column_prev} = $self->{column};
6838          $self->{column}++;
6839          $self->{nc}
6840              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6841        } else {
6842          $self->{set_nc}->($self);
6843        }
6844      
6845            return  ($self->{ct});
6846            redo A;
6847          } else {
6848            $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
6849            ## Stay in the state.
6850            
6851        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6852          $self->{line_prev} = $self->{line};
6853          $self->{column_prev} = $self->{column};
6854          $self->{column}++;
6855          $self->{nc}
6856              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6857        } else {
6858          $self->{set_nc}->($self);
6859        }
6860      
6861            redo A;
6862          }
6863        } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
6864          if ($is_space->{$self->{nc}}) {
6865            ## Stay in the state.
6866            
6867        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6868          $self->{line_prev} = $self->{line};
6869          $self->{column_prev} = $self->{column};
6870          $self->{column}++;
6871          $self->{nc}
6872              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6873        } else {
6874          $self->{set_nc}->($self);
6875        }
6876      
6877            redo A;
6878          } elsif ($self->{nc} == 0x007C) { # |
6879            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6880            
6881        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6882          $self->{line_prev} = $self->{line};
6883          $self->{column_prev} = $self->{column};
6884          $self->{column}++;
6885          $self->{nc}
6886              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6887        } else {
6888          $self->{set_nc}->($self);
6889        }
6890      
6891            redo A;
6892          } elsif ($self->{nc} == 0x0029) { # )
6893            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6894            
6895        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6896          $self->{line_prev} = $self->{line};
6897          $self->{column_prev} = $self->{column};
6898          $self->{column}++;
6899          $self->{nc}
6900              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6901        } else {
6902          $self->{set_nc}->($self);
6903        }
6904      
6905            redo A;
6906          } elsif ($self->{nc} == 0x003E) { # >
6907            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6908            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6909            
6910        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6911          $self->{line_prev} = $self->{line};
6912          $self->{column_prev} = $self->{column};
6913          $self->{column}++;
6914          $self->{nc}
6915              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6916        } else {
6917          $self->{set_nc}->($self);
6918        }
6919      
6920            return  ($self->{ct}); # ATTLIST
6921            redo A;
6922          } elsif ($self->{nc} == -1) {
6923            ## XML5: No parse error.
6924            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6925            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6926            
6927        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6928          $self->{line_prev} = $self->{line};
6929          $self->{column_prev} = $self->{column};
6930          $self->{column}++;
6931          $self->{nc}
6932              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6933        } else {
6934          $self->{set_nc}->($self);
6935        }
6936      
6937            return  ($self->{ct});
6938            redo A;
6939          } else {
6940            $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
6941                            line => $self->{line_prev},
6942                            column => $self->{column_prev});
6943            $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
6944            $self->{state} = ALLOWED_TOKEN_STATE;
6945            
6946        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6947          $self->{line_prev} = $self->{line};
6948          $self->{column_prev} = $self->{column};
6949          $self->{column}++;
6950          $self->{nc}
6951              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6952        } else {
6953          $self->{set_nc}->($self);
6954        }
6955      
6956            redo A;
6957          }
6958        } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
6959          if ($is_space->{$self->{nc}}) {
6960            $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
6961            
6962        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6963          $self->{line_prev} = $self->{line};
6964          $self->{column_prev} = $self->{column};
6965          $self->{column}++;
6966          $self->{nc}
6967              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6968        } else {
6969          $self->{set_nc}->($self);
6970        }
6971      
6972            redo A;
6973          } elsif ($self->{nc} == 0x0023) { # #
6974            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6975            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6976            
6977        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6978          $self->{line_prev} = $self->{line};
6979          $self->{column_prev} = $self->{column};
6980          $self->{column}++;
6981          $self->{nc}
6982              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6983        } else {
6984          $self->{set_nc}->($self);
6985        }
6986      
6987            redo A;
6988          } elsif ($self->{nc} == 0x0022) { # "
6989            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6990            $self->{ca}->{value} = '';
6991            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6992            
6993        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6994          $self->{line_prev} = $self->{line};
6995          $self->{column_prev} = $self->{column};
6996          $self->{column}++;
6997          $self->{nc}
6998              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6999        } else {
7000          $self->{set_nc}->($self);
7001        }
7002      
7003            redo A;
7004          } elsif ($self->{nc} == 0x0027) { # '
7005            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7006            $self->{ca}->{value} = '';
7007            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7008            
7009        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7010          $self->{line_prev} = $self->{line};
7011          $self->{column_prev} = $self->{column};
7012          $self->{column}++;
7013          $self->{nc}
7014              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7015        } else {
7016          $self->{set_nc}->($self);
7017        }
7018      
7019            redo A;
7020          } elsif ($self->{nc} == 0x003E) { # >
7021            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7022            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7023            
7024        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7025          $self->{line_prev} = $self->{line};
7026          $self->{column_prev} = $self->{column};
7027          $self->{column}++;
7028          $self->{nc}
7029              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7030        } else {
7031          $self->{set_nc}->($self);
7032        }
7033      
7034            return  ($self->{ct}); # ATTLIST
7035            redo A;
7036          } elsif ($self->{nc} == -1) {
7037            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7038            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7039            
7040        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7041          $self->{line_prev} = $self->{line};
7042          $self->{column_prev} = $self->{column};
7043          $self->{column}++;
7044          $self->{nc}
7045              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7046        } else {
7047          $self->{set_nc}->($self);
7048        }
7049      
7050            return  ($self->{ct});
7051            redo A;
7052          } else {
7053            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7054            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7055            ## Reconsume.
7056            redo A;
7057          }
7058        } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7059          if ($is_space->{$self->{nc}}) {
7060            ## Stay in the state.
7061            
7062        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7063          $self->{line_prev} = $self->{line};
7064          $self->{column_prev} = $self->{column};
7065          $self->{column}++;
7066          $self->{nc}
7067              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7068        } else {
7069          $self->{set_nc}->($self);
7070        }
7071      
7072            redo A;
7073          } elsif ($self->{nc} == 0x0023) { # #
7074            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7075            
7076        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7077          $self->{line_prev} = $self->{line};
7078          $self->{column_prev} = $self->{column};
7079          $self->{column}++;
7080          $self->{nc}
7081              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7082        } else {
7083          $self->{set_nc}->($self);
7084        }
7085      
7086            redo A;
7087          } elsif ($self->{nc} == 0x0022) { # "
7088            $self->{ca}->{value} = '';
7089            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7090            
7091        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7092          $self->{line_prev} = $self->{line};
7093          $self->{column_prev} = $self->{column};
7094          $self->{column}++;
7095          $self->{nc}
7096              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7097        } else {
7098          $self->{set_nc}->($self);
7099        }
7100      
7101            redo A;
7102          } elsif ($self->{nc} == 0x0027) { # '
7103            $self->{ca}->{value} = '';
7104            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7105            
7106        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7107          $self->{line_prev} = $self->{line};
7108          $self->{column_prev} = $self->{column};
7109          $self->{column}++;
7110          $self->{nc}
7111              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7112        } else {
7113          $self->{set_nc}->($self);
7114        }
7115      
7116            redo A;
7117          } elsif ($self->{nc} == 0x003E) { # >
7118            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7119            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7120            
7121        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7122          $self->{line_prev} = $self->{line};
7123          $self->{column_prev} = $self->{column};
7124          $self->{column}++;
7125          $self->{nc}
7126              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7127        } else {
7128          $self->{set_nc}->($self);
7129        }
7130      
7131            return  ($self->{ct}); # ATTLIST
7132            redo A;
7133          } elsif ($self->{nc} == -1) {
7134            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7135            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7136            
7137        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7138          $self->{line_prev} = $self->{line};
7139          $self->{column_prev} = $self->{column};
7140          $self->{column}++;
7141          $self->{nc}
7142              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7143        } else {
7144          $self->{set_nc}->($self);
7145        }
7146      
7147            return  ($self->{ct});
7148            redo A;
7149          } else {
7150            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7151            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7152            ## Reconsume.
7153            redo A;
7154          }
7155        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7156          if ($is_space->{$self->{nc}}) {
7157            ## XML5: No parse error.
7158            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7159            $self->{state} = BOGUS_MD_STATE;
7160            ## Reconsume.
7161            redo A;
7162          } elsif ($self->{nc} == 0x0022) { # "
7163            ## XML5: Same as "anything else".
7164            $self->{ca}->{value} = '';
7165            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7166            
7167        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7168          $self->{line_prev} = $self->{line};
7169          $self->{column_prev} = $self->{column};
7170          $self->{column}++;
7171          $self->{nc}
7172              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7173        } else {
7174          $self->{set_nc}->($self);
7175        }
7176      
7177            redo A;
7178          } elsif ($self->{nc} == 0x0027) { # '
7179            ## XML5: Same as "anything else".
7180            $self->{ca}->{value} = '';
7181            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7182            
7183        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7184          $self->{line_prev} = $self->{line};
7185          $self->{column_prev} = $self->{column};
7186          $self->{column}++;
7187          $self->{nc}
7188              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7189        } else {
7190          $self->{set_nc}->($self);
7191        }
7192      
7193            redo A;
7194          } elsif ($self->{nc} == 0x003E) { # >
7195            ## XML5: Same as "anything else".
7196            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7197            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7198            
7199        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7200          $self->{line_prev} = $self->{line};
7201          $self->{column_prev} = $self->{column};
7202          $self->{column}++;
7203          $self->{nc}
7204              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7205        } else {
7206          $self->{set_nc}->($self);
7207        }
7208      
7209            return  ($self->{ct}); # ATTLIST
7210            redo A;
7211          } elsif ($self->{nc} == -1) {
7212            ## XML5: No parse error.
7213            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7214            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7215            
7216        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7217          $self->{line_prev} = $self->{line};
7218          $self->{column_prev} = $self->{column};
7219          $self->{column}++;
7220          $self->{nc}
7221              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7222        } else {
7223          $self->{set_nc}->($self);
7224        }
7225      
7226            return  ($self->{ct});
7227            redo A;
7228          } else {
7229            $self->{ca}->{default} = chr $self->{nc};
7230            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7231            
7232        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7233          $self->{line_prev} = $self->{line};
7234          $self->{column_prev} = $self->{column};
7235          $self->{column}++;
7236          $self->{nc}
7237              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7238        } else {
7239          $self->{set_nc}->($self);
7240        }
7241      
7242            redo A;
7243          }
7244        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7245          if ($is_space->{$self->{nc}}) {
7246            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7247            
7248        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7249          $self->{line_prev} = $self->{line};
7250          $self->{column_prev} = $self->{column};
7251          $self->{column}++;
7252          $self->{nc}
7253              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7254        } else {
7255          $self->{set_nc}->($self);
7256        }
7257      
7258            redo A;
7259          } elsif ($self->{nc} == 0x0022) { # "
7260            ## XML5: Same as "anything else".
7261            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7262            $self->{ca}->{value} = '';
7263            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7264            
7265        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7266          $self->{line_prev} = $self->{line};
7267          $self->{column_prev} = $self->{column};
7268          $self->{column}++;
7269          $self->{nc}
7270              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7271        } else {
7272          $self->{set_nc}->($self);
7273        }
7274      
7275            redo A;
7276          } elsif ($self->{nc} == 0x0027) { # '
7277            ## XML5: Same as "anything else".
7278            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7279            $self->{ca}->{value} = '';
7280            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7281            
7282        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7283          $self->{line_prev} = $self->{line};
7284          $self->{column_prev} = $self->{column};
7285          $self->{column}++;
7286          $self->{nc}
7287              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7288        } else {
7289          $self->{set_nc}->($self);
7290        }
7291      
7292            redo A;
7293          } elsif ($self->{nc} == 0x003E) { # >
7294            ## XML5: Same as "anything else".
7295            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7296            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7297            
7298        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7299          $self->{line_prev} = $self->{line};
7300          $self->{column_prev} = $self->{column};
7301          $self->{column}++;
7302          $self->{nc}
7303              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7304        } else {
7305          $self->{set_nc}->($self);
7306        }
7307      
7308            return  ($self->{ct}); # ATTLIST
7309            redo A;
7310          } elsif ($self->{nc} == -1) {
7311            ## XML5: No parse error.
7312            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7313            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7314            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7315            
7316        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7317          $self->{line_prev} = $self->{line};
7318          $self->{column_prev} = $self->{column};
7319          $self->{column}++;
7320          $self->{nc}
7321              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7322        } else {
7323          $self->{set_nc}->($self);
7324        }
7325      
7326            return  ($self->{ct});
7327            redo A;
7328          } else {
7329            $self->{ca}->{default} .= chr $self->{nc};
7330            ## Stay in the state.
7331            
7332        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7333          $self->{line_prev} = $self->{line};
7334          $self->{column_prev} = $self->{column};
7335          $self->{column}++;
7336          $self->{nc}
7337              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7338        } else {
7339          $self->{set_nc}->($self);
7340        }
7341      
7342            redo A;
7343          }
7344        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7345          if ($is_space->{$self->{nc}}) {
7346            ## Stay in the state.
7347            
7348        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7349          $self->{line_prev} = $self->{line};
7350          $self->{column_prev} = $self->{column};
7351          $self->{column}++;
7352          $self->{nc}
7353              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7354        } else {
7355          $self->{set_nc}->($self);
7356        }
7357      
7358            redo A;
7359          } elsif ($self->{nc} == 0x0022) { # "
7360            $self->{ca}->{value} = '';
7361            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7362            
7363        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7364          $self->{line_prev} = $self->{line};
7365          $self->{column_prev} = $self->{column};
7366          $self->{column}++;
7367          $self->{nc}
7368              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7369        } else {
7370          $self->{set_nc}->($self);
7371        }
7372      
7373            redo A;
7374          } elsif ($self->{nc} == 0x0027) { # '
7375            $self->{ca}->{value} = '';
7376            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7377            
7378        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7379          $self->{line_prev} = $self->{line};
7380          $self->{column_prev} = $self->{column};
7381          $self->{column}++;
7382          $self->{nc}
7383              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7384        } else {
7385          $self->{set_nc}->($self);
7386        }
7387      
7388            redo A;
7389          } elsif ($self->{nc} == 0x003E) { # >
7390            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7391            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7392            
7393        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7394          $self->{line_prev} = $self->{line};
7395          $self->{column_prev} = $self->{column};
7396          $self->{column}++;
7397          $self->{nc}
7398              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7399        } else {
7400          $self->{set_nc}->($self);
7401        }
7402      
7403            return  ($self->{ct}); # ATTLIST
7404            redo A;
7405          } elsif ($self->{nc} == -1) {
7406            ## XML5: No parse error.
7407            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7408            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7409            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7410            
7411        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7412          $self->{line_prev} = $self->{line};
7413          $self->{column_prev} = $self->{column};
7414          $self->{column}++;
7415          $self->{nc}
7416              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7417        } else {
7418          $self->{set_nc}->($self);
7419        }
7420      
7421            return  ($self->{ct});
7422            redo A;
7423          } else {
7424            ## XML5: Not defined yet.
7425            if ($self->{ca}->{default} eq 'FIXED') {
7426              $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7427            } else {
7428              push @{$self->{ct}->{attrdefs}}, $self->{ca};
7429              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7430            }
7431            ## Reconsume.
7432            redo A;
7433          }
7434        } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7435          if ($is_space->{$self->{nc}} or
7436              $self->{nc} == -1 or
7437              $self->{nc} == 0x003E) { # >
7438            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7439            ## Reconsume.
7440            redo A;
7441          } else {
7442            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7443            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7444            ## Reconsume.
7445            redo A;
7446          }
7447        } elsif ($self->{state} == NDATA_STATE) {
7448          ## ASCII case-insensitive
7449          if ($self->{nc} == [
7450                undef,
7451                0x0044, # D
7452                0x0041, # A
7453                0x0054, # T
7454              ]->[length $self->{kwd}] or
7455              $self->{nc} == [
7456                undef,
7457                0x0064, # d
7458                0x0061, # a
7459                0x0074, # t
7460              ]->[length $self->{kwd}]) {
7461            
7462            ## Stay in the state.
7463            $self->{kwd} .= chr $self->{nc};
7464            
7465        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7466          $self->{line_prev} = $self->{line};
7467          $self->{column_prev} = $self->{column};
7468          $self->{column}++;
7469          $self->{nc}
7470              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7471        } else {
7472          $self->{set_nc}->($self);
7473        }
7474      
7475            redo A;
7476          } elsif ((length $self->{kwd}) == 4 and
7477                   ($self->{nc} == 0x0041 or # A
7478                    $self->{nc} == 0x0061)) { # a
7479            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7480              
7481              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7482                              text => 'NDATA',
7483                              line => $self->{line_prev},
7484                              column => $self->{column_prev} - 4);
7485            } else {
7486              
7487            }
7488            $self->{state} = AFTER_NDATA_STATE;
7489            
7490        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7491          $self->{line_prev} = $self->{line};
7492          $self->{column_prev} = $self->{column};
7493          $self->{column}++;
7494          $self->{nc}
7495              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7496        } else {
7497          $self->{set_nc}->($self);
7498        }
7499      
7500            redo A;
7501          } else {
7502            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7503                            line => $self->{line_prev},
7504                            column => $self->{column_prev} + 1
7505                                - length $self->{kwd});
7506            
7507            $self->{state} = BOGUS_MD_STATE;
7508            ## Reconsume.
7509            redo A;
7510          }
7511        } elsif ($self->{state} == AFTER_NDATA_STATE) {
7512          if ($is_space->{$self->{nc}}) {
7513            $self->{state} = BEFORE_NOTATION_NAME_STATE;
7514            
7515        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7516          $self->{line_prev} = $self->{line};
7517          $self->{column_prev} = $self->{column};
7518          $self->{column}++;
7519          $self->{nc}
7520              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7521        } else {
7522          $self->{set_nc}->($self);
7523        }
7524      
7525            redo A;
7526          } elsif ($self->{nc} == 0x003E) { # >
7527            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7528            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7529            
7530        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7531          $self->{line_prev} = $self->{line};
7532          $self->{column_prev} = $self->{column};
7533          $self->{column}++;
7534          $self->{nc}
7535              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7536        } else {
7537          $self->{set_nc}->($self);
7538        }
7539      
7540            return  ($self->{ct}); # ENTITY
7541            redo A;
7542          } elsif ($self->{nc} == -1) {
7543            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7544            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7545            
7546        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7547          $self->{line_prev} = $self->{line};
7548          $self->{column_prev} = $self->{column};
7549          $self->{column}++;
7550          $self->{nc}
7551              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7552        } else {
7553          $self->{set_nc}->($self);
7554        }
7555      
7556            return  ($self->{ct}); # ENTITY
7557            redo A;
7558          } else {
7559            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7560                            line => $self->{line_prev},
7561                            column => $self->{column_prev} + 1
7562                                - length $self->{kwd});
7563            $self->{state} = BOGUS_MD_STATE;
7564            ## Reconsume.
7565            redo A;
7566          }
7567        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7568          if ($is_space->{$self->{nc}}) {
7569            ## Stay in the state.
7570            
7571        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7572          $self->{line_prev} = $self->{line};
7573          $self->{column_prev} = $self->{column};
7574          $self->{column}++;
7575          $self->{nc}
7576              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7577        } else {
7578          $self->{set_nc}->($self);
7579        }
7580      
7581            redo A;
7582          } elsif ($self->{nc} == 0x003E) { # >
7583            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7584            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7585            
7586        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7587          $self->{line_prev} = $self->{line};
7588          $self->{column_prev} = $self->{column};
7589          $self->{column}++;
7590          $self->{nc}
7591              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7592        } else {
7593          $self->{set_nc}->($self);
7594        }
7595      
7596            return  ($self->{ct}); # ENTITY
7597            redo A;
7598          } elsif ($self->{nc} == -1) {
7599            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7600            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7601            
7602        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7603          $self->{line_prev} = $self->{line};
7604          $self->{column_prev} = $self->{column};
7605          $self->{column}++;
7606          $self->{nc}
7607              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7608        } else {
7609          $self->{set_nc}->($self);
7610        }
7611      
7612            return  ($self->{ct}); # ENTITY
7613            redo A;
7614          } else {
7615            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7616            $self->{state} = NOTATION_NAME_STATE;
7617            
7618        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7619          $self->{line_prev} = $self->{line};
7620          $self->{column_prev} = $self->{column};
7621          $self->{column}++;
7622          $self->{nc}
7623              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7624        } else {
7625          $self->{set_nc}->($self);
7626        }
7627      
7628            redo A;
7629          }
7630        } elsif ($self->{state} == NOTATION_NAME_STATE) {
7631          if ($is_space->{$self->{nc}}) {
7632            $self->{state} = AFTER_NOTATION_NAME_STATE;
7633            
7634        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7635          $self->{line_prev} = $self->{line};
7636          $self->{column_prev} = $self->{column};
7637          $self->{column}++;
7638          $self->{nc}
7639              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7640        } else {
7641          $self->{set_nc}->($self);
7642        }
7643      
7644            redo A;
7645          } elsif ($self->{nc} == 0x003E) { # >
7646            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7647            
7648        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7649          $self->{line_prev} = $self->{line};
7650          $self->{column_prev} = $self->{column};
7651          $self->{column}++;
7652          $self->{nc}
7653              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7654        } else {
7655          $self->{set_nc}->($self);
7656        }
7657      
7658            return  ($self->{ct}); # ENTITY
7659            redo A;
7660          } elsif ($self->{nc} == -1) {
7661            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7662            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7663            
7664        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7665          $self->{line_prev} = $self->{line};
7666          $self->{column_prev} = $self->{column};
7667          $self->{column}++;
7668          $self->{nc}
7669              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7670        } else {
7671          $self->{set_nc}->($self);
7672        }
7673      
7674            return  ($self->{ct}); # ENTITY
7675            redo A;
7676          } else {
7677            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7678            ## Stay in the state.
7679            
7680        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7681          $self->{line_prev} = $self->{line};
7682          $self->{column_prev} = $self->{column};
7683          $self->{column}++;
7684          $self->{nc}
7685              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7686        } else {
7687          $self->{set_nc}->($self);
7688        }
7689      
7690            redo A;
7691          }
7692        } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {
7693          if ($is_space->{$self->{nc}}) {
7694            ## Stay in the state.
7695            
7696        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7697          $self->{line_prev} = $self->{line};
7698          $self->{column_prev} = $self->{column};
7699          $self->{column}++;
7700          $self->{nc}
7701              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7702        } else {
7703          $self->{set_nc}->($self);
7704        }
7705      
7706            redo A;
7707          } elsif ($self->{nc} == 0x003E) { # >
7708            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7709            
7710        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7711          $self->{line_prev} = $self->{line};
7712          $self->{column_prev} = $self->{column};
7713          $self->{column}++;
7714          $self->{nc}
7715              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7716        } else {
7717          $self->{set_nc}->($self);
7718        }
7719      
7720            return  ($self->{ct}); # ENTITY
7721            redo A;
7722          } elsif ($self->{nc} == -1) {
7723            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7724            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7725            
7726        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7727          $self->{line_prev} = $self->{line};
7728          $self->{column_prev} = $self->{column};
7729          $self->{column}++;
7730          $self->{nc}
7731              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7732        } else {
7733          $self->{set_nc}->($self);
7734        }
7735      
7736            return  ($self->{ct}); # ENTITY
7737            redo A;
7738          } else {
7739            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after notation name'); ## TODO: type
7740            $self->{state} = BOGUS_MD_STATE;
7741            ## Reconsume.
7742            redo A;
7743          }
7744    
7745    
7746        } elsif ($self->{state} == BOGUS_MD_STATE) {
7747          if ($self->{nc} == 0x003E) { # >
7748            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7749            
7750        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7751          $self->{line_prev} = $self->{line};
7752          $self->{column_prev} = $self->{column};
7753          $self->{column}++;
7754          $self->{nc}
7755              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7756        } else {
7757          $self->{set_nc}->($self);
7758        }
7759      
7760            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7761            redo A;
7762          } elsif ($self->{nc} == -1) {
7763            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7764            ## Reconsume.
7765            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
7766            redo A;
7767          } else {
7768            ## Stay in the state.
7769            
7770        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7771          $self->{line_prev} = $self->{line};
7772          $self->{column_prev} = $self->{column};
7773          $self->{column}++;
7774          $self->{nc}
7775              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7776        } else {
7777          $self->{set_nc}->($self);
7778        }
7779      
7780          redo A;          redo A;
7781        }        }
7782      } else {      } else {
# Line 4115  sub _get_next_token ($) { Line 7789  sub _get_next_token ($) {
7789    
7790  1;  1;
7791  ## $Date$  ## $Date$
7792                                    

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.18

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24