/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1 by wakaba, Tue Oct 14 02:27:58 2008 UTC revision 1.26 by wakaba, Thu Jul 2 21:42:43 2009 UTC
# Line 2  package Whatpm::HTML::Tokenizer; Line 2  package Whatpm::HTML::Tokenizer;
2  use strict;  use strict;
3  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5    BEGIN {
6      require Exporter;
7      push our @ISA, 'Exporter';
8    
9      our @EXPORT_OK = qw(
10        DOCTYPE_TOKEN
11        COMMENT_TOKEN
12        START_TAG_TOKEN
13        END_TAG_TOKEN
14        END_OF_FILE_TOKEN
15        CHARACTER_TOKEN
16        PI_TOKEN
17        ABORT_TOKEN
18        END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24      );
25      
26      our %EXPORT_TAGS = (
27        token => [qw(
28          DOCTYPE_TOKEN
29          COMMENT_TOKEN
30          START_TAG_TOKEN
31          END_TAG_TOKEN
32          END_OF_FILE_TOKEN
33          CHARACTER_TOKEN
34          PI_TOKEN
35          ABORT_TOKEN
36          END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42        )],
43      );
44    }
45    
46    ## NOTE: Differences from the XML5 draft are marked as "XML5:".
47    
48    ## Token types
49    
50    sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
51    sub COMMENT_TOKEN () { 2 }
52    sub START_TAG_TOKEN () { 3 }
53    sub END_TAG_TOKEN () { 4 }
54    sub END_OF_FILE_TOKEN () { 5 }
55    sub CHARACTER_TOKEN () { 6 }
56    sub PI_TOKEN () { 7 } ## NOTE: XML only.
57    sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58    sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65    ## XML5: XML5 has "empty tag token".  In this implementation, it is
66    ## represented as a start tag token with $self->{self_closing} flag
67    ## set to true.
68    
69    ## XML5: XML5 has "short end tag token".  In this implementation, it
70    ## is represented as an end tag token with $token->{tag_name} flag set
71    ## to an empty string.
72    
73  package Whatpm::HTML;  package Whatpm::HTML;
74    
75    BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
76    
77  ## Content model flags  ## Content model flags
78    
79  sub CM_ENTITY () { 0b001 } # & markup in data  sub CM_ENTITY () { 0b001 } # & markup in data
# Line 72  sub HEXREF_HEX_STATE () { 48 } Line 142  sub HEXREF_HEX_STATE () { 48 }
142  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
143  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
144    
145  ## Token types  ## XML-only states
146    sub PI_STATE () { 51 }
147  sub DOCTYPE_TOKEN () { 1 }  sub PI_TARGET_STATE () { 52 }
148  sub COMMENT_TOKEN () { 2 }  sub PI_TARGET_AFTER_STATE () { 53 }
149  sub START_TAG_TOKEN () { 3 }  sub PI_DATA_STATE () { 54 }
150  sub END_TAG_TOKEN () { 4 }  sub PI_AFTER_STATE () { 55 }
151  sub END_OF_FILE_TOKEN () { 5 }  sub PI_DATA_AFTER_STATE () { 56 }
152  sub CHARACTER_TOKEN () { 6 }  sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153    sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154    sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155    sub DOCTYPE_TAG_STATE () { 60 }
156    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157    sub MD_ATTLIST_STATE () { 62 }
158    sub MD_E_STATE () { 63 }
159    sub MD_ELEMENT_STATE () { 64 }
160    sub MD_ENTITY_STATE () { 65 }
161    sub MD_NOTATION_STATE () { 66 }
162    sub DOCTYPE_MD_STATE () { 67 }
163    sub BEFORE_MD_NAME_STATE () { 68 }
164    sub MD_NAME_STATE () { 69 }
165    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
168    sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
169    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
170    sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
171    sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
172    sub ALLOWED_TOKEN_STATE () { 77 }
173    sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
174    sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
175    sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
176    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
177    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
178    sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
179    sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
180    sub BEFORE_NDATA_STATE () { 85 }
181    sub NDATA_STATE () { 86 }
182    sub AFTER_NDATA_STATE () { 87 }
183    sub BEFORE_NOTATION_NAME_STATE () { 88 }
184    sub NOTATION_NAME_STATE () { 89 }
185    sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186    sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187    sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188    sub AFTER_ELEMENT_NAME_STATE () { 93 }
189    sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190    sub CONTENT_KEYWORD_STATE () { 95 }
191    sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192    sub CM_ELEMENT_NAME_STATE () { 97 }
193    sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194    sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195    sub AFTER_MD_DEF_STATE () { 100 }
196    sub BOGUS_MD_STATE () { 101 }
197    
198  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
199  ## list and descriptions)  ## list and descriptions)
# Line 142  sub _initialize_tokenizer ($) { Line 256  sub _initialize_tokenizer ($) {
256    #$self->{level}    #$self->{level}
257    #$self->{set_nc}    #$self->{set_nc}
258    #$self->{parse_error}    #$self->{parse_error}
259      #$self->{is_xml} (if XML)
260    
261    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
262    #$self->{s_kwd}; # state keyword - initialized when used    $self->{s_kwd} = ''; # Data state keyword
263      #$self->{kwd} = ''; # State-dependent keyword; initialized when used
264    #$self->{entity__value}; # initialized when used    #$self->{entity__value}; # initialized when used
265    #$self->{entity__match}; # initialized when used    #$self->{entity__match}; # initialized when used
266    $self->{content_model} = PCDATA_CONTENT_MODEL; # be    $self->{content_model} = PCDATA_CONTENT_MODEL; # be
# Line 174  sub _initialize_tokenizer ($) { Line 290  sub _initialize_tokenizer ($) {
290    
291  ## A token has:  ## A token has:
292  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,  ##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
293  ##       CHARACTER_TOKEN, or END_OF_FILE_TOKEN  ##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
294  ##   ->{name} (DOCTYPE_TOKEN)  ##   ->{name} (DOCTYPE_TOKEN)
295  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)  ##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
296    ##   ->{target} (PI_TOKEN)
297  ##   ->{pubid} (DOCTYPE_TOKEN)  ##   ->{pubid} (DOCTYPE_TOKEN)
298  ##   ->{sysid} (DOCTYPE_TOKEN)  ##   ->{sysid} (DOCTYPE_TOKEN)
299  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag  ##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
# Line 184  sub _initialize_tokenizer ($) { Line 301  sub _initialize_tokenizer ($) {
301  ##        ->{name}  ##        ->{name}
302  ##        ->{value}  ##        ->{value}
303  ##        ->{has_reference} == 1 or 0  ##        ->{has_reference} == 1 or 0
304  ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)  ##        ->{index}: Index of the attribute in a tag.
305    ##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
306    ##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
307    ##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
308    ##   ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
309    
310  ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.  ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
311  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|  ##     |->{self_closing}| is used to save the value of |$self->{self_closing}|
312  ##     while the token is pushed back to the stack.  ##     while the token is pushed back to the stack.
# Line 204  my $is_space = { Line 326  my $is_space = {
326    0x0009 => 1, # CHARACTER TABULATION (HT)    0x0009 => 1, # CHARACTER TABULATION (HT)
327    0x000A => 1, # LINE FEED (LF)    0x000A => 1, # LINE FEED (LF)
328    #0x000B => 0, # LINE TABULATION (VT)    #0x000B => 0, # LINE TABULATION (VT)
329    0x000C => 1, # FORM FEED (FF)    0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
330    #0x000D => 1, # CARRIAGE RETURN (CR)    #0x000D => 1, # CARRIAGE RETURN (CR)
331    0x0020 => 1, # SPACE (SP)    0x0020 => 1, # SPACE (SP)
332  };  };
# Line 328  sub _get_next_token ($) { Line 450  sub _get_next_token ($) {
450          }          }
451        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
452          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA          if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
453            $self->{s_kwd} .= '-';            if ($self->{s_kwd} eq '<!-') {
             
           if ($self->{s_kwd} eq '<!--') {  
454                            
455              $self->{escape} = 1; # unless $self->{escape};              $self->{escape} = 1; # unless $self->{escape};
456              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
457              #              #
458            } elsif ($self->{s_kwd} eq '---') {            } elsif ($self->{s_kwd} eq '-') {
459                            
460              $self->{s_kwd} = '--';              $self->{s_kwd} = '--';
461              #              #
462              } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
463                
464                $self->{s_kwd} .= '-';
465                #
466            } else {            } else {
467                            
468                $self->{s_kwd} = '-';
469              #              #
470            }            }
471          }          }
# Line 386  sub _get_next_token ($) { Line 511  sub _get_next_token ($) {
511            if ($self->{s_kwd} eq '--') {            if ($self->{s_kwd} eq '--') {
512                            
513              delete $self->{escape};              delete $self->{escape};
514                #
515            } else {            } else {
516                            
517                #
518            }            }
519            } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
520              
521              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
522                              line => $self->{line_prev},
523                              column => $self->{column_prev} - 1);
524              #
525          } else {          } else {
526                        
527              #
528          }          }
529                    
530          $self->{s_kwd} = '';          $self->{s_kwd} = '';
531          #          #
532          } elsif ($self->{nc} == 0x005D) { # ]
533            if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
534              
535              $self->{s_kwd} .= ']';
536            } elsif ($self->{s_kwd} eq ']]') {
537              
538              #
539            } else {
540              
541              $self->{s_kwd} = '';
542            }
543            #
544        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
545                    
546          $self->{s_kwd} = '';          $self->{s_kwd} = '';
# Line 412  sub _get_next_token ($) { Line 558  sub _get_next_token ($) {
558                     data => chr $self->{nc},                     data => chr $self->{nc},
559                     line => $self->{line}, column => $self->{column},                     line => $self->{line}, column => $self->{column},
560                    };                    };
561        if ($self->{read_until}->($token->{data}, q[-!<>&],        if ($self->{read_until}->($token->{data}, q{-!<>&\]},
562                                  length $token->{data})) {                                  length $token->{data})) {
563          $self->{s_kwd} = '';          $self->{s_kwd} = '';
564        }        }
565    
566        ## Stay in the data state.        ## Stay in the data state.
567        if ($self->{content_model} == PCDATA_CONTENT_MODEL) {        if (not $self->{is_xml} and
568              $self->{content_model} == PCDATA_CONTENT_MODEL) {
569                    
570          $self->{state} = PCDATA_STATE;          $self->{state} = PCDATA_STATE;
571        } else {        } else {
# Line 439  sub _get_next_token ($) { Line 586  sub _get_next_token ($) {
586        return  ($token);        return  ($token);
587        redo A;        redo A;
588      } elsif ($self->{state} == TAG_OPEN_STATE) {      } elsif ($self->{state} == TAG_OPEN_STATE) {
589          ## XML5: "tag state".
590    
591        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
592          if ($self->{nc} == 0x002F) { # /          if ($self->{nc} == 0x002F) { # /
593                        
# Line 457  sub _get_next_token ($) { Line 606  sub _get_next_token ($) {
606            redo A;            redo A;
607          } elsif ($self->{nc} == 0x0021) { # !          } elsif ($self->{nc} == 0x0021) { # !
608                        
609            $self->{s_kwd} = '<' unless $self->{escape};            $self->{s_kwd} = $self->{escaped} ? '' : '<';
610            #            #
611          } else {          } else {
612                        
613              $self->{s_kwd} = '';
614            #            #
615          }          }
616    
# Line 507  sub _get_next_token ($) { Line 657  sub _get_next_token ($) {
657                        
658            $self->{ct}            $self->{ct}
659              = {type => START_TAG_TOKEN,              = {type => START_TAG_TOKEN,
660                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
661                 line => $self->{line_prev},                 line => $self->{line_prev},
662                 column => $self->{column_prev}};                 column => $self->{column_prev}};
663            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
# Line 549  sub _get_next_token ($) { Line 699  sub _get_next_token ($) {
699                            line => $self->{line_prev},                            line => $self->{line_prev},
700                            column => $self->{column_prev});                            column => $self->{column_prev});
701            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
702              $self->{s_kwd} = '';
703                        
704      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 568  sub _get_next_token ($) { Line 719  sub _get_next_token ($) {
719    
720            redo A;            redo A;
721          } elsif ($self->{nc} == 0x003F) { # ?          } elsif ($self->{nc} == 0x003F) { # ?
722                        if ($self->{is_xml}) {
723            $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',              
724                            line => $self->{line_prev},              $self->{state} = PI_STATE;
725                            column => $self->{column_prev});              
726            $self->{state} = BOGUS_COMMENT_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
727            $self->{ct} = {type => COMMENT_TOKEN, data => '',        $self->{line_prev} = $self->{line};
728                                      line => $self->{line_prev},        $self->{column_prev} = $self->{column};
729                                      column => $self->{column_prev},        $self->{column}++;
730                                     };        $self->{nc}
731            ## $self->{nc} is intentionally left as is            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
732            redo A;      } else {
733          } else {        $self->{set_nc}->($self);
734        }
735      
736                redo A;
737              } else {
738                
739                $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
740                                line => $self->{line_prev},
741                                column => $self->{column_prev});
742                $self->{state} = BOGUS_COMMENT_STATE;
743                $self->{ct} = {type => COMMENT_TOKEN, data => '',
744                               line => $self->{line_prev},
745                               column => $self->{column_prev},
746                              };
747                ## $self->{nc} is intentionally left as is
748                redo A;
749              }
750            } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
751                        
752            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
753                            line => $self->{line_prev},                            line => $self->{line_prev},
754                            column => $self->{column_prev});                            column => $self->{column_prev});
755            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
756              $self->{s_kwd} = '';
757            ## reconsume            ## reconsume
758    
759            return  ({type => CHARACTER_TOKEN, data => '<',            return  ({type => CHARACTER_TOKEN, data => '<',
# Line 593  sub _get_next_token ($) { Line 762  sub _get_next_token ($) {
762                     });                     });
763    
764            redo A;            redo A;
765            } else {
766              ## XML5: "<:" is a parse error.
767              
768              $self->{ct} = {type => START_TAG_TOKEN,
769                                        tag_name => chr ($self->{nc}),
770                                        line => $self->{line_prev},
771                                        column => $self->{column_prev}};
772              $self->{state} = TAG_NAME_STATE;
773              
774        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775          $self->{line_prev} = $self->{line};
776          $self->{column_prev} = $self->{column};
777          $self->{column}++;
778          $self->{nc}
779              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780        } else {
781          $self->{set_nc}->($self);
782        }
783      
784              redo A;
785          }          }
786        } else {        } else {
787          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
# Line 601  sub _get_next_token ($) { Line 790  sub _get_next_token ($) {
790        ## NOTE: The "close tag open state" in the spec is implemented as        ## NOTE: The "close tag open state" in the spec is implemented as
791        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.        ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
792    
793          ## XML5: "end tag state".
794    
795        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"        my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
796        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
797          if (defined $self->{last_stag_name}) {          if (defined $self->{last_stag_name}) {
798            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;            $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
799            $self->{s_kwd} = '';            $self->{kwd} = '';
800            ## Reconsume.            ## Reconsume.
801            redo A;            redo A;
802          } else {          } else {
# Line 613  sub _get_next_token ($) { Line 804  sub _get_next_token ($) {
804            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.            ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
805                        
806            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
807              $self->{s_kwd} = '';
808            ## Reconsume.            ## Reconsume.
809            return  ({type => CHARACTER_TOKEN, data => '</',            return  ({type => CHARACTER_TOKEN, data => '</',
810                      line => $l, column => $c,                      line => $l, column => $c,
# Line 626  sub _get_next_token ($) { Line 818  sub _get_next_token ($) {
818                    
819          $self->{ct}          $self->{ct}
820              = {type => END_TAG_TOKEN,              = {type => END_TAG_TOKEN,
821                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
822                 line => $l, column => $c};                 line => $l, column => $c};
823          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
824                    
# Line 661  sub _get_next_token ($) { Line 853  sub _get_next_token ($) {
853        
854          redo A;          redo A;
855        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
856          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
857                          line => $self->{line_prev}, ## "<" in "</>"                          line => $self->{line_prev}, ## "<" in "</>"
858                          column => $self->{column_prev} - 1);                          column => $self->{column_prev} - 1);
859          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
860                    $self->{s_kwd} = '';
861            if ($self->{is_xml}) {
862              
863              ## XML5: No parse error.
864              
865              ## NOTE: This parser raises a parse error, since it supports
866              ## XML1, not XML5.
867    
868              ## NOTE: A short end tag token.
869              my $ct = {type => END_TAG_TOKEN,
870                        tag_name => '',
871                        line => $self->{line_prev},
872                        column => $self->{column_prev} - 1,
873                       };
874              
875      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
876        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
877        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 677  sub _get_next_token ($) { Line 882  sub _get_next_token ($) {
882        $self->{set_nc}->($self);        $self->{set_nc}->($self);
883      }      }
884        
885              return  ($ct);
886            } else {
887              
888              
889        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
890          $self->{line_prev} = $self->{line};
891          $self->{column_prev} = $self->{column};
892          $self->{column}++;
893          $self->{nc}
894              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
895        } else {
896          $self->{set_nc}->($self);
897        }
898      
899            }
900          redo A;          redo A;
901        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
902                    
903          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
904            $self->{s_kwd} = '';
905          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
906          # reconsume          # reconsume
907    
# Line 689  sub _get_next_token ($) { Line 910  sub _get_next_token ($) {
910                   });                   });
911    
912          redo A;          redo A;
913        } else {        } elsif (not $self->{is_xml} or
914                   $is_space->{$self->{nc}}) {
915                    
916          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
917                            line => $self->{line_prev}, # "<" of "</"
918                            column => $self->{column_prev} - 1);
919          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
920          $self->{ct} = {type => COMMENT_TOKEN, data => '',          $self->{ct} = {type => COMMENT_TOKEN, data => '',
921                                    line => $self->{line_prev}, # "<" of "</"                                    line => $self->{line_prev}, # "<" of "</"
# Line 704  sub _get_next_token ($) { Line 928  sub _get_next_token ($) {
928          ## generated from the bogus end tag, as defined in the          ## generated from the bogus end tag, as defined in the
929          ## "bogus comment state" entry.          ## "bogus comment state" entry.
930          redo A;          redo A;
931          } else {
932            ## XML5: "</:" is a parse error.
933            
934            $self->{ct} = {type => END_TAG_TOKEN,
935                           tag_name => chr ($self->{nc}),
936                           line => $l, column => $c};
937            $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
938            
939        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940          $self->{line_prev} = $self->{line};
941          $self->{column_prev} = $self->{column};
942          $self->{column}++;
943          $self->{nc}
944              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945        } else {
946          $self->{set_nc}->($self);
947        }
948      
949            redo A;
950        }        }
951      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {      } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
952        my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;        my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
953        if (length $ch) {        if (length $ch) {
954          my $CH = $ch;          my $CH = $ch;
955          $ch =~ tr/a-z/A-Z/;          $ch =~ tr/a-z/A-Z/;
# Line 714  sub _get_next_token ($) { Line 957  sub _get_next_token ($) {
957          if ($nch eq $ch or $nch eq $CH) {          if ($nch eq $ch or $nch eq $CH) {
958                        
959            ## Stay in the state.            ## Stay in the state.
960            $self->{s_kwd} .= $nch;            $self->{kwd} .= $nch;
961                        
962      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 730  sub _get_next_token ($) { Line 973  sub _get_next_token ($) {
973          } else {          } else {
974                        
975            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
976              $self->{s_kwd} = '';
977            ## Reconsume.            ## Reconsume.
978            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
979                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
980                      line => $self->{line_prev},                      line => $self->{line_prev},
981                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
982                     });                     });
983            redo A;            redo A;
984          }          }
# Line 748  sub _get_next_token ($) { Line 992  sub _get_next_token ($) {
992                        
993            ## Reconsume.            ## Reconsume.
994            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
995              $self->{s_kwd} = '';
996            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
997                      data => '</' . $self->{s_kwd},                      data => '</' . $self->{kwd},
998                      line => $self->{line_prev},                      line => $self->{line_prev},
999                      column => $self->{column_prev} - 1 - length $self->{s_kwd},                      column => $self->{column_prev} - 1 - length $self->{kwd},
1000                     });                     });
1001            redo A;            redo A;
1002          } else {          } else {
# Line 760  sub _get_next_token ($) { Line 1005  sub _get_next_token ($) {
1005                = {type => END_TAG_TOKEN,                = {type => END_TAG_TOKEN,
1006                   tag_name => $self->{last_stag_name},                   tag_name => $self->{last_stag_name},
1007                   line => $self->{line_prev},                   line => $self->{line_prev},
1008                   column => $self->{column_prev} - 1 - length $self->{s_kwd}};                   column => $self->{column_prev} - 1 - length $self->{kwd}};
1009            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
1010            ## Reconsume.            ## Reconsume.
1011            redo A;            redo A;
# Line 799  sub _get_next_token ($) { Line 1044  sub _get_next_token ($) {
1044            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1045          }          }
1046          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1047            $self->{s_kwd} = '';
1048                    
1049      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1050        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 817  sub _get_next_token ($) { Line 1063  sub _get_next_token ($) {
1063        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
1064                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1065                    
1066          $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);          $self->{ct}->{tag_name}
1067                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1068            # start tag or end tag            # start tag or end tag
1069          ## Stay in this state          ## Stay in this state
1070                    
# Line 850  sub _get_next_token ($) { Line 1097  sub _get_next_token ($) {
1097            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1098          }          }
1099          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1100            $self->{s_kwd} = '';
1101          # reconsume          # reconsume
1102    
1103          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 889  sub _get_next_token ($) { Line 1137  sub _get_next_token ($) {
1137          redo A;          redo A;
1138        }        }
1139      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1140          ## XML5: "Tag attribute name before state".
1141    
1142        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1143                    
1144          ## Stay in the state          ## Stay in the state
# Line 920  sub _get_next_token ($) { Line 1170  sub _get_next_token ($) {
1170            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1171          }          }
1172          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1173            $self->{s_kwd} = '';
1174                    
1175      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1176        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 939  sub _get_next_token ($) { Line 1190  sub _get_next_token ($) {
1190                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1191                    
1192          $self->{ca}          $self->{ca}
1193              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1194                 value => '',                 value => '',
1195                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1196          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 987  sub _get_next_token ($) { Line 1238  sub _get_next_token ($) {
1238            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1239          }          }
1240          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1241            $self->{s_kwd} = '';
1242          # reconsume          # reconsume
1243    
1244          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 999  sub _get_next_token ($) { Line 1251  sub _get_next_token ($) {
1251               0x003D => 1, # =               0x003D => 1, # =
1252              }->{$self->{nc}}) {              }->{$self->{nc}}) {
1253                        
1254              ## XML5: Not a parse error.
1255            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1256          } else {          } else {
1257                        
1258              ## XML5: ":" raises a parse error and is ignored.
1259          }          }
1260          $self->{ca}          $self->{ca}
1261              = {name => chr ($self->{nc}),              = {name => chr ($self->{nc}),
# Line 1022  sub _get_next_token ($) { Line 1276  sub _get_next_token ($) {
1276          redo A;          redo A;
1277        }        }
1278      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1279          ## XML5: "Tag attribute name state".
1280    
1281        my $before_leave = sub {        my $before_leave = sub {
1282          if (exists $self->{ct}->{attributes} # start tag or end tag          if (exists $self->{ct}->{attributes} # start tag or end tag
1283              ->{$self->{ca}->{name}}) { # MUST              ->{$self->{ca}->{name}}) { # MUST
# Line 1032  sub _get_next_token ($) { Line 1288  sub _get_next_token ($) {
1288                        
1289            $self->{ct}->{attributes}->{$self->{ca}->{name}}            $self->{ct}->{attributes}->{$self->{ca}->{name}}
1290              = $self->{ca};              = $self->{ca};
1291              $self->{ca}->{index} = ++$self->{ct}->{last_index};
1292          }          }
1293        }; # $before_leave        }; # $before_leave
1294    
# Line 1068  sub _get_next_token ($) { Line 1325  sub _get_next_token ($) {
1325        
1326          redo A;          redo A;
1327        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1328            if ($self->{is_xml}) {
1329              
1330              ## XML5: Not a parse error.
1331              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1332            } else {
1333              
1334            }
1335    
1336          $before_leave->();          $before_leave->();
1337          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1338                        
# Line 1082  sub _get_next_token ($) { Line 1347  sub _get_next_token ($) {
1347            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1348          }          }
1349          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1350            $self->{s_kwd} = '';
1351                    
1352      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1353        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1100  sub _get_next_token ($) { Line 1366  sub _get_next_token ($) {
1366        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
1367                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1368                    
1369          $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);          $self->{ca}->{name}
1370                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1371          ## Stay in the state          ## Stay in the state
1372                    
1373      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1115  sub _get_next_token ($) { Line 1382  sub _get_next_token ($) {
1382        
1383          redo A;          redo A;
1384        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1385            if ($self->{is_xml}) {
1386              
1387              ## XML5: Not a parse error.
1388              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1389            } else {
1390              
1391            }
1392                    
1393          $before_leave->();          $before_leave->();
1394          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
# Line 1149  sub _get_next_token ($) { Line 1423  sub _get_next_token ($) {
1423            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1424          }          }
1425          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1426            $self->{s_kwd} = '';
1427          # reconsume          # reconsume
1428    
1429          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
# Line 1158  sub _get_next_token ($) { Line 1433  sub _get_next_token ($) {
1433          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1434              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1435                        
1436              ## XML5: Not a parse error.
1437            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1438          } else {          } else {
1439                        
# Line 1178  sub _get_next_token ($) { Line 1454  sub _get_next_token ($) {
1454          redo A;          redo A;
1455        }        }
1456      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {      } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1457          ## XML5: "Tag attribute name after state".
1458          
1459        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1460                    
1461          ## Stay in the state          ## Stay in the state
# Line 1209  sub _get_next_token ($) { Line 1487  sub _get_next_token ($) {
1487        
1488          redo A;          redo A;
1489        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
1490            if ($self->{is_xml}) {
1491              
1492              ## XML5: Not a parse error.
1493              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1494            } else {
1495              
1496            }
1497    
1498          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1499                        
1500            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
# Line 1225  sub _get_next_token ($) { Line 1511  sub _get_next_token ($) {
1511            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1512          }          }
1513          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1514            $self->{s_kwd} = '';
1515                    
1516      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1517        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1244  sub _get_next_token ($) { Line 1531  sub _get_next_token ($) {
1531                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1532                    
1533          $self->{ca}          $self->{ca}
1534              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1535                 value => '',                 value => '',
1536                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1537          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 1261  sub _get_next_token ($) { Line 1548  sub _get_next_token ($) {
1548        
1549          redo A;          redo A;
1550        } elsif ($self->{nc} == 0x002F) { # /        } elsif ($self->{nc} == 0x002F) { # /
1551            if ($self->{is_xml}) {
1552              
1553              ## XML5: Not a parse error.
1554              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1555            } else {
1556              
1557            }
1558                    
1559          $self->{state} = SELF_CLOSING_START_TAG_STATE;          $self->{state} = SELF_CLOSING_START_TAG_STATE;
1560                    
# Line 1292  sub _get_next_token ($) { Line 1586  sub _get_next_token ($) {
1586          } else {          } else {
1587            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1588          }          }
1589            $self->{s_kwd} = '';
1590          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1591          # reconsume          # reconsume
1592    
# Line 1299  sub _get_next_token ($) { Line 1594  sub _get_next_token ($) {
1594    
1595          redo A;          redo A;
1596        } else {        } else {
1597            if ($self->{is_xml}) {
1598              
1599              ## XML5: Not a parse error.
1600              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr value'); ## TODO: type
1601            } else {
1602              
1603            }
1604    
1605          if ($self->{nc} == 0x0022 or # "          if ($self->{nc} == 0x0022 or # "
1606              $self->{nc} == 0x0027) { # '              $self->{nc} == 0x0027) { # '
1607                        
1608              ## XML5: Not a parse error.
1609            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1610          } else {          } else {
1611                        
# Line 1325  sub _get_next_token ($) { Line 1629  sub _get_next_token ($) {
1629          redo A;                  redo A;        
1630        }        }
1631      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {      } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1632          ## XML5: "Tag attribute value before state".
1633    
1634        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
1635                    
1636          ## Stay in the state          ## Stay in the state
# Line 1393  sub _get_next_token ($) { Line 1699  sub _get_next_token ($) {
1699            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1700          }          }
1701          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1702            $self->{s_kwd} = '';
1703                    
1704      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1426  sub _get_next_token ($) { Line 1733  sub _get_next_token ($) {
1733            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1734          }          }
1735          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1736            $self->{s_kwd} = '';
1737          ## reconsume          ## reconsume
1738    
1739          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
1740    
1741          redo A;          redo A;
1742        } else {        } else {
1743          if ($self->{nc} == 0x003D) { # =          if ($self->{nc} == 0x003D or $self->{nc} == 0x003C) { # =, <
1744                        
1745              ## XML5: Not a parse error.
1746            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1747            } elsif ($self->{is_xml}) {
1748              
1749              ## XML5: No parse error.
1750              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO
1751          } else {          } else {
1752                        
1753          }          }
# Line 1454  sub _get_next_token ($) { Line 1767  sub _get_next_token ($) {
1767          redo A;          redo A;
1768        }        }
1769      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1770          ## XML5: "Tag attribute value double quoted state" and "DOCTYPE
1771          ## ATTLIST attribute value double quoted state".
1772          
1773        if ($self->{nc} == 0x0022) { # "        if ($self->{nc} == 0x0022) { # "
1774                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1775          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1776              ## XML5: "DOCTYPE ATTLIST name after state".
1777              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1778              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1779            } else {
1780              
1781              ## XML5: "Tag attribute name before state".
1782              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1783            }
1784                    
1785      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1786        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1471  sub _get_next_token ($) { Line 1795  sub _get_next_token ($) {
1795          redo A;          redo A;
1796        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1797                    
1798            ## XML5: Not defined yet.
1799    
1800          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1801          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1802          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1490  sub _get_next_token ($) { Line 1816  sub _get_next_token ($) {
1816      }      }
1817        
1818          redo A;          redo A;
1819          } elsif ($self->{is_xml} and
1820                   $is_space->{$self->{nc}}) {
1821            
1822            $self->{ca}->{value} .= ' ';
1823            ## Stay in the state.
1824            
1825        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1826          $self->{line_prev} = $self->{line};
1827          $self->{column_prev} = $self->{column};
1828          $self->{column}++;
1829          $self->{nc}
1830              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1831        } else {
1832          $self->{set_nc}->($self);
1833        }
1834      
1835            redo A;
1836        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1837          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1838          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1839                        
1840            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1841    
1842              $self->{state} = DATA_STATE;
1843              $self->{s_kwd} = '';
1844              ## reconsume
1845              return  ($self->{ct}); # start tag
1846              redo A;
1847          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1848            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1849            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1504  sub _get_next_token ($) { Line 1853  sub _get_next_token ($) {
1853              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1854                            
1855            }            }
1856    
1857              $self->{state} = DATA_STATE;
1858              $self->{s_kwd} = '';
1859              ## reconsume
1860              return  ($self->{ct}); # end tag
1861              redo A;
1862            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1863              ## XML5: No parse error above; not defined yet.
1864              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1865              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1866              ## Reconsume.
1867              return  ($self->{ct}); # ATTLIST
1868              redo A;
1869          } else {          } else {
1870            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
1871          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
1872        } else {        } else {
1873                    ## XML5 [ATTLIST]: Not defined yet.
1874            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1875              
1876              ## XML5: Not a parse error.
1877              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
1878            } else {
1879              
1880            }
1881          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
1882          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
1883                                q["&],                                qq["&<\x09\x0C\x20],
1884                                length $self->{ca}->{value});                                length $self->{ca}->{value});
1885    
1886          ## Stay in the state          ## Stay in the state
# Line 1535  sub _get_next_token ($) { Line 1898  sub _get_next_token ($) {
1898          redo A;          redo A;
1899        }        }
1900      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1901          ## XML5: "Tag attribute value single quoted state" and "DOCTYPE
1902          ## ATTLIST attribute value single quoted state".
1903    
1904        if ($self->{nc} == 0x0027) { # '        if ($self->{nc} == 0x0027) { # '
1905                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
1906          $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;            
1907              ## XML5: "DOCTYPE ATTLIST name after state".
1908              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1909              $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE;
1910            } else {
1911              
1912              ## XML5: "Before attribute name state" (sic).
1913              $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1914            }
1915                    
1916      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1917        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1552  sub _get_next_token ($) { Line 1926  sub _get_next_token ($) {
1926          redo A;          redo A;
1927        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
1928                    
1929            ## XML5: Not defined yet.
1930    
1931          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
1932          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
1933          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1571  sub _get_next_token ($) { Line 1947  sub _get_next_token ($) {
1947      }      }
1948        
1949          redo A;          redo A;
1950          } elsif ($self->{is_xml} and
1951                   $is_space->{$self->{nc}}) {
1952            
1953            $self->{ca}->{value} .= ' ';
1954            ## Stay in the state.
1955            
1956        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1957          $self->{line_prev} = $self->{line};
1958          $self->{column_prev} = $self->{column};
1959          $self->{column}++;
1960          $self->{nc}
1961              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1962        } else {
1963          $self->{set_nc}->($self);
1964        }
1965      
1966            redo A;
1967        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
1968          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1969          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
1970                        
1971            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
1972    
1973              $self->{state} = DATA_STATE;
1974              $self->{s_kwd} = '';
1975              ## reconsume
1976              return  ($self->{ct}); # start tag
1977              redo A;
1978          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1979            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1980            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1585  sub _get_next_token ($) { Line 1984  sub _get_next_token ($) {
1984              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
1985                            
1986            }            }
1987    
1988              $self->{state} = DATA_STATE;
1989              $self->{s_kwd} = '';
1990              ## reconsume
1991              return  ($self->{ct}); # end tag
1992              redo A;
1993            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
1994              ## XML5: No parse error above; not defined yet.
1995              push @{$self->{ct}->{attrdefs}}, $self->{ca};
1996              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1997              ## Reconsume.
1998              return  ($self->{ct}); # ATTLIST
1999              redo A;
2000          } else {          } else {
2001            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2002          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
2003        } else {        } else {
2004                    ## XML5 [ATTLIST]: Not defined yet.
2005            if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
2006              
2007              ## XML5: Not a parse error.
2008              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type
2009            } else {
2010              
2011            }
2012          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2013          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2014                                q['&],                                qq['&<\x09\x0C\x20],
2015                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2016    
2017          ## Stay in the state          ## Stay in the state
# Line 1616  sub _get_next_token ($) { Line 2029  sub _get_next_token ($) {
2029          redo A;          redo A;
2030        }        }
2031      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {      } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
2032          ## XML5: "Tag attribute value unquoted state".
2033    
2034        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
2035                    if ($self->{ct}->{type} == ATTLIST_TOKEN) {
2036          $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;            
2037              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2038              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
2039            } else {
2040              
2041              ## XML5: "Tag attribute name before state".
2042              $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2043            }
2044                    
2045      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2046        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1633  sub _get_next_token ($) { Line 2055  sub _get_next_token ($) {
2055          redo A;          redo A;
2056        } elsif ($self->{nc} == 0x0026) { # &        } elsif ($self->{nc} == 0x0026) { # &
2057                    
2058    
2059            ## XML5: Not defined yet.
2060    
2061          ## NOTE: In the spec, the tokenizer is switched to the          ## NOTE: In the spec, the tokenizer is switched to the
2062          ## "entity in attribute value state".  In this implementation, the          ## "entity in attribute value state".  In this implementation, the
2063          ## tokenizer is switched to the |ENTITY_STATE|, which is an          ## tokenizer is switched to the |ENTITY_STATE|, which is an
# Line 1656  sub _get_next_token ($) { Line 2081  sub _get_next_token ($) {
2081          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2082                        
2083            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2084    
2085              $self->{state} = DATA_STATE;
2086              $self->{s_kwd} = '';
2087              
2088        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2089          $self->{line_prev} = $self->{line};
2090          $self->{column_prev} = $self->{column};
2091          $self->{column}++;
2092          $self->{nc}
2093              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2094        } else {
2095          $self->{set_nc}->($self);
2096        }
2097      
2098              return  ($self->{ct}); # start tag
2099              redo A;
2100          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2101            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2102            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
# Line 1665  sub _get_next_token ($) { Line 2106  sub _get_next_token ($) {
2106              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2107                            
2108            }            }
2109          } else {  
2110            die "$0: $self->{ct}->{type}: Unknown token type";            $self->{state} = DATA_STATE;
2111          }            $self->{s_kwd} = '';
2112          $self->{state} = DATA_STATE;            
           
2113      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2114        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
2115        $self->{column_prev} = $self->{column};        $self->{column_prev} = $self->{column};
# Line 1680  sub _get_next_token ($) { Line 2120  sub _get_next_token ($) {
2120        $self->{set_nc}->($self);        $self->{set_nc}->($self);
2121      }      }
2122        
2123              return  ($self->{ct}); # end tag
2124          return  ($self->{ct}); # start tag or end tag            redo A;
2125            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2126          redo A;            push @{$self->{ct}->{attrdefs}}, $self->{ca};
2127              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2128              
2129        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2130          $self->{line_prev} = $self->{line};
2131          $self->{column_prev} = $self->{column};
2132          $self->{column}++;
2133          $self->{nc}
2134              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2135        } else {
2136          $self->{set_nc}->($self);
2137        }
2138      
2139              return  ($self->{ct}); # ATTLIST
2140              redo A;
2141            } else {
2142              die "$0: $self->{ct}->{type}: Unknown token type";
2143            }
2144        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');  
2145          if ($self->{ct}->{type} == START_TAG_TOKEN) {          if ($self->{ct}->{type} == START_TAG_TOKEN) {
2146                        
2147              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2148            $self->{last_stag_name} = $self->{ct}->{tag_name};            $self->{last_stag_name} = $self->{ct}->{tag_name};
2149    
2150              $self->{state} = DATA_STATE;
2151              $self->{s_kwd} = '';
2152              ## reconsume
2153              return  ($self->{ct}); # start tag
2154              redo A;
2155          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {          } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2156              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2157            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST            $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2158            if ($self->{ct}->{attributes}) {            if ($self->{ct}->{attributes}) {
2159                            
# Line 1698  sub _get_next_token ($) { Line 2162  sub _get_next_token ($) {
2162              ## NOTE: This state should never be reached.              ## NOTE: This state should never be reached.
2163                            
2164            }            }
2165    
2166              $self->{state} = DATA_STATE;
2167              $self->{s_kwd} = '';
2168              ## reconsume
2169              return  ($self->{ct}); # end tag
2170              redo A;
2171            } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) {
2172              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
2173              push @{$self->{ct}->{attrdefs}}, $self->{ca};
2174              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2175              ## Reconsume.
2176              return  ($self->{ct}); # ATTLIST
2177              redo A;
2178          } else {          } else {
2179            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2180          }          }
         $self->{state} = DATA_STATE;  
         ## reconsume  
   
         return  ($self->{ct}); # start tag or end tag  
   
         redo A;  
2181        } else {        } else {
2182          if ({          if ({
2183               0x0022 => 1, # "               0x0022 => 1, # "
2184               0x0027 => 1, # '               0x0027 => 1, # '
2185               0x003D => 1, # =               0x003D => 1, # =
2186                 0x003C => 1, # <
2187              }->{$self->{nc}}) {              }->{$self->{nc}}) {
2188                        
2189              ## XML5: Not a parse error.
2190            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
2191          } else {          } else {
2192                        
2193          }          }
2194          $self->{ca}->{value} .= chr ($self->{nc});          $self->{ca}->{value} .= chr ($self->{nc});
2195          $self->{read_until}->($self->{ca}->{value},          $self->{read_until}->($self->{ca}->{value},
2196                                q["'=& >],                                qq["'=& \x09\x0C>],
2197                                length $self->{ca}->{value});                                length $self->{ca}->{value});
2198    
2199          ## Stay in the state          ## Stay in the state
# Line 1770  sub _get_next_token ($) { Line 2243  sub _get_next_token ($) {
2243            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2244          }          }
2245          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2246            $self->{s_kwd} = '';
2247                    
2248      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2249        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1817  sub _get_next_token ($) { Line 2291  sub _get_next_token ($) {
2291            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2292          }          }
2293          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2294            $self->{s_kwd} = '';
2295          ## Reconsume.          ## Reconsume.
2296          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
2297          redo A;          redo A;
# Line 1828  sub _get_next_token ($) { Line 2303  sub _get_next_token ($) {
2303          redo A;          redo A;
2304        }        }
2305      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {      } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2306          ## XML5: "Empty tag state".
2307    
2308        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2309          if ($self->{ct}->{type} == END_TAG_TOKEN) {          if ($self->{ct}->{type} == END_TAG_TOKEN) {
2310                        
# Line 1847  sub _get_next_token ($) { Line 2324  sub _get_next_token ($) {
2324          }          }
2325    
2326          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2327            $self->{s_kwd} = '';
2328                    
2329      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2330        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1878  sub _get_next_token ($) { Line 2356  sub _get_next_token ($) {
2356          } else {          } else {
2357            die "$0: $self->{ct}->{type}: Unknown token type";            die "$0: $self->{ct}->{type}: Unknown token type";
2358          }          }
2359            ## XML5: "Tag attribute name before state".
2360          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
2361            $self->{s_kwd} = '';
2362          ## Reconsume.          ## Reconsume.
2363          return  ($self->{ct}); # start tag or end tag          return  ($self->{ct}); # start tag or end tag
2364          redo A;          redo A;
# Line 1891  sub _get_next_token ($) { Line 2371  sub _get_next_token ($) {
2371          redo A;          redo A;
2372        }        }
2373      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2374        ## (only happen if PCDATA state)        ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2375    
2376        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
2377        ## consumes characters one-by-one basis.        ## consumes characters one-by-one basis.
2378                
2379        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2380                    if ($self->{in_subset}) {
2381          $self->{state} = DATA_STATE;            
2382              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2383            } else {
2384              
2385              $self->{state} = DATA_STATE;
2386              $self->{s_kwd} = '';
2387            }
2388                    
2389      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2390        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1914  sub _get_next_token ($) { Line 2400  sub _get_next_token ($) {
2400          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
2401          redo A;          redo A;
2402        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
2403                    if ($self->{in_subset}) {
2404          $self->{state} = DATA_STATE;            
2405              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2406            } else {
2407              
2408              $self->{state} = DATA_STATE;
2409              $self->{s_kwd} = '';
2410            }
2411          ## reconsume          ## reconsume
2412    
2413          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 1942  sub _get_next_token ($) { Line 2434  sub _get_next_token ($) {
2434          redo A;          redo A;
2435        }        }
2436      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2437        ## (only happen if PCDATA state)        ## XML5: "Markup declaration state".
2438                
2439        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2440                    
# Line 1964  sub _get_next_token ($) { Line 2456  sub _get_next_token ($) {
2456          ## ASCII case-insensitive.          ## ASCII case-insensitive.
2457                    
2458          $self->{state} = MD_DOCTYPE_STATE;          $self->{state} = MD_DOCTYPE_STATE;
2459          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
2460                    
2461      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2462        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 1977  sub _get_next_token ($) { Line 2469  sub _get_next_token ($) {
2469      }      }
2470        
2471          redo A;          redo A;
2472        } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2473                 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and                   $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2474                    $self->{is_xml}) and
2475                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2476                                                    
2477          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;
2478          $self->{s_kwd} = '[';          $self->{kwd} = '[';
2479                    
2480      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2481        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2016  sub _get_next_token ($) { Line 2509  sub _get_next_token ($) {
2509                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2510                                    column => $self->{column_prev} - 2,                                    column => $self->{column_prev} - 2,
2511                                   };                                   };
2512          $self->{state} = COMMENT_START_STATE;          $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2513                    
2514      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2515        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2052  sub _get_next_token ($) { Line 2545  sub _get_next_token ($) {
2545              0x0054, # T              0x0054, # T
2546              0x0059, # Y              0x0059, # Y
2547              0x0050, # P              0x0050, # P
2548            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
2549            $self->{nc} == [            $self->{nc} == [
2550              undef,              undef,
2551              0x006F, # o              0x006F, # o
# Line 2060  sub _get_next_token ($) { Line 2553  sub _get_next_token ($) {
2553              0x0074, # t              0x0074, # t
2554              0x0079, # y              0x0079, # y
2555              0x0070, # p              0x0070, # p
2556            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
2557                    
2558          ## Stay in the state.          ## Stay in the state.
2559          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2560                    
2561      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2562        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2076  sub _get_next_token ($) { Line 2569  sub _get_next_token ($) {
2569      }      }
2570        
2571          redo A;          redo A;
2572        } elsif ((length $self->{s_kwd}) == 6 and        } elsif ((length $self->{kwd}) == 6 and
2573                 ($self->{nc} == 0x0045 or # E                 ($self->{nc} == 0x0045 or # E
2574                  $self->{nc} == 0x0065)) { # e                  $self->{nc} == 0x0065)) { # e
2575                    if ($self->{is_xml} and
2576                ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
2577              
2578              ## XML5: case-sensitive.
2579              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2580                              text => 'DOCTYPE',
2581                              line => $self->{line_prev},
2582                              column => $self->{column_prev} - 5);
2583            } else {
2584              
2585            }
2586          $self->{state} = DOCTYPE_STATE;          $self->{state} = DOCTYPE_STATE;
2587          $self->{ct} = {type => DOCTYPE_TOKEN,          $self->{ct} = {type => DOCTYPE_TOKEN,
2588                                    quirks => 1,                                    quirks => 1,
# Line 2102  sub _get_next_token ($) { Line 2605  sub _get_next_token ($) {
2605                                    
2606          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2607                          line => $self->{line_prev},                          line => $self->{line_prev},
2608                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2609          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2610          ## Reconsume.          ## Reconsume.
2611          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2612                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2613                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2614                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2615                                   };                                   };
2616          redo A;          redo A;
2617        }        }
# Line 2119  sub _get_next_token ($) { Line 2622  sub _get_next_token ($) {
2622              '[CD' => 0x0041, # A              '[CD' => 0x0041, # A
2623              '[CDA' => 0x0054, # T              '[CDA' => 0x0054, # T
2624              '[CDAT' => 0x0041, # A              '[CDAT' => 0x0041, # A
2625            }->{$self->{s_kwd}}) {            }->{$self->{kwd}}) {
2626                    
2627          ## Stay in the state.          ## Stay in the state.
2628          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
2629                    
2630      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2631        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2135  sub _get_next_token ($) { Line 2638  sub _get_next_token ($) {
2638      }      }
2639        
2640          redo A;          redo A;
2641        } elsif ($self->{s_kwd} eq '[CDATA' and        } elsif ($self->{kwd} eq '[CDATA' and
2642                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2643                    if ($self->{is_xml} and
2644                not $self->{tainted} and
2645                @{$self->{open_elements} or []} == 0) {
2646              
2647              $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2648                              line => $self->{line_prev},
2649                              column => $self->{column_prev} - 7);
2650              $self->{tainted} = 1;
2651            } else {
2652              
2653            }
2654    
2655          $self->{ct} = {type => CHARACTER_TOKEN,          $self->{ct} = {type => CHARACTER_TOKEN,
2656                                    data => '',                                    data => '',
2657                                    line => $self->{line_prev},                                    line => $self->{line_prev},
# Line 2159  sub _get_next_token ($) { Line 2673  sub _get_next_token ($) {
2673                    
2674          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2675                          line => $self->{line_prev},                          line => $self->{line_prev},
2676                          column => $self->{column_prev} - 1 - length $self->{s_kwd});                          column => $self->{column_prev} - 1 - length $self->{kwd});
2677          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
2678          ## Reconsume.          ## Reconsume.
2679          $self->{ct} = {type => COMMENT_TOKEN,          $self->{ct} = {type => COMMENT_TOKEN,
2680                                    data => $self->{s_kwd},                                    data => $self->{kwd},
2681                                    line => $self->{line_prev},                                    line => $self->{line_prev},
2682                                    column => $self->{column_prev} - 1 - length $self->{s_kwd},                                    column => $self->{column_prev} - 1 - length $self->{kwd},
2683                                   };                                   };
2684          redo A;          redo A;
2685        }        }
# Line 2186  sub _get_next_token ($) { Line 2700  sub _get_next_token ($) {
2700        
2701          redo A;          redo A;
2702        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2703          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2704          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2705              
2706              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2707            } else {
2708              
2709              $self->{state} = DATA_STATE;
2710              $self->{s_kwd} = '';
2711            }
2712                    
2713      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2714        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2205  sub _get_next_token ($) { Line 2725  sub _get_next_token ($) {
2725    
2726          redo A;          redo A;
2727        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2728          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2729          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2730              
2731              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2732            } else {
2733              
2734              $self->{state} = DATA_STATE;
2735              $self->{s_kwd} = '';
2736            }
2737          ## reconsume          ## reconsume
2738    
2739          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2248  sub _get_next_token ($) { Line 2774  sub _get_next_token ($) {
2774        
2775          redo A;          redo A;
2776        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
2777          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2778          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2779              
2780              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2781            } else {
2782              
2783              $self->{state} = DATA_STATE;
2784              $self->{s_kwd} = '';
2785            }
2786                    
2787      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2788        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2267  sub _get_next_token ($) { Line 2799  sub _get_next_token ($) {
2799    
2800          redo A;          redo A;
2801        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2802          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2803          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2804              
2805              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2806            } else {
2807              
2808              $self->{state} = DATA_STATE;
2809              $self->{s_kwd} = '';
2810            }
2811          ## reconsume          ## reconsume
2812    
2813          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2294  sub _get_next_token ($) { Line 2832  sub _get_next_token ($) {
2832          redo A;          redo A;
2833        }        }
2834      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
2835          ## XML5: "Comment state" and "DOCTYPE comment state".
2836    
2837        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2838                    
2839          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
# Line 2310  sub _get_next_token ($) { Line 2850  sub _get_next_token ($) {
2850        
2851          redo A;          redo A;
2852        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2853          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2854          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2855              
2856              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2857            } else {
2858              
2859              $self->{state} = DATA_STATE;
2860              $self->{s_kwd} = '';
2861            }
2862          ## reconsume          ## reconsume
2863    
2864          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2340  sub _get_next_token ($) { Line 2886  sub _get_next_token ($) {
2886          redo A;          redo A;
2887        }        }
2888      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2889          ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2890    
2891        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2892                    
2893          $self->{state} = COMMENT_END_STATE;          $self->{state} = COMMENT_END_STATE;
# Line 2356  sub _get_next_token ($) { Line 2904  sub _get_next_token ($) {
2904        
2905          redo A;          redo A;
2906        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2907          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2908          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2909              
2910              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2911            } else {
2912              
2913              $self->{state} = DATA_STATE;
2914              $self->{s_kwd} = '';
2915            }
2916          ## reconsume          ## reconsume
2917    
2918          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2382  sub _get_next_token ($) { Line 2936  sub _get_next_token ($) {
2936          redo A;          redo A;
2937        }        }
2938      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
2939          ## XML5: "Comment end state" and "DOCTYPE comment end state".
2940    
2941        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2942                    if ($self->{in_subset}) {
2943          $self->{state} = DATA_STATE;            
2944              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2945            } else {
2946              
2947              $self->{state} = DATA_STATE;
2948              $self->{s_kwd} = '';
2949            }
2950                    
2951      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2952        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2402  sub _get_next_token ($) { Line 2964  sub _get_next_token ($) {
2964          redo A;          redo A;
2965        } elsif ($self->{nc} == 0x002D) { # -        } elsif ($self->{nc} == 0x002D) { # -
2966                    
2967            ## XML5: Not a parse error.
2968          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2969                          line => $self->{line_prev},                          line => $self->{line_prev},
2970                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2420  sub _get_next_token ($) { Line 2983  sub _get_next_token ($) {
2983        
2984          redo A;          redo A;
2985        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
2986          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2987          $self->{state} = DATA_STATE;          if ($self->{in_subset}) {
2988              
2989              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2990            } else {
2991              
2992              $self->{state} = DATA_STATE;
2993              $self->{s_kwd} = '';
2994            }
2995          ## reconsume          ## reconsume
2996    
2997          return  ($self->{ct}); # comment          return  ($self->{ct}); # comment
# Line 2430  sub _get_next_token ($) { Line 2999  sub _get_next_token ($) {
2999          redo A;          redo A;
3000        } else {        } else {
3001                    
3002            ## XML5: Not a parse error.
3003          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
3004                          line => $self->{line_prev},                          line => $self->{line_prev},
3005                          column => $self->{column_prev});                          column => $self->{column_prev});
# Line 2466  sub _get_next_token ($) { Line 3036  sub _get_next_token ($) {
3036          redo A;          redo A;
3037        } else {        } else {
3038                    
3039            ## XML5: Unless EOF, switch to the bogus comment state.
3040          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
3041          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;          $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
3042          ## reconsume          ## reconsume
3043          redo A;          redo A;
3044        }        }
3045      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
3046          ## XML5: "DOCTYPE root name before state".
3047    
3048        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3049                    
3050          ## Stay in the state          ## Stay in the state
# Line 2489  sub _get_next_token ($) { Line 3062  sub _get_next_token ($) {
3062          redo A;          redo A;
3063        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3064                    
3065            ## XML5: No parse error.
3066          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3067          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3068            $self->{s_kwd} = '';
3069                    
3070      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3071        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2510  sub _get_next_token ($) { Line 3085  sub _get_next_token ($) {
3085                    
3086          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3087          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3088            $self->{s_kwd} = '';
3089          ## reconsume          ## reconsume
3090    
3091          return  ($self->{ct}); # DOCTYPE (quirks)          return  ($self->{ct}); # DOCTYPE (quirks)
3092    
3093          redo A;          redo A;
3094          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3095            
3096            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
3097            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3098            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3099            $self->{in_subset} = 1;
3100            
3101        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3102          $self->{line_prev} = $self->{line};
3103          $self->{column_prev} = $self->{column};
3104          $self->{column}++;
3105          $self->{nc}
3106              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3107        } else {
3108          $self->{set_nc}->($self);
3109        }
3110      
3111            return  ($self->{ct}); # DOCTYPE
3112            redo A;
3113        } else {        } else {
3114                    
3115          $self->{ct}->{name} = chr $self->{nc};          $self->{ct}->{name} = chr $self->{nc};
# Line 2534  sub _get_next_token ($) { Line 3129  sub _get_next_token ($) {
3129          redo A;          redo A;
3130        }        }
3131      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
3132  ## ISSUE: Redundant "First," in the spec.        ## XML5: "DOCTYPE root name state".
3133    
3134          ## ISSUE: Redundant "First," in the spec.
3135    
3136        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3137                    
3138          $self->{state} = AFTER_DOCTYPE_NAME_STATE;          $self->{state} = AFTER_DOCTYPE_NAME_STATE;
# Line 2553  sub _get_next_token ($) { Line 3151  sub _get_next_token ($) {
3151        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3152                    
3153          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3154            $self->{s_kwd} = '';
3155                    
3156      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3157        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2572  sub _get_next_token ($) { Line 3171  sub _get_next_token ($) {
3171                    
3172          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3173          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
3174            $self->{s_kwd} = '';
3175          ## reconsume          ## reconsume
3176    
3177          $self->{ct}->{quirks} = 1;          $self->{ct}->{quirks} = 1;
3178          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
3179    
3180          redo A;          redo A;
3181          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
3182            
3183            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3184            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3185            $self->{in_subset} = 1;
3186            
3187        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3188          $self->{line_prev} = $self->{line};
3189          $self->{column_prev} = $self->{column};
3190          $self->{column}++;
3191          $self->{nc}
3192              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3193        } else {
3194          $self->{set_nc}->($self);
3195        }
3196      
3197            return  ($self->{ct}); # DOCTYPE
3198            redo A;
3199        } else {        } else {
3200                    
3201          $self->{ct}->{name}          $self->{ct}->{name}
# Line 2597  sub _get_next_token ($) { Line 3215  sub _get_next_token ($) {
3215          redo A;          redo A;
3216        }        }
3217      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
3218          ## XML5: Corresponding to XML5's "DOCTYPE root name after
3219          ## state", but implemented differently.
3220    
3221        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
3222                    
3223          ## Stay in the state          ## Stay in the state
# Line 2613  sub _get_next_token ($) { Line 3234  sub _get_next_token ($) {
3234        
3235          redo A;          redo A;
3236        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3237            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3238              
3239              $self->{state} = DATA_STATE;
3240              $self->{s_kwd} = '';
3241            } else {
3242              
3243              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
3244              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3245            }
3246                    
         $self->{state} = DATA_STATE;  
3247                    
3248      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3249        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2626  sub _get_next_token ($) { Line 3255  sub _get_next_token ($) {
3255        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3256      }      }
3257        
3258            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3259          redo A;          redo A;
3260        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3261            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3262              
3263              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3264              $self->{state} = DATA_STATE;
3265              $self->{s_kwd} = '';
3266              $self->{ct}->{quirks} = 1;
3267            } else {
3268              
3269              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3270              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3271            }
3272                    
3273          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          ## Reconsume.
3274          $self->{state} = DATA_STATE;          return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         ## reconsume  
   
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3275          redo A;          redo A;
3276        } elsif ($self->{nc} == 0x0050 or # P        } elsif ($self->{nc} == 0x0050 or # P
3277                 $self->{nc} == 0x0070) { # p                 $self->{nc} == 0x0070) { # p
3278            
3279          $self->{state} = PUBLIC_STATE;          $self->{state} = PUBLIC_STATE;
3280          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3281                    
3282      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3283        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2658  sub _get_next_token ($) { Line 3292  sub _get_next_token ($) {
3292          redo A;          redo A;
3293        } elsif ($self->{nc} == 0x0053 or # S        } elsif ($self->{nc} == 0x0053 or # S
3294                 $self->{nc} == 0x0073) { # s                 $self->{nc} == 0x0073) { # s
3295            
3296          $self->{state} = SYSTEM_STATE;          $self->{state} = SYSTEM_STATE;
3297          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
3298                    
3299      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3300        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2672  sub _get_next_token ($) { Line 3307  sub _get_next_token ($) {
3307      }      }
3308        
3309          redo A;          redo A;
3310        } else {        } elsif ($self->{nc} == 0x0022 and # "
3311                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3312                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3313                    
3314          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');          $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3315          $self->{ct}->{quirks} = 1;          $self->{ct}->{value} = ''; # ENTITY
3316            
3317        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3318          $self->{line_prev} = $self->{line};
3319          $self->{column_prev} = $self->{column};
3320          $self->{column}++;
3321          $self->{nc}
3322              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3323        } else {
3324          $self->{set_nc}->($self);
3325        }
3326      
3327            redo A;
3328          } elsif ($self->{nc} == 0x0027 and # '
3329                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3330                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3331            
3332            $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3333            $self->{ct}->{value} = ''; # ENTITY
3334            
3335        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3336          $self->{line_prev} = $self->{line};
3337          $self->{column_prev} = $self->{column};
3338          $self->{column}++;
3339          $self->{nc}
3340              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3341        } else {
3342          $self->{set_nc}->($self);
3343        }
3344      
3345            redo A;
3346          } elsif ($self->{is_xml} and
3347                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3348                   $self->{nc} == 0x005B) { # [
3349            
3350            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3351            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3352            $self->{in_subset} = 1;
3353            
3354        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3355          $self->{line_prev} = $self->{line};
3356          $self->{column_prev} = $self->{column};
3357          $self->{column}++;
3358          $self->{nc}
3359              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3360        } else {
3361          $self->{set_nc}->($self);
3362        }
3363      
3364            return  ($self->{ct}); # DOCTYPE
3365            redo A;
3366          } else {
3367            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type
3368    
3369            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3370              
3371              $self->{ct}->{quirks} = 1;
3372              $self->{state} = BOGUS_DOCTYPE_STATE;
3373            } else {
3374              
3375              $self->{state} = BOGUS_MD_STATE;
3376            }
3377    
         $self->{state} = BOGUS_DOCTYPE_STATE;  
3378                    
3379      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3380        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2699  sub _get_next_token ($) { Line 3396  sub _get_next_token ($) {
3396              0x0042, # B              0x0042, # B
3397              0x004C, # L              0x004C, # L
3398              0x0049, # I              0x0049, # I
3399            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3400            $self->{nc} == [            $self->{nc} == [
3401              undef,              undef,
3402              0x0075, # u              0x0075, # u
3403              0x0062, # b              0x0062, # b
3404              0x006C, # l              0x006C, # l
3405              0x0069, # i              0x0069, # i
3406            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3407                    
3408          ## Stay in the state.          ## Stay in the state.
3409          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3410                    
3411      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3412        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2722  sub _get_next_token ($) { Line 3419  sub _get_next_token ($) {
3419      }      }
3420        
3421          redo A;          redo A;
3422        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3423                 ($self->{nc} == 0x0043 or # C                 ($self->{nc} == 0x0043 or # C
3424                  $self->{nc} == 0x0063)) { # c                  $self->{nc} == 0x0063)) { # c
3425                    if ($self->{is_xml} and
3426                ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
3427              
3428              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3429                              text => 'PUBLIC',
3430                              line => $self->{line_prev},
3431                              column => $self->{column_prev} - 4);
3432            } else {
3433              
3434            }
3435          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3436                    
3437      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2740  sub _get_next_token ($) { Line 3446  sub _get_next_token ($) {
3446        
3447          redo A;          redo A;
3448        } else {        } else {
3449                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3450                          line => $self->{line_prev},                          line => $self->{line_prev},
3451                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3452          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3453              
3454          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3455              $self->{state} = BOGUS_DOCTYPE_STATE;
3456            } else {
3457              
3458              $self->{state} = BOGUS_MD_STATE;
3459            }
3460          ## Reconsume.          ## Reconsume.
3461          redo A;          redo A;
3462        }        }
# Line 2758  sub _get_next_token ($) { Line 3468  sub _get_next_token ($) {
3468              0x0053, # S              0x0053, # S
3469              0x0054, # T              0x0054, # T
3470              0x0045, # E              0x0045, # E
3471            ]->[length $self->{s_kwd}] or            ]->[length $self->{kwd}] or
3472            $self->{nc} == [            $self->{nc} == [
3473              undef,              undef,
3474              0x0079, # y              0x0079, # y
3475              0x0073, # s              0x0073, # s
3476              0x0074, # t              0x0074, # t
3477              0x0065, # e              0x0065, # e
3478            ]->[length $self->{s_kwd}]) {            ]->[length $self->{kwd}]) {
3479                    
3480          ## Stay in the state.          ## Stay in the state.
3481          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
3482                    
3483      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3484        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2781  sub _get_next_token ($) { Line 3491  sub _get_next_token ($) {
3491      }      }
3492        
3493          redo A;          redo A;
3494        } elsif ((length $self->{s_kwd}) == 5 and        } elsif ((length $self->{kwd}) == 5 and
3495                 ($self->{nc} == 0x004D or # M                 ($self->{nc} == 0x004D or # M
3496                  $self->{nc} == 0x006D)) { # m                  $self->{nc} == 0x006D)) { # m
3497                    if ($self->{is_xml} and
3498                ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
3499              
3500              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
3501                              text => 'SYSTEM',
3502                              line => $self->{line_prev},
3503                              column => $self->{column_prev} - 4);
3504            } else {
3505              
3506            }
3507          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;          $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3508                    
3509      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 2799  sub _get_next_token ($) { Line 3518  sub _get_next_token ($) {
3518        
3519          redo A;          redo A;
3520        } else {        } else {
3521                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',  
3522                          line => $self->{line_prev},                          line => $self->{line_prev},
3523                          column => $self->{column_prev} + 1 - length $self->{s_kwd});                          column => $self->{column_prev} + 1 - length $self->{kwd});
3524          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3525              
3526          $self->{state} = BOGUS_DOCTYPE_STATE;            $self->{ct}->{quirks} = 1;
3527              $self->{state} = BOGUS_DOCTYPE_STATE;
3528            } else {
3529              
3530              $self->{state} = BOGUS_MD_STATE;
3531            }
3532          ## Reconsume.          ## Reconsume.
3533          redo A;          redo A;
3534        }        }
# Line 2858  sub _get_next_token ($) { Line 3581  sub _get_next_token ($) {
3581        
3582          redo A;          redo A;
3583        } elsif ($self->{nc} eq 0x003E) { # >        } elsif ($self->{nc} eq 0x003E) { # >
           
3584          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3585            
3586          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3587              
3588              $self->{state} = DATA_STATE;
3589              $self->{s_kwd} = '';
3590              $self->{ct}->{quirks} = 1;
3591            } else {
3592              
3593              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3594            }
3595            
3596                    
3597      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3598        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2873  sub _get_next_token ($) { Line 3604  sub _get_next_token ($) {
3604        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3605      }      }
3606        
3607            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3608          redo A;          redo A;
3609        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3610            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3611              
3612              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3613              $self->{state} = DATA_STATE;
3614              $self->{s_kwd} = '';
3615              $self->{ct}->{quirks} = 1;
3616            } else {
3617              
3618              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3619              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3620            }
3621                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
3622          ## reconsume          ## reconsume
   
         $self->{ct}->{quirks} = 1;  
3623          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3624          redo A;          redo A;
3625        } else {        } elsif ($self->{is_xml} and
3626                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3627                   $self->{nc} == 0x005B) { # [
3628                    
3629            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3630            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3631            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3632            $self->{in_subset} = 1;
3633            
3634        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3635          $self->{line_prev} = $self->{line};
3636          $self->{column_prev} = $self->{column};
3637          $self->{column}++;
3638          $self->{nc}
3639              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3640        } else {
3641          $self->{set_nc}->($self);
3642        }
3643      
3644            return  ($self->{ct}); # DOCTYPE
3645            redo A;
3646          } else {
3647          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
         $self->{ct}->{quirks} = 1;  
3648    
3649          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3650              
3651              $self->{ct}->{quirks} = 1;
3652              $self->{state} = BOGUS_DOCTYPE_STATE;
3653            } else {
3654              
3655              $self->{state} = BOGUS_MD_STATE;
3656            }
3657    
3658                    
3659      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3660        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2925  sub _get_next_token ($) { Line 3685  sub _get_next_token ($) {
3685        
3686          redo A;          redo A;
3687        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3688          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3689    
3690          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3691              
3692              $self->{state} = DATA_STATE;
3693              $self->{s_kwd} = '';
3694              $self->{ct}->{quirks} = 1;
3695            } else {
3696              
3697              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3698            }
3699    
3700                    
3701      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3702        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 2940  sub _get_next_token ($) { Line 3708  sub _get_next_token ($) {
3708        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3709      }      }
3710        
3711            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3712          redo A;          redo A;
3713        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3714          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3715    
3716          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3717          ## reconsume            
3718              $self->{state} = DATA_STATE;
3719          $self->{ct}->{quirks} = 1;            $self->{s_kwd} = '';
3720              $self->{ct}->{quirks} = 1;
3721            } else {
3722              
3723              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3724            }
3725            
3726            ## Reconsume.
3727          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3728          redo A;          redo A;
3729        } else {        } else {
3730                    
3731          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3732          $self->{read_until}->($self->{ct}->{pubid}, q[">],          $self->{read_until}->($self->{ct}->{pubid}, q[">],
3733                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3734    
# Line 2994  sub _get_next_token ($) { Line 3763  sub _get_next_token ($) {
3763        
3764          redo A;          redo A;
3765        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
3766          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3767    
3768          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3769              
3770              $self->{state} = DATA_STATE;
3771              $self->{s_kwd} = '';
3772              $self->{ct}->{quirks} = 1;
3773            } else {
3774              
3775              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3776            }
3777    
3778                    
3779      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3780        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3009  sub _get_next_token ($) { Line 3786  sub _get_next_token ($) {
3786        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3787      }      }
3788        
3789            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3790          redo A;          redo A;
3791        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
3792          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3793    
3794          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3795              
3796              $self->{state} = DATA_STATE;
3797              $self->{s_kwd} = '';
3798              $self->{ct}->{quirks} = 1;
3799            } else {
3800              
3801              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3802            }
3803          
3804          ## reconsume          ## reconsume
3805            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
3806          redo A;          redo A;
3807        } else {        } else {
3808                    
3809          $self->{ct}->{pubid} # DOCTYPE          $self->{ct}->{pubid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
3810          $self->{read_until}->($self->{ct}->{pubid}, q['>],          $self->{read_until}->($self->{ct}->{pubid}, q['>],
3811                                length $self->{ct}->{pubid});                                length $self->{ct}->{pubid});
3812    
# Line 3064  sub _get_next_token ($) { Line 3842  sub _get_next_token ($) {
3842          redo A;          redo A;
3843        } elsif ($self->{nc} == 0x0022) { # "        } elsif ($self->{nc} == 0x0022) { # "
3844                    
3845          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3846          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3847                    
3848      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3080  sub _get_next_token ($) { Line 3858  sub _get_next_token ($) {
3858          redo A;          redo A;
3859        } elsif ($self->{nc} == 0x0027) { # '        } elsif ($self->{nc} == 0x0027) { # '
3860                    
3861          $self->{ct}->{sysid} = ''; # DOCTYPE          $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION
3862          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;          $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3863                    
3864      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3095  sub _get_next_token ($) { Line 3873  sub _get_next_token ($) {
3873        
3874          redo A;          redo A;
3875        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
3876            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3877              if ($self->{is_xml}) {
3878                
3879                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3880              } else {
3881                
3882              }
3883              $self->{state} = DATA_STATE;
3884              $self->{s_kwd} = '';
3885            } else {
3886              if ($self->{ct}->{type} == NOTATION_TOKEN) {
3887                
3888              } else {
3889                
3890                $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');            
3891              }
3892              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3893            }
3894                    
         $self->{state} = DATA_STATE;  
3895                    
3896      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3897        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3108  sub _get_next_token ($) { Line 3903  sub _get_next_token ($) {
3903        $self->{set_nc}->($self);        $self->{set_nc}->($self);
3904      }      }
3905        
3906            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         return  ($self->{ct}); # DOCTYPE  
   
3907          redo A;          redo A;
3908        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
3909            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3910              
3911              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3912              
3913              $self->{state} = DATA_STATE;
3914              $self->{s_kwd} = '';
3915              $self->{ct}->{quirks} = 1;
3916            } else {
3917              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
3918              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3919            }
3920                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
3921          ## reconsume          ## reconsume
3922            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
3923          $self->{ct}->{quirks} = 1;          redo A;
3924          } elsif ($self->{is_xml} and
3925                   $self->{ct}->{type} == DOCTYPE_TOKEN and
3926                   $self->{nc} == 0x005B) { # [
3927            
3928            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3929            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
3930            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
3931            $self->{in_subset} = 1;
3932            
3933        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3934          $self->{line_prev} = $self->{line};
3935          $self->{column_prev} = $self->{column};
3936          $self->{column}++;
3937          $self->{nc}
3938              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3939        } else {
3940          $self->{set_nc}->($self);
3941        }
3942      
3943          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
3944          redo A;          redo A;
3945        } else {        } else {
           
3946          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
         $self->{ct}->{quirks} = 1;  
3947    
3948          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
3949              
3950              $self->{ct}->{quirks} = 1;
3951              $self->{state} = BOGUS_DOCTYPE_STATE;
3952            } else {
3953              
3954              $self->{state} = BOGUS_MD_STATE;
3955            }
3956    
3957                    
3958      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3959        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3191  sub _get_next_token ($) { Line 4016  sub _get_next_token ($) {
4016        
4017          redo A;          redo A;
4018        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
           
4019          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
         $self->{state} = DATA_STATE;  
4020                    
4021      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4022        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3206  sub _get_next_token ($) { Line 4029  sub _get_next_token ($) {
4029      }      }
4030        
4031    
4032          $self->{ct}->{quirks} = 1;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4033          return  ($self->{ct}); # DOCTYPE            
4034              $self->{state} = DATA_STATE;
4035              $self->{s_kwd} = '';
4036              $self->{ct}->{quirks} = 1;
4037            } else {
4038              
4039              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4040            }
4041    
4042            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4043          redo A;          redo A;
4044        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4045            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4046              
4047              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4048              $self->{state} = DATA_STATE;
4049              $self->{s_kwd} = '';
4050              $self->{ct}->{quirks} = 1;
4051            } else {
4052              
4053              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4054              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4055            }
4056                    
         $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');  
   
         $self->{state} = DATA_STATE;  
4057          ## reconsume          ## reconsume
4058            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4059            redo A;
4060          } elsif ($self->{is_xml} and
4061                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4062                   $self->{nc} == 0x005B) { # [
4063            
4064            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
4065    
4066          $self->{ct}->{quirks} = 1;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4067            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4068            $self->{in_subset} = 1;
4069            
4070        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4071          $self->{line_prev} = $self->{line};
4072          $self->{column_prev} = $self->{column};
4073          $self->{column}++;
4074          $self->{nc}
4075              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4076        } else {
4077          $self->{set_nc}->($self);
4078        }
4079      
4080          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
   
4081          redo A;          redo A;
4082        } else {        } else {
           
4083          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
         $self->{ct}->{quirks} = 1;  
4084    
4085          $self->{state} = BOGUS_DOCTYPE_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4086                        
4087              $self->{ct}->{quirks} = 1;
4088              $self->{state} = BOGUS_DOCTYPE_STATE;
4089            } else {
4090              
4091              $self->{state} = BOGUS_MD_STATE;
4092            }
4093    
4094                    
4095      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4096        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3256  sub _get_next_token ($) { Line 4120  sub _get_next_token ($) {
4120      }      }
4121        
4122          redo A;          redo A;
4123        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
           
4124          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4125    
4126          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4127              
4128              $self->{state} = DATA_STATE;
4129              $self->{s_kwd} = '';
4130              $self->{ct}->{quirks} = 1;
4131            } else {
4132              
4133              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4134            }
4135            
4136                    
4137      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4138        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3272  sub _get_next_token ($) { Line 4144  sub _get_next_token ($) {
4144        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4145      }      }
4146        
4147            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4148          redo A;          redo A;
4149        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4150          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4151    
4152          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4153              
4154              $self->{state} = DATA_STATE;
4155              $self->{s_kwd} = '';
4156              $self->{ct}->{quirks} = 1;
4157            } else {
4158              
4159              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4160            }
4161            
4162          ## reconsume          ## reconsume
4163            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4164          redo A;          redo A;
4165        } else {        } else {
4166                    
4167          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4168          $self->{read_until}->($self->{ct}->{sysid}, q[">],          $self->{read_until}->($self->{ct}->{sysid}, q[">],
4169                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4170    
# Line 3325  sub _get_next_token ($) { Line 4198  sub _get_next_token ($) {
4198      }      }
4199        
4200          redo A;          redo A;
4201        } elsif ($self->{nc} == 0x003E) { # >        } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
4202                    
4203          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4204    
4205          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4206            $self->{s_kwd} = '';
4207                    
4208      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4209        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3347  sub _get_next_token ($) { Line 4221  sub _get_next_token ($) {
4221    
4222          redo A;          redo A;
4223        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
           
4224          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
4225    
4226          $self->{state} = DATA_STATE;          if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4227          ## reconsume            
4228              $self->{state} = DATA_STATE;
4229          $self->{ct}->{quirks} = 1;            $self->{s_kwd} = '';
4230          return  ($self->{ct}); # DOCTYPE            $self->{ct}->{quirks} = 1;
4231            } else {
4232              
4233              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4234            }
4235    
4236            ## reconsume
4237            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4238          redo A;          redo A;
4239        } else {        } else {
4240                    
4241          $self->{ct}->{sysid} # DOCTYPE          $self->{ct}->{sysid} .= chr $self->{nc}; # DOCTYPE/ENTITY/NOTATION
             .= chr $self->{nc};  
4242          $self->{read_until}->($self->{ct}->{sysid}, q['>],          $self->{read_until}->($self->{ct}->{sysid}, q['>],
4243                                length $self->{ct}->{sysid});                                length $self->{ct}->{sysid});
4244    
# Line 3380  sub _get_next_token ($) { Line 4258  sub _get_next_token ($) {
4258        }        }
4259      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {      } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
4260        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
4261                    if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) {
4262          ## Stay in the state            
4263              $self->{state} = BEFORE_NDATA_STATE;
4264            } else {
4265              
4266              ## Stay in the state
4267            }
4268                    
4269      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4270        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3395  sub _get_next_token ($) { Line 4278  sub _get_next_token ($) {
4278        
4279          redo A;          redo A;
4280        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
4281            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4282              
4283              $self->{state} = DATA_STATE;
4284              $self->{s_kwd} = '';
4285            } else {
4286              
4287              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4288            }
4289    
4290                    
4291          $self->{state} = DATA_STATE;      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4292          $self->{line_prev} = $self->{line};
4293          $self->{column_prev} = $self->{column};
4294          $self->{column}++;
4295          $self->{nc}
4296              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4297        } else {
4298          $self->{set_nc}->($self);
4299        }
4300      
4301            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4302            redo A;
4303          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
4304                   ($self->{nc} == 0x004E or # N
4305                    $self->{nc} == 0x006E)) { # n
4306            
4307            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type
4308            $self->{state} = NDATA_STATE;
4309            $self->{kwd} = chr $self->{nc};
4310                    
4311      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4312        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3408  sub _get_next_token ($) { Line 4318  sub _get_next_token ($) {
4318        $self->{set_nc}->($self);        $self->{set_nc}->($self);
4319      }      }
4320        
4321            redo A;
4322          } elsif ($self->{nc} == -1) {
4323            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4324              
4325              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
4326              $self->{state} = DATA_STATE;
4327              $self->{s_kwd} = '';
4328              $self->{ct}->{quirks} = 1;
4329            } else {
4330              
4331              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4332              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4333            }
4334    
4335            ## reconsume
4336            return  ($self->{ct}); # DOCTYPE/ENTITY/NOTATION
4337            redo A;
4338          } elsif ($self->{is_xml} and
4339                   $self->{ct}->{type} == DOCTYPE_TOKEN and
4340                   $self->{nc} == 0x005B) { # [
4341            
4342            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4343            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4344            $self->{in_subset} = 1;
4345            
4346        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4347          $self->{line_prev} = $self->{line};
4348          $self->{column_prev} = $self->{column};
4349          $self->{column}++;
4350          $self->{nc}
4351              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4352        } else {
4353          $self->{set_nc}->($self);
4354        }
4355      
4356          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4357            redo A;
4358          } else {
4359            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4360    
4361            if ($self->{ct}->{type} == DOCTYPE_TOKEN) {
4362              
4363              #$self->{ct}->{quirks} = 1;
4364              $self->{state} = BOGUS_DOCTYPE_STATE;
4365            } else {
4366              
4367              $self->{state} = BOGUS_MD_STATE;
4368            }
4369    
4370            
4371        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4372          $self->{line_prev} = $self->{line};
4373          $self->{column_prev} = $self->{column};
4374          $self->{column}++;
4375          $self->{nc}
4376              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4377        } else {
4378          $self->{set_nc}->($self);
4379        }
4380      
4381            redo A;
4382          }
4383        } elsif ($self->{state} == BEFORE_NDATA_STATE) {
4384          if ($is_space->{$self->{nc}}) {
4385            
4386            ## Stay in the state.
4387            
4388        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4389          $self->{line_prev} = $self->{line};
4390          $self->{column_prev} = $self->{column};
4391          $self->{column}++;
4392          $self->{nc}
4393              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4394        } else {
4395          $self->{set_nc}->($self);
4396        }
4397      
4398            redo A;
4399          } elsif ($self->{nc} == 0x003E) { # >
4400            
4401            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4402            
4403        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4404          $self->{line_prev} = $self->{line};
4405          $self->{column_prev} = $self->{column};
4406          $self->{column}++;
4407          $self->{nc}
4408              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4409        } else {
4410          $self->{set_nc}->($self);
4411        }
4412      
4413            return  ($self->{ct}); # ENTITY
4414            redo A;
4415          } elsif ($self->{nc} == 0x004E or # N
4416                   $self->{nc} == 0x006E) { # n
4417            
4418            $self->{state} = NDATA_STATE;
4419            $self->{kwd} = chr $self->{nc};
4420            
4421        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4422          $self->{line_prev} = $self->{line};
4423          $self->{column_prev} = $self->{column};
4424          $self->{column}++;
4425          $self->{nc}
4426              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4427        } else {
4428          $self->{set_nc}->($self);
4429        }
4430      
4431          redo A;          redo A;
4432        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4433                    
4434          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
4435          $self->{state} = DATA_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4436          ## reconsume          ## reconsume
4437            return  ($self->{ct}); # ENTITY
         $self->{ct}->{quirks} = 1;  
         return  ($self->{ct}); # DOCTYPE  
   
4438          redo A;          redo A;
4439        } else {        } else {
4440                    
4441          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
4442          #$self->{ct}->{quirks} = 1;          $self->{state} = BOGUS_MD_STATE;
   
         $self->{state} = BOGUS_DOCTYPE_STATE;  
4443                    
4444      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4445        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3445  sub _get_next_token ($) { Line 4457  sub _get_next_token ($) {
4457        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4458                    
4459          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4460            $self->{s_kwd} = '';
4461                    
4462      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4463        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3460  sub _get_next_token ($) { Line 4473  sub _get_next_token ($) {
4473          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
4474    
4475          redo A;          redo A;
4476          } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
4477            
4478            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
4479            $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
4480            $self->{in_subset} = 1;
4481            
4482        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4483          $self->{line_prev} = $self->{line};
4484          $self->{column_prev} = $self->{column};
4485          $self->{column}++;
4486          $self->{nc}
4487              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4488        } else {
4489          $self->{set_nc}->($self);
4490        }
4491      
4492            return  ($self->{ct}); # DOCTYPE
4493            redo A;
4494        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4495                    
4496          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4497            $self->{s_kwd} = '';
4498          ## reconsume          ## reconsume
4499    
4500          return  ($self->{ct}); # DOCTYPE          return  ($self->{ct}); # DOCTYPE
# Line 3471  sub _get_next_token ($) { Line 4503  sub _get_next_token ($) {
4503        } else {        } else {
4504                    
4505          my $s = '';          my $s = '';
4506          $self->{read_until}->($s, q[>], 0);          $self->{read_until}->($s, q{>[}, 0);
4507    
4508          ## Stay in the state          ## Stay in the state
4509                    
# Line 3491  sub _get_next_token ($) { Line 4523  sub _get_next_token ($) {
4523        ## NOTE: "CDATA section state" in the state is jointly implemented        ## NOTE: "CDATA section state" in the state is jointly implemented
4524        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,        ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
4525        ## and |CDATA_SECTION_MSE2_STATE|.        ## and |CDATA_SECTION_MSE2_STATE|.
4526    
4527          ## XML5: "CDATA state".
4528                
4529        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4530                    
# Line 3508  sub _get_next_token ($) { Line 4542  sub _get_next_token ($) {
4542        
4543          redo A;          redo A;
4544        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4545            if ($self->{is_xml}) {
4546              
4547              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
4548            } else {
4549              
4550            }
4551    
4552          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4553                    $self->{s_kwd} = '';
4554      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {          ## Reconsume.
       $self->{line_prev} = $self->{line};  
       $self->{column_prev} = $self->{column};  
       $self->{column}++;  
       $self->{nc}  
           = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);  
     } else {  
       $self->{set_nc}->($self);  
     }  
     
4555          if (length $self->{ct}->{data}) { # character          if (length $self->{ct}->{data}) { # character
4556                        
4557            return  ($self->{ct}); # character            return  ($self->{ct}); # character
# Line 3552  sub _get_next_token ($) { Line 4584  sub _get_next_token ($) {
4584    
4585        ## ISSUE: "text tokens" in spec.        ## ISSUE: "text tokens" in spec.
4586      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
4587          ## XML5: "CDATA bracket state".
4588    
4589        if ($self->{nc} == 0x005D) { # ]        if ($self->{nc} == 0x005D) { # ]
4590                    
4591          $self->{state} = CDATA_SECTION_MSE2_STATE;          $self->{state} = CDATA_SECTION_MSE2_STATE;
# Line 3569  sub _get_next_token ($) { Line 4603  sub _get_next_token ($) {
4603          redo A;          redo A;
4604        } else {        } else {
4605                    
4606            ## XML5: If EOF, "]" is not appended and changed to the data state.
4607          $self->{ct}->{data} .= ']';          $self->{ct}->{data} .= ']';
4608          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
4609          ## Reconsume.          ## Reconsume.
4610          redo A;          redo A;
4611        }        }
4612      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {      } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
4613          ## XML5: "CDATA end state".
4614    
4615        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4616          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
4617            $self->{s_kwd} = '';
4618                    
4619      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4620        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3616  sub _get_next_token ($) { Line 4654  sub _get_next_token ($) {
4654                    
4655          $self->{ct}->{data} .= ']]'; # character          $self->{ct}->{data} .= ']]'; # character
4656          $self->{state} = CDATA_SECTION_STATE;          $self->{state} = CDATA_SECTION_STATE;
4657          ## Reconsume.          ## Reconsume. ## XML5: Emit.
4658          redo A;          redo A;
4659        }        }
4660      } elsif ($self->{state} == ENTITY_STATE) {      } elsif ($self->{state} == ENTITY_STATE) {
# Line 3625  sub _get_next_token ($) { Line 4663  sub _get_next_token ($) {
4663              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &              0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
4664              $self->{entity_add} => 1,              $self->{entity_add} => 1,
4665            }->{$self->{nc}}) {            }->{$self->{nc}}) {
4666                    if ($self->{is_xml}) {
4667              
4668              $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4669                              line => $self->{line_prev},
4670                              column => $self->{column_prev}
4671                                  + ($self->{nc} == -1 ? 1 : 0));
4672            } else {
4673              
4674              ## No error
4675            }
4676          ## Don't consume          ## Don't consume
         ## No error  
4677          ## Return nothing.          ## Return nothing.
4678          #          #
4679        } elsif ($self->{nc} == 0x0023) { # #        } elsif ($self->{nc} == 0x0023) { # #
4680                    
4681          $self->{state} = ENTITY_HASH_STATE;          $self->{state} = ENTITY_HASH_STATE;
4682          $self->{s_kwd} = '#';          $self->{kwd} = '#';
4683                    
4684      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4685        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3646  sub _get_next_token ($) { Line 4692  sub _get_next_token ($) {
4692      }      }
4693        
4694          redo A;          redo A;
4695        } elsif ((0x0041 <= $self->{nc} and        } elsif ($self->{is_xml} or
4696                   (0x0041 <= $self->{nc} and
4697                  $self->{nc} <= 0x005A) or # A..Z                  $self->{nc} <= 0x005A) or # A..Z
4698                 (0x0061 <= $self->{nc} and                 (0x0061 <= $self->{nc} and
4699                  $self->{nc} <= 0x007A)) { # a..z                  $self->{nc} <= 0x007A)) { # a..z
4700                    
4701          require Whatpm::_NamedEntityList;          require Whatpm::_NamedEntityList;
4702          $self->{state} = ENTITY_NAME_STATE;          $self->{state} = ENTITY_NAME_STATE;
4703          $self->{s_kwd} = chr $self->{nc};          $self->{kwd} = chr $self->{nc};
4704          $self->{entity__value} = $self->{s_kwd};          $self->{entity__value} = $self->{kwd};
4705          $self->{entity__match} = 0;          $self->{entity__match} = 0;
4706                    
4707      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3684  sub _get_next_token ($) { Line 4731  sub _get_next_token ($) {
4731        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4732                    
4733          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4734            $self->{s_kwd} = '';
4735          ## Reconsume.          ## Reconsume.
4736          return  ({type => CHARACTER_TOKEN, data => '&',          return  ({type => CHARACTER_TOKEN, data => '&',
4737                    line => $self->{line_prev},                    line => $self->{line_prev},
# Line 3694  sub _get_next_token ($) { Line 4742  sub _get_next_token ($) {
4742                    
4743          $self->{ca}->{value} .= '&';          $self->{ca}->{value} .= '&';
4744          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4745            $self->{s_kwd} = '';
4746          ## Reconsume.          ## Reconsume.
4747          redo A;          redo A;
4748        }        }
4749      } elsif ($self->{state} == ENTITY_HASH_STATE) {      } elsif ($self->{state} == ENTITY_HASH_STATE) {
4750        if ($self->{nc} == 0x0078 or # x        if ($self->{nc} == 0x0078) { # x
4751            $self->{nc} == 0x0058) { # X          
4752            $self->{state} = HEXREF_X_STATE;
4753            $self->{kwd} .= chr $self->{nc};
4754            
4755        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4756          $self->{line_prev} = $self->{line};
4757          $self->{column_prev} = $self->{column};
4758          $self->{column}++;
4759          $self->{nc}
4760              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4761        } else {
4762          $self->{set_nc}->($self);
4763        }
4764      
4765            redo A;
4766          } elsif ($self->{nc} == 0x0058) { # X
4767                    
4768            if ($self->{is_xml}) {
4769              $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4770            }
4771          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
4772          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
4773                    
4774      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4775        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3719  sub _get_next_token ($) { Line 4786  sub _get_next_token ($) {
4786                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
4787                    
4788          $self->{state} = NCR_NUM_STATE;          $self->{state} = NCR_NUM_STATE;
4789          $self->{s_kwd} = $self->{nc} - 0x0030;          $self->{kwd} = $self->{nc} - 0x0030;
4790                    
4791      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4792        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 3744  sub _get_next_token ($) { Line 4811  sub _get_next_token ($) {
4811          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
4812                        
4813            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4814              $self->{s_kwd} = '';
4815            ## Reconsume.            ## Reconsume.
4816            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4817                      data => '&#',                      data => '&#',
# Line 3755  sub _get_next_token ($) { Line 4823  sub _get_next_token ($) {
4823                        
4824            $self->{ca}->{value} .= '&#';            $self->{ca}->{value} .= '&#';
4825            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4826              $self->{s_kwd} = '';
4827            ## Reconsume.            ## Reconsume.
4828            redo A;            redo A;
4829          }          }
# Line 3763  sub _get_next_token ($) { Line 4832  sub _get_next_token ($) {
4832        if (0x0030 <= $self->{nc} and        if (0x0030 <= $self->{nc} and
4833            $self->{nc} <= 0x0039) { # 0..9            $self->{nc} <= 0x0039) { # 0..9
4834                    
4835          $self->{s_kwd} *= 10;          $self->{kwd} *= 10;
4836          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4837                    
4838          ## Stay in the state.          ## Stay in the state.
4839                    
# Line 3800  sub _get_next_token ($) { Line 4869  sub _get_next_token ($) {
4869          #          #
4870        }        }
4871    
4872        my $code = $self->{s_kwd};        my $code = $self->{kwd};
4873        my $l = $self->{line_prev};        my $l = $self->{line_prev};
4874        my $c = $self->{column_prev};        my $c = $self->{column_prev};
4875        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
4876              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
4877              ($self->{is_xml} and $code == 0x0000)) {
4878                    
4879          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4880                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 3820  sub _get_next_token ($) { Line 4891  sub _get_next_token ($) {
4891        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
4892                    
4893          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4894            $self->{s_kwd} = '';
4895          ## Reconsume.          ## Reconsume.
4896          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
4897                      has_reference => 1,
4898                    line => $l, column => $c,                    line => $l, column => $c,
4899                   });                   });
4900          redo A;          redo A;
# Line 3830  sub _get_next_token ($) { Line 4903  sub _get_next_token ($) {
4903          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
4904          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
4905          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
4906            $self->{s_kwd} = '';
4907          ## Reconsume.          ## Reconsume.
4908          redo A;          redo A;
4909        }        }
# Line 3840  sub _get_next_token ($) { Line 4914  sub _get_next_token ($) {
4914          # 0..9, A..F, a..f          # 0..9, A..F, a..f
4915                    
4916          $self->{state} = HEXREF_HEX_STATE;          $self->{state} = HEXREF_HEX_STATE;
4917          $self->{s_kwd} = 0;          $self->{kwd} = 0;
4918          ## Reconsume.          ## Reconsume.
4919          redo A;          redo A;
4920        } else {        } else {
# Line 3855  sub _get_next_token ($) { Line 4929  sub _get_next_token ($) {
4929          if ($self->{prev_state} == DATA_STATE) {          if ($self->{prev_state} == DATA_STATE) {
4930                        
4931            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4932              $self->{s_kwd} = '';
4933            ## Reconsume.            ## Reconsume.
4934            return  ({type => CHARACTER_TOKEN,            return  ({type => CHARACTER_TOKEN,
4935                      data => '&' . $self->{s_kwd},                      data => '&' . $self->{kwd},
4936                      line => $self->{line_prev},                      line => $self->{line_prev},
4937                      column => $self->{column_prev} - length $self->{s_kwd},                      column => $self->{column_prev} - length $self->{kwd},
4938                     });                     });
4939            redo A;            redo A;
4940          } else {          } else {
4941                        
4942            $self->{ca}->{value} .= '&' . $self->{s_kwd};            $self->{ca}->{value} .= '&' . $self->{kwd};
4943            $self->{state} = $self->{prev_state};            $self->{state} = $self->{prev_state};
4944              $self->{s_kwd} = '';
4945            ## Reconsume.            ## Reconsume.
4946            redo A;            redo A;
4947          }          }
# Line 3874  sub _get_next_token ($) { Line 4950  sub _get_next_token ($) {
4950        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {        if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4951          # 0..9          # 0..9
4952                    
4953          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4954          $self->{s_kwd} += $self->{nc} - 0x0030;          $self->{kwd} += $self->{nc} - 0x0030;
4955          ## Stay in the state.          ## Stay in the state.
4956                    
4957      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3892  sub _get_next_token ($) { Line 4968  sub _get_next_token ($) {
4968        } elsif (0x0061 <= $self->{nc} and        } elsif (0x0061 <= $self->{nc} and
4969                 $self->{nc} <= 0x0066) { # a..f                 $self->{nc} <= 0x0066) { # a..f
4970                    
4971          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4972          $self->{s_kwd} += $self->{nc} - 0x0060 + 9;          $self->{kwd} += $self->{nc} - 0x0060 + 9;
4973          ## Stay in the state.          ## Stay in the state.
4974                    
4975      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3910  sub _get_next_token ($) { Line 4986  sub _get_next_token ($) {
4986        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
4987                 $self->{nc} <= 0x0046) { # A..F                 $self->{nc} <= 0x0046) { # A..F
4988                    
4989          $self->{s_kwd} *= 0x10;          $self->{kwd} *= 0x10;
4990          $self->{s_kwd} += $self->{nc} - 0x0040 + 9;          $self->{kwd} += $self->{nc} - 0x0040 + 9;
4991          ## Stay in the state.          ## Stay in the state.
4992                    
4993      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 3948  sub _get_next_token ($) { Line 5024  sub _get_next_token ($) {
5024          #          #
5025        }        }
5026    
5027        my $code = $self->{s_kwd};        my $code = $self->{kwd};
5028        my $l = $self->{line_prev};        my $l = $self->{line_prev};
5029        my $c = $self->{column_prev};        my $c = $self->{column_prev};
5030        if ($charref_map->{$code}) {        if ((not $self->{is_xml} and $charref_map->{$code}) or
5031              ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or
5032              ($self->{is_xml} and $code == 0x0000)) {
5033                    
5034          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
5035                          text => (sprintf 'U+%04X', $code),                          text => (sprintf 'U+%04X', $code),
# Line 3968  sub _get_next_token ($) { Line 5046  sub _get_next_token ($) {
5046        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
5047                    
5048          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
5049            $self->{s_kwd} = '';
5050          ## Reconsume.          ## Reconsume.
5051          return  ({type => CHARACTER_TOKEN, data => chr $code,          return  ({type => CHARACTER_TOKEN, data => chr $code,
5052                      has_reference => 1,
5053                    line => $l, column => $c,                    line => $l, column => $c,
5054                   });                   });
5055          redo A;          redo A;
# Line 3978  sub _get_next_token ($) { Line 5058  sub _get_next_token ($) {
5058          $self->{ca}->{value} .= chr $code;          $self->{ca}->{value} .= chr $code;
5059          $self->{ca}->{has_reference} = 1;          $self->{ca}->{has_reference} = 1;
5060          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
5061            $self->{s_kwd} = '';
5062          ## Reconsume.          ## Reconsume.
5063          redo A;          redo A;
5064        }        }
5065      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
5066        if (length $self->{s_kwd} < 30 and        if ((0x0041 <= $self->{nc} and # a
5067            ## NOTE: Some number greater than the maximum length of entity name             $self->{nc} <= 0x005A) or # x
5068            ((0x0041 <= $self->{nc} and # a            (0x0061 <= $self->{nc} and # a
5069              $self->{nc} <= 0x005A) or # x             $self->{nc} <= 0x007A) or # z
5070             (0x0061 <= $self->{nc} and # a            (0x0030 <= $self->{nc} and # 0
5071              $self->{nc} <= 0x007A) or # z             $self->{nc} <= 0x0039) or # 9
5072             (0x0030 <= $self->{nc} and # 0            $self->{nc} == 0x003B or # ;
5073              $self->{nc} <= 0x0039) or # 9            ($self->{is_xml} and
5074             $self->{nc} == 0x003B)) { # ;             not ($is_space->{$self->{nc}} or
5075                    {
5076                      0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
5077                      $self->{entity_add} => 1,
5078                    }->{$self->{nc}}))) {
5079          our $EntityChar;          our $EntityChar;
5080          $self->{s_kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
5081          if (defined $EntityChar->{$self->{s_kwd}}) {          if (defined $EntityChar->{$self->{kwd}} or
5082                $self->{ge}->{$self->{kwd}}) {
5083            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
5084                            if (defined $self->{ge}->{$self->{kwd}}) {
5085              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};                if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5086                    
5087                    $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5088                  } else {
5089                    if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5090                      
5091                      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5092                                      value => $self->{kwd});
5093                    } else {
5094                      
5095                    }
5096                    $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5097                  }
5098                } else {
5099                  if ($self->{is_xml}) {
5100                    
5101                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5102                                    value => $self->{kwd},
5103                                    level => {
5104                                              'amp;' => $self->{level}->{warn},
5105                                              'quot;' => $self->{level}->{warn},
5106                                              'lt;' => $self->{level}->{warn},
5107                                              'gt;' => $self->{level}->{warn},
5108                                              'apos;' => $self->{level}->{warn},
5109                                             }->{$self->{kwd}} ||
5110                                             $self->{level}->{must});
5111                  } else {
5112                    
5113                  }
5114                  $self->{entity__value} = $EntityChar->{$self->{kwd}};
5115                }
5116              $self->{entity__match} = 1;              $self->{entity__match} = 1;
5117                            
5118      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 4012  sub _get_next_token ($) { Line 5128  sub _get_next_token ($) {
5128              #              #
5129            } else {            } else {
5130                            
5131              $self->{entity__value} = $EntityChar->{$self->{s_kwd}};              $self->{entity__value} = $EntityChar->{$self->{kwd}};
5132              $self->{entity__match} = -1;              $self->{entity__match} = -1;
5133              ## Stay in the state.              ## Stay in the state.
5134                            
# Line 4060  sub _get_next_token ($) { Line 5176  sub _get_next_token ($) {
5176          if ($self->{prev_state} != DATA_STATE and # in attribute          if ($self->{prev_state} != DATA_STATE and # in attribute
5177              $self->{entity__match} < -1) {              $self->{entity__match} < -1) {
5178                        
5179            $data = '&' . $self->{s_kwd};            $data = '&' . $self->{kwd};
5180            #            #
5181          } else {          } else {
5182                        
# Line 4072  sub _get_next_token ($) { Line 5188  sub _get_next_token ($) {
5188                    
5189          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
5190                          line => $self->{line_prev},                          line => $self->{line_prev},
5191                          column => $self->{column_prev} - length $self->{s_kwd});                          column => $self->{column_prev} - length $self->{kwd});
5192          $data = '&' . $self->{s_kwd};          $data = '&' . $self->{kwd};
5193          #          #
5194        }        }
5195        
# Line 4090  sub _get_next_token ($) { Line 5206  sub _get_next_token ($) {
5206        if ($self->{prev_state} == DATA_STATE) {        if ($self->{prev_state} == DATA_STATE) {
5207                    
5208          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
5209            $self->{s_kwd} = '';
5210          ## Reconsume.          ## Reconsume.
5211          return  ({type => CHARACTER_TOKEN,          return  ({type => CHARACTER_TOKEN,
5212                    data => $data,                    data => $data,
5213                      has_reference => $has_ref,
5214                    line => $self->{line_prev},                    line => $self->{line_prev},
5215                    column => $self->{column_prev} + 1 - length $self->{s_kwd},                    column => $self->{column_prev} + 1 - length $self->{kwd},
5216                   });                   });
5217          redo A;          redo A;
5218        } else {        } else {
# Line 4102  sub _get_next_token ($) { Line 5220  sub _get_next_token ($) {
5220          $self->{ca}->{value} .= $data;          $self->{ca}->{value} .= $data;
5221          $self->{ca}->{has_reference} = 1 if $has_ref;          $self->{ca}->{has_reference} = 1 if $has_ref;
5222          $self->{state} = $self->{prev_state};          $self->{state} = $self->{prev_state};
5223            $self->{s_kwd} = '';
5224            ## Reconsume.
5225            redo A;
5226          }
5227    
5228        ## XML-only states
5229    
5230        } elsif ($self->{state} == PI_STATE) {
5231          ## XML5: "Pi state" and "DOCTYPE pi state".
5232    
5233          if ($is_space->{$self->{nc}} or
5234              $self->{nc} == 0x003F or # ?
5235              $self->{nc} == -1) {
5236            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
5237            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
5238            ## "DOCTYPE pi state": Parse error, switch to the "data
5239            ## state".
5240            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
5241                            line => $self->{line_prev},
5242                            column => $self->{column_prev}
5243                                - 1 * ($self->{nc} != -1));
5244            $self->{state} = BOGUS_COMMENT_STATE;
5245          ## Reconsume.          ## Reconsume.
5246            $self->{ct} = {type => COMMENT_TOKEN,
5247                           data => '?',
5248                           line => $self->{line_prev},
5249                           column => $self->{column_prev}
5250                               - 1 * ($self->{nc} != -1),
5251                          };
5252            redo A;
5253          } else {
5254            ## XML5: "DOCTYPE pi state": Stay in the state.
5255            $self->{ct} = {type => PI_TOKEN,
5256                           target => chr $self->{nc},
5257                           data => '',
5258                           line => $self->{line_prev},
5259                           column => $self->{column_prev} - 1,
5260                          };
5261            $self->{state} = PI_TARGET_STATE;
5262            
5263        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5264          $self->{line_prev} = $self->{line};
5265          $self->{column_prev} = $self->{column};
5266          $self->{column}++;
5267          $self->{nc}
5268              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5269        } else {
5270          $self->{set_nc}->($self);
5271        }
5272      
5273            redo A;
5274          }
5275        } elsif ($self->{state} == PI_TARGET_STATE) {
5276          if ($is_space->{$self->{nc}}) {
5277            $self->{state} = PI_TARGET_AFTER_STATE;
5278            
5279        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5280          $self->{line_prev} = $self->{line};
5281          $self->{column_prev} = $self->{column};
5282          $self->{column}++;
5283          $self->{nc}
5284              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5285        } else {
5286          $self->{set_nc}->($self);
5287        }
5288      
5289            redo A;
5290          } elsif ($self->{nc} == -1) {
5291            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5292            if ($self->{in_subset}) {
5293              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5294            } else {
5295              $self->{state} = DATA_STATE;
5296              $self->{s_kwd} = '';
5297            }
5298            ## Reconsume.
5299            return  ($self->{ct}); # pi
5300            redo A;
5301          } elsif ($self->{nc} == 0x003F) { # ?
5302            $self->{state} = PI_AFTER_STATE;
5303            
5304        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5305          $self->{line_prev} = $self->{line};
5306          $self->{column_prev} = $self->{column};
5307          $self->{column}++;
5308          $self->{nc}
5309              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5310        } else {
5311          $self->{set_nc}->($self);
5312        }
5313      
5314            redo A;
5315          } else {
5316            ## XML5: typo ("tag name" -> "target")
5317            $self->{ct}->{target} .= chr $self->{nc}; # pi
5318            
5319        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5320          $self->{line_prev} = $self->{line};
5321          $self->{column_prev} = $self->{column};
5322          $self->{column}++;
5323          $self->{nc}
5324              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5325        } else {
5326          $self->{set_nc}->($self);
5327        }
5328      
5329            redo A;
5330          }
5331        } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
5332          if ($is_space->{$self->{nc}}) {
5333            ## Stay in the state.
5334            
5335        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5336          $self->{line_prev} = $self->{line};
5337          $self->{column_prev} = $self->{column};
5338          $self->{column}++;
5339          $self->{nc}
5340              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5341        } else {
5342          $self->{set_nc}->($self);
5343        }
5344      
5345            redo A;
5346          } else {
5347            $self->{state} = PI_DATA_STATE;
5348            ## Reprocess.
5349            redo A;
5350          }
5351        } elsif ($self->{state} == PI_DATA_STATE) {
5352          if ($self->{nc} == 0x003F) { # ?
5353            $self->{state} = PI_DATA_AFTER_STATE;
5354            
5355        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5356          $self->{line_prev} = $self->{line};
5357          $self->{column_prev} = $self->{column};
5358          $self->{column}++;
5359          $self->{nc}
5360              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5361        } else {
5362          $self->{set_nc}->($self);
5363        }
5364      
5365            redo A;
5366          } elsif ($self->{nc} == -1) {
5367            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
5368            if ($self->{in_subset}) {
5369              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
5370            } else {
5371              $self->{state} = DATA_STATE;
5372              $self->{s_kwd} = '';
5373            }
5374            ## Reprocess.
5375            return  ($self->{ct}); # pi
5376            redo A;
5377          } else {
5378            $self->{ct}->{data} .= chr $self->{nc}; # pi
5379            $self->{read_until}->($self->{ct}->{data}, q[?],
5380                                  length $self->{ct}->{data});
5381            ## Stay in the state.
5382            
5383        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5384          $self->{line_prev} = $self->{line};
5385          $self->{column_prev} = $self->{column};
5386          $self->{column}++;
5387          $self->{nc}
5388              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5389        } else {
5390          $self->{set_nc}->($self);
5391        }
5392      
5393            ## Reprocess.
5394            redo A;
5395          }
5396        } elsif ($self->{state} == PI_AFTER_STATE) {
5397          ## XML5: Part of "Pi after state".
5398    
5399          if ($self->{nc} == 0x003E) { # >
5400            if ($self->{in_subset}) {
5401              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5402            } else {
5403              $self->{state} = DATA_STATE;
5404              $self->{s_kwd} = '';
5405            }
5406            
5407        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5408          $self->{line_prev} = $self->{line};
5409          $self->{column_prev} = $self->{column};
5410          $self->{column}++;
5411          $self->{nc}
5412              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5413        } else {
5414          $self->{set_nc}->($self);
5415        }
5416      
5417            return  ($self->{ct}); # pi
5418            redo A;
5419          } elsif ($self->{nc} == 0x003F) { # ?
5420            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5421                            line => $self->{line_prev},
5422                            column => $self->{column_prev}); ## XML5: no error
5423            $self->{ct}->{data} .= '?';
5424            $self->{state} = PI_DATA_AFTER_STATE;
5425            
5426        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5427          $self->{line_prev} = $self->{line};
5428          $self->{column_prev} = $self->{column};
5429          $self->{column}++;
5430          $self->{nc}
5431              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5432        } else {
5433          $self->{set_nc}->($self);
5434        }
5435      
5436            redo A;
5437          } else {
5438            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
5439                            line => $self->{line_prev},
5440                            column => $self->{column_prev}
5441                                + 1 * ($self->{nc} == -1)); ## XML5: no error
5442            $self->{ct}->{data} .= '?'; ## XML5: not appended
5443            $self->{state} = PI_DATA_STATE;
5444            ## Reprocess.
5445            redo A;
5446          }
5447        } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
5448          ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
5449    
5450          if ($self->{nc} == 0x003E) { # >
5451            if ($self->{in_subset}) {
5452              $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5453            } else {
5454              $self->{state} = DATA_STATE;
5455              $self->{s_kwd} = '';
5456            }
5457            
5458        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5459          $self->{line_prev} = $self->{line};
5460          $self->{column_prev} = $self->{column};
5461          $self->{column}++;
5462          $self->{nc}
5463              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5464        } else {
5465          $self->{set_nc}->($self);
5466        }
5467      
5468            return  ($self->{ct}); # pi
5469            redo A;
5470          } elsif ($self->{nc} == 0x003F) { # ?
5471            $self->{ct}->{data} .= '?';
5472            ## Stay in the state.
5473            
5474        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5475          $self->{line_prev} = $self->{line};
5476          $self->{column_prev} = $self->{column};
5477          $self->{column}++;
5478          $self->{nc}
5479              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5480        } else {
5481          $self->{set_nc}->($self);
5482        }
5483      
5484            redo A;
5485          } else {
5486            $self->{ct}->{data} .= '?'; ## XML5: not appended
5487            $self->{state} = PI_DATA_STATE;
5488            ## Reprocess.
5489            redo A;
5490          }
5491    
5492        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
5493          if ($self->{nc} == 0x003C) { # <
5494            $self->{state} = DOCTYPE_TAG_STATE;
5495            
5496        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5497          $self->{line_prev} = $self->{line};
5498          $self->{column_prev} = $self->{column};
5499          $self->{column}++;
5500          $self->{nc}
5501              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5502        } else {
5503          $self->{set_nc}->($self);
5504        }
5505      
5506            redo A;
5507          } elsif ($self->{nc} == 0x0025) { # %
5508            ## XML5: Not defined yet.
5509    
5510            ## TODO:
5511    
5512            if (not $self->{stop_processing} and
5513                not $self->{document}->xml_standalone) {
5514              $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type
5515                              level => $self->{level}->{info});
5516              $self->{stop_processing} = 1;
5517            }
5518    
5519            
5520        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5521          $self->{line_prev} = $self->{line};
5522          $self->{column_prev} = $self->{column};
5523          $self->{column}++;
5524          $self->{nc}
5525              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5526        } else {
5527          $self->{set_nc}->($self);
5528        }
5529      
5530            redo A;
5531          } elsif ($self->{nc} == 0x005D) { # ]
5532            delete $self->{in_subset};
5533            $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
5534            
5535        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5536          $self->{line_prev} = $self->{line};
5537          $self->{column_prev} = $self->{column};
5538          $self->{column}++;
5539          $self->{nc}
5540              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5541        } else {
5542          $self->{set_nc}->($self);
5543        }
5544      
5545            redo A;
5546          } elsif ($is_space->{$self->{nc}}) {
5547            ## Stay in the state.
5548            
5549        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5550          $self->{line_prev} = $self->{line};
5551          $self->{column_prev} = $self->{column};
5552          $self->{column}++;
5553          $self->{nc}
5554              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5555        } else {
5556          $self->{set_nc}->($self);
5557        }
5558      
5559            redo A;
5560          } elsif ($self->{nc} == -1) {
5561            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type
5562            delete $self->{in_subset};
5563            $self->{state} = DATA_STATE;
5564            $self->{s_kwd} = '';
5565            ## Reconsume.
5566            return  ({type => END_OF_DOCTYPE_TOKEN});
5567            redo A;
5568          } else {
5569            unless ($self->{internal_subset_tainted}) {
5570              ## XML5: No parse error.
5571              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset');
5572              $self->{internal_subset_tainted} = 1;
5573            }
5574            ## Stay in the state.
5575            
5576        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5577          $self->{line_prev} = $self->{line};
5578          $self->{column_prev} = $self->{column};
5579          $self->{column}++;
5580          $self->{nc}
5581              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5582        } else {
5583          $self->{set_nc}->($self);
5584        }
5585      
5586            redo A;
5587          }
5588        } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) { ## Entered after the "]" that closes the internal subset.
5589          if ($self->{nc} == 0x003E) { # >
5590            $self->{state} = DATA_STATE;
5591            $self->{s_kwd} = '';
5592            ## Consume the ">": read the next character from char_buffer, or call set_nc once the buffer is used up.
5593        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5594          $self->{line_prev} = $self->{line};
5595          $self->{column_prev} = $self->{column};
5596          $self->{column}++;
5597          $self->{nc}
5598              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5599        } else {
5600          $self->{set_nc}->($self);
5601        }
5602      
5603            return  ({type => END_OF_DOCTYPE_TOKEN}); ## The DOCTYPE is properly closed.
5604            redo A; ## Not reached: follows the return above.
5605          } elsif ($self->{nc} == -1) { ## -1 is presumably the end-of-input marker (TODO confirm in set_nc).
5606            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
5607            $self->{state} = DATA_STATE;
5608            $self->{s_kwd} = '';
5609            ## Reconsume: nc is not advanced, so the data state sees the same -1.
5610            return  ({type => END_OF_DOCTYPE_TOKEN});
5611            redo A; ## Not reached: follows the return above.
5612          } else {
5613            ## XML5: No parse error and stay in the state.
5614            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type
5615    
5616            $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE; ## That state skips input up to the next ">" or -1.
5617            ## Consume the offending character.
5618        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5619          $self->{line_prev} = $self->{line};
5620          $self->{column_prev} = $self->{column};
5621          $self->{column}++;
5622          $self->{nc}
5623              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5624        } else {
5625          $self->{set_nc}->($self);
5626        }
5627      
5628            redo A;
5629          }
5630        } elsif ($self->{state} == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) { ## Skips input up to ">" or -1; this state reports no parse error itself.
5631          if ($self->{nc} == 0x003E) { # >
5632            $self->{state} = DATA_STATE;
5633            $self->{s_kwd} = '';
5634            ## Consume the ">": read the next character from char_buffer, or call set_nc once the buffer is used up.
5635        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5636          $self->{line_prev} = $self->{line};
5637          $self->{column_prev} = $self->{column};
5638          $self->{column}++;
5639          $self->{nc}
5640              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5641        } else {
5642          $self->{set_nc}->($self);
5643        }
5644      
5645            return  ({type => END_OF_DOCTYPE_TOKEN});
5646            redo A; ## Not reached: follows the return above.
5647          } elsif ($self->{nc} == -1) { ## -1 is presumably the end-of-input marker (TODO confirm in set_nc).
5648            $self->{state} = DATA_STATE;
5649            $self->{s_kwd} = '';
5650            ## Reconsume: nc is not advanced, so the data state sees the same -1.
5651            return  ({type => END_OF_DOCTYPE_TOKEN});
5652            redo A; ## Not reached: follows the return above.
5653          } else {
5654            ## Stay in the state and drop the character.
5655            
5656        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5657          $self->{line_prev} = $self->{line};
5658          $self->{column_prev} = $self->{column};
5659          $self->{column}++;
5660          $self->{nc}
5661              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5662        } else {
5663          $self->{set_nc}->($self);
5664        }
5665      
5666            redo A;
5667          }
5668        } elsif ($self->{state} == DOCTYPE_TAG_STATE) { ## Presumably entered after "<" inside the internal subset (transition is outside this excerpt).
5669          if ($self->{nc} == 0x0021) { # !
5670            $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5671            ## Consume the "!".
5672        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5673          $self->{line_prev} = $self->{line};
5674          $self->{column_prev} = $self->{column};
5675          $self->{column}++;
5676          $self->{nc}
5677              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5678        } else {
5679          $self->{set_nc}->($self);
5680        }
5681      
5682            redo A;
5683          } elsif ($self->{nc} == 0x003F) { # ?
5684            $self->{state} = PI_STATE;
5685            ## Consume the "?".
5686        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5687          $self->{line_prev} = $self->{line};
5688          $self->{column_prev} = $self->{column};
5689          $self->{column}++;
5690          $self->{nc}
5691              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5692        } else {
5693          $self->{set_nc}->($self);
5694        }
5695      
5696            redo A;
5697          } elsif ($self->{nc} == -1) { ## -1 is presumably the end-of-input marker (TODO confirm in set_nc).
5698            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago');
5699            $self->{state} = DATA_STATE;
5700            $self->{s_kwd} = '';
5701            ## Reconsume: nc is not advanced; note that no token is returned on this path.
5702            redo A;
5703          } else {
5704            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error.
5705                            line => $self->{line_prev},
5706                            column => $self->{column_prev});
5707            $self->{state} = BOGUS_COMMENT_STATE;
5708            $self->{ct} = {type => COMMENT_TOKEN,
5709                           data => '',
5710                          }; ## NOTE: Will be discarded.
5711            ## Consume the unexpected character; it is not added to the comment data.
5712        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5713          $self->{line_prev} = $self->{line};
5714          $self->{column_prev} = $self->{column};
5715          $self->{column}++;
5716          $self->{nc}
5717              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5718        } else {
5719          $self->{set_nc}->($self);
5720        }
5721      
5722            redo A;
5723          }
5724        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) { ## Picks the declaration kind from the first character after "!".
5725          ## XML5: "DOCTYPE markup declaration state".
5726          
5727          if ($self->{nc} == 0x002D) { # -
5728            $self->{state} = MD_HYPHEN_STATE;
5729            ## Consume the "-".
5730        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5731          $self->{line_prev} = $self->{line};
5732          $self->{column_prev} = $self->{column};
5733          $self->{column}++;
5734          $self->{nc}
5735              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5736        } else {
5737          $self->{set_nc}->($self);
5738        }
5739      
5740            redo A;
5741          } elsif ($self->{nc} == 0x0045 or # E
5742                   $self->{nc} == 0x0065) { # e
5743            $self->{state} = MD_E_STATE; ## Could become ENTITY or ELEMENT.
5744            $self->{kwd} = chr $self->{nc}; ## Start collecting the keyword with its original case.
5745            
5746        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5747          $self->{line_prev} = $self->{line};
5748          $self->{column_prev} = $self->{column};
5749          $self->{column}++;
5750          $self->{nc}
5751              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5752        } else {
5753          $self->{set_nc}->($self);
5754        }
5755      
5756            redo A;
5757          } elsif ($self->{nc} == 0x0041 or # A
5758                   $self->{nc} == 0x0061) { # a
5759            $self->{state} = MD_ATTLIST_STATE;
5760            $self->{kwd} = chr $self->{nc}; ## Start collecting the keyword with its original case.
5761            
5762        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5763          $self->{line_prev} = $self->{line};
5764          $self->{column_prev} = $self->{column};
5765          $self->{column}++;
5766          $self->{nc}
5767              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5768        } else {
5769          $self->{set_nc}->($self);
5770        }
5771      
5772            redo A;
5773          } elsif ($self->{nc} == 0x004E or # N
5774                   $self->{nc} == 0x006E) { # n
5775            $self->{state} = MD_NOTATION_STATE;
5776            $self->{kwd} = chr $self->{nc}; ## Start collecting the keyword with its original case.
5777            
5778        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5779          $self->{line_prev} = $self->{line};
5780          $self->{column_prev} = $self->{column};
5781          $self->{column}++;
5782          $self->{nc}
5783              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5784        } else {
5785          $self->{set_nc}->($self);
5786        }
5787      
5788            redo A;
5789          } else {
5790            # Anything else: fall through to the bogus comment handling below.
5791          }
5792          
5793          ## XML5: No parse error.
5794          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5795                          line => $self->{line_prev},
5796                          column => $self->{column_prev} - 1); ## Presumably the column of the "<" (TODO confirm).
5797          ## Reconsume: nc is not advanced, so the bogus comment state sees the same character.
5798          $self->{state} = BOGUS_COMMENT_STATE;
5799          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5800          redo A;
5801        } elsif ($self->{state} == MD_E_STATE) { ## kwd holds "E" or "e"; the second letter selects ENTITY or ELEMENT.
5802          if ($self->{nc} == 0x004E or # N
5803              $self->{nc} == 0x006E) { # n
5804            $self->{state} = MD_ENTITY_STATE;
5805            $self->{kwd} .= chr $self->{nc}; ## kwd now has two characters.
5806            
5807        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5808          $self->{line_prev} = $self->{line};
5809          $self->{column_prev} = $self->{column};
5810          $self->{column}++;
5811          $self->{nc}
5812              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5813        } else {
5814          $self->{set_nc}->($self);
5815        }
5816      
5817            redo A;
5818          } elsif ($self->{nc} == 0x004C or # L
5819                   $self->{nc} == 0x006C) { # l
5820            ## XML5: <!ELEMENT> not supported.
5821            $self->{state} = MD_ELEMENT_STATE;
5822            $self->{kwd} .= chr $self->{nc}; ## kwd now has two characters.
5823            
5824        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5825          $self->{line_prev} = $self->{line};
5826          $self->{column_prev} = $self->{column};
5827          $self->{column}++;
5828          $self->{nc}
5829              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5830        } else {
5831          $self->{set_nc}->($self);
5832        }
5833      
5834            redo A;
5835          } else {
5836            ## XML5: No parse error.
5837            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5838                            line => $self->{line_prev},
5839                            column => $self->{column_prev} - 2
5840                                + 1 * ($self->{nc} == -1)); ## One column further right when nc is -1; presumably the column is not advanced at end of input (TODO confirm).
5841            ## Reconsume: nc is not advanced, so the bogus comment state sees the same character.
5842            $self->{state} = BOGUS_COMMENT_STATE;
5843            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5844            redo A;
5845          }
5846        } elsif ($self->{state} == MD_ENTITY_STATE) { ## Matches the rest of "ENTITY" case-insensitively; kwd already holds two characters on entry.
5847          if ($self->{nc} == [ ## Expected uppercase letter, indexed by the number of characters already in kwd.
5848                undef,
5849                undef,
5850                0x0054, # T
5851                0x0049, # I
5852                0x0054, # T
5853              ]->[length $self->{kwd}] or ## NOTE(review): with 5 characters in kwd this index is past the table, so nc is compared with undef (numerically 0) -- confirm nc can never be 0 here.
5854              $self->{nc} == [ ## Same table for the lowercase letters.
5855                undef,
5856                undef,
5857                0x0074, # t
5858                0x0069, # i
5859                0x0074, # t
5860              ]->[length $self->{kwd}]) {
5861            ## Stay in the state.
5862            $self->{kwd} .= chr $self->{nc};
5863            
5864        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5865          $self->{line_prev} = $self->{line};
5866          $self->{column_prev} = $self->{column};
5867          $self->{column}++;
5868          $self->{nc}
5869              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5870        } else {
5871          $self->{set_nc}->($self);
5872        }
5873      
5874            redo A;
5875          } elsif ((length $self->{kwd}) == 5 and
5876                   ($self->{nc} == 0x0059 or # Y
5877                    $self->{nc} == 0x0079)) { # y
5878            if ($self->{kwd} ne 'ENTIT' or $self->{nc} == 0x0079) { ## Any letter of the keyword was not uppercase.
5879              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5880                              text => 'ENTITY',
5881                              line => $self->{line_prev},
5882                              column => $self->{column_prev} - 4); ## Presumably the column of the keyword's first letter (TODO confirm).
5883            }
5884            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', ## Declaration token under construction; the name is filled in by later states.
5885                           line => $self->{line_prev},
5886                           column => $self->{column_prev} - 6}; ## Presumably the column of the "<" (TODO confirm).
5887            $self->{state} = DOCTYPE_MD_STATE;
5888            ## Consume the final "Y"/"y".
5889        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5890          $self->{line_prev} = $self->{line};
5891          $self->{column_prev} = $self->{column};
5892          $self->{column}++;
5893          $self->{nc}
5894              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5895        } else {
5896          $self->{set_nc}->($self);
5897        }
5898      
5899            redo A;
5900          } else {
5901            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5902                            line => $self->{line_prev},
5903                            column => $self->{column_prev} - 1
5904                                - (length $self->{kwd})
5905                                + 1 * ($self->{nc} == -1)); ## Steps back over the collected keyword; one column further right when nc is -1.
5906            $self->{state} = BOGUS_COMMENT_STATE;
5907            ## Reconsume: nc is not advanced, so the bogus comment state sees the same character.
5908            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5909            redo A;
5910          }
5911        } elsif ($self->{state} == MD_ELEMENT_STATE) { ## Matches the rest of "ELEMENT" case-insensitively; kwd already holds two characters on entry.
5912          if ($self->{nc} == [ ## Expected uppercase letter, indexed by the number of characters already in kwd.
5913               undef,
5914               undef,
5915               0x0045, # E
5916               0x004D, # M
5917               0x0045, # E
5918               0x004E, # N
5919              ]->[length $self->{kwd}] or ## NOTE(review): with 6 characters in kwd this index is past the table, so nc is compared with undef (numerically 0) -- confirm nc can never be 0 here.
5920              $self->{nc} == [ ## Same table for the lowercase letters.
5921               undef,
5922               undef,
5923               0x0065, # e
5924               0x006D, # m
5925               0x0065, # e
5926               0x006E, # n
5927              ]->[length $self->{kwd}]) {
5928            ## Stay in the state.
5929            $self->{kwd} .= chr $self->{nc};
5930            
5931        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5932          $self->{line_prev} = $self->{line};
5933          $self->{column_prev} = $self->{column};
5934          $self->{column}++;
5935          $self->{nc}
5936              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5937        } else {
5938          $self->{set_nc}->($self);
5939        }
5940      
5941            redo A;
5942          } elsif ((length $self->{kwd}) == 6 and
5943                   ($self->{nc} == 0x0054 or # T
5944                    $self->{nc} == 0x0074)) { # t
5945            if ($self->{kwd} ne 'ELEMEN' or $self->{nc} == 0x0074) { ## Any letter of the keyword was not uppercase.
5946              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
5947                              text => 'ELEMENT',
5948                              line => $self->{line_prev},
5949                              column => $self->{column_prev} - 5); ## Presumably the column of the keyword's first letter (TODO confirm).
5950            }
5951            $self->{ct} = {type => ELEMENT_TOKEN, name => '', ## Declaration token under construction; the name is filled in by later states.
5952                           line => $self->{line_prev},
5953                           column => $self->{column_prev} - 7}; ## Presumably the column of the "<" (TODO confirm).
5954            $self->{state} = DOCTYPE_MD_STATE;
5955            ## Consume the final "T"/"t".
5956        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5957          $self->{line_prev} = $self->{line};
5958          $self->{column_prev} = $self->{column};
5959          $self->{column}++;
5960          $self->{nc}
5961              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5962        } else {
5963          $self->{set_nc}->($self);
5964        }
5965      
5966            redo A;
5967          } else {
5968            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5969                            line => $self->{line_prev},
5970                            column => $self->{column_prev} - 1
5971                                - (length $self->{kwd})
5972                                + 1 * ($self->{nc} == -1)); ## Steps back over the collected keyword; one column further right when nc is -1.
5973            $self->{state} = BOGUS_COMMENT_STATE;
5974            ## Reconsume: nc is not advanced, so the bogus comment state sees the same character.
5975            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5976            redo A;
5977          }
5978        } elsif ($self->{state} == MD_ATTLIST_STATE) { ## Matches the rest of "ATTLIST" case-insensitively; kwd already holds one character on entry.
5979          if ($self->{nc} == [ ## Expected uppercase letter, indexed by the number of characters already in kwd.
5980               undef,
5981               0x0054, # T
5982               0x0054, # T
5983               0x004C, # L
5984               0x0049, # I
5985               0x0053, # S
5986              ]->[length $self->{kwd}] or ## NOTE(review): with 6 characters in kwd this index is past the table, so nc is compared with undef (numerically 0) -- confirm nc can never be 0 here.
5987              $self->{nc} == [ ## Same table for the lowercase letters.
5988               undef,
5989               0x0074, # t
5990               0x0074, # t
5991               0x006C, # l
5992               0x0069, # i
5993               0x0073, # s
5994              ]->[length $self->{kwd}]) {
5995            ## Stay in the state.
5996            $self->{kwd} .= chr $self->{nc};
5997            
5998        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5999          $self->{line_prev} = $self->{line};
6000          $self->{column_prev} = $self->{column};
6001          $self->{column}++;
6002          $self->{nc}
6003              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6004        } else {
6005          $self->{set_nc}->($self);
6006        }
6007      
6008            redo A;
6009          } elsif ((length $self->{kwd}) == 6 and
6010                   ($self->{nc} == 0x0054 or # T
6011                    $self->{nc} == 0x0074)) { # t
6012            if ($self->{kwd} ne 'ATTLIS' or $self->{nc} == 0x0074) { ## Any letter of the keyword was not uppercase.
6013              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6014                              text => 'ATTLIST',
6015                              line => $self->{line_prev},
6016                              column => $self->{column_prev} - 5); ## Presumably the column of the keyword's first letter (TODO confirm).
6017            }
6018            $self->{ct} = {type => ATTLIST_TOKEN, name => '', ## Declaration token under construction; the name is filled in by later states.
6019                           attrdefs => [], ## Starts with no attribute definitions.
6020                           line => $self->{line_prev},
6021                           column => $self->{column_prev} - 7}; ## Presumably the column of the "<" (TODO confirm).
6022            $self->{state} = DOCTYPE_MD_STATE;
6023            ## Consume the final "T"/"t".
6024        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6025          $self->{line_prev} = $self->{line};
6026          $self->{column_prev} = $self->{column};
6027          $self->{column}++;
6028          $self->{nc}
6029              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6030        } else {
6031          $self->{set_nc}->($self);
6032        }
6033      
6034            redo A;
6035          } else {
6036            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6037                            line => $self->{line_prev},
6038                            column => $self->{column_prev} - 1
6039                                 - (length $self->{kwd})
6040                                 + 1 * ($self->{nc} == -1)); ## Steps back over the collected keyword; one column further right when nc is -1.
6041            $self->{state} = BOGUS_COMMENT_STATE;
6042            ## Reconsume: nc is not advanced, so the bogus comment state sees the same character.
6043            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6044            redo A;
6045          }
6046        } elsif ($self->{state} == MD_NOTATION_STATE) { ## Matches the rest of "NOTATION" case-insensitively; kwd already holds one character on entry.
6047          if ($self->{nc} == [ ## Expected uppercase letter, indexed by the number of characters already in kwd.
6048               undef,
6049               0x004F, # O
6050               0x0054, # T
6051               0x0041, # A
6052               0x0054, # T
6053               0x0049, # I
6054               0x004F, # O
6055              ]->[length $self->{kwd}] or ## NOTE(review): with 7 characters in kwd this index is past the table, so nc is compared with undef (numerically 0) -- confirm nc can never be 0 here.
6056              $self->{nc} == [ ## Same table for the lowercase letters.
6057               undef,
6058               0x006F, # o
6059               0x0074, # t
6060               0x0061, # a
6061               0x0074, # t
6062               0x0069, # i
6063               0x006F, # o
6064              ]->[length $self->{kwd}]) {
6065            ## Stay in the state.
6066            $self->{kwd} .= chr $self->{nc};
6067            
6068        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6069          $self->{line_prev} = $self->{line};
6070          $self->{column_prev} = $self->{column};
6071          $self->{column}++;
6072          $self->{nc}
6073              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6074        } else {
6075          $self->{set_nc}->($self);
6076        }
6077      
6078            redo A;
6079          } elsif ((length $self->{kwd}) == 7 and
6080                   ($self->{nc} == 0x004E or # N
6081                    $self->{nc} == 0x006E)) { # n
6082            if ($self->{kwd} ne 'NOTATIO' or $self->{nc} == 0x006E) { ## Any letter of the keyword was not uppercase.
6083              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
6084                              text => 'NOTATION',
6085                              line => $self->{line_prev},
6086                              column => $self->{column_prev} - 6); ## Presumably the column of the keyword's first letter (TODO confirm).
6087            }
6088            $self->{ct} = {type => NOTATION_TOKEN, name => '', ## Declaration token under construction; the name is filled in by later states.
6089                           line => $self->{line_prev},
6090                           column => $self->{column_prev} - 8}; ## Presumably the column of the "<" (TODO confirm).
6091            $self->{state} = DOCTYPE_MD_STATE;
6092            ## Consume the final "N"/"n".
6093        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6094          $self->{line_prev} = $self->{line};
6095          $self->{column_prev} = $self->{column};
6096          $self->{column}++;
6097          $self->{nc}
6098              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6099        } else {
6100          $self->{set_nc}->($self);
6101        }
6102      
6103            redo A;
6104          } else {
6105            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
6106                            line => $self->{line_prev},
6107                            column => $self->{column_prev} - 1
6108                                - (length $self->{kwd})
6109                                + 1 * ($self->{nc} == -1)); ## Steps back over the collected keyword; one column further right when nc is -1.
6110            $self->{state} = BOGUS_COMMENT_STATE;
6111            ## Reconsume: nc is not advanced, so the bogus comment state sees the same character.
6112            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6113            redo A;
6114          }
6115        } elsif ($self->{state} == DOCTYPE_MD_STATE) { ## Entered right after a complete ENTITY/ELEMENT/ATTLIST/NOTATION keyword; ct holds the new declaration token.
6116          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
6117          ## "DOCTYPE NOTATION state".
6118    
6119          if ($is_space->{$self->{nc}}) {
6120            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
6121            $self->{state} = BEFORE_MD_NAME_STATE;
6122            ## Consume the space character.
6123        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6124          $self->{line_prev} = $self->{line};
6125          $self->{column_prev} = $self->{column};
6126          $self->{column}++;
6127          $self->{nc}
6128              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6129        } else {
6130          $self->{set_nc}->($self);
6131        }
6132      
6133            redo A;
6134          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6135                   $self->{nc} == 0x0025) { # %
6136            ## XML5: Switch to the "DOCTYPE bogus comment state".
6137            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6138            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE; ## Recover: handle the "%" as if a space had preceded it.
6139            ## Consume the "%".
6140        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6141          $self->{line_prev} = $self->{line};
6142          $self->{column_prev} = $self->{column};
6143          $self->{column}++;
6144          $self->{nc}
6145              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6146        } else {
6147          $self->{set_nc}->($self);
6148        }
6149      
6150            redo A;
6151          } elsif ($self->{nc} == -1) { ## -1 is presumably the end-of-input marker (TODO confirm in set_nc).
6152            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6153            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6154            ## Reconsume: nc is not advanced; the token in ct is not returned on this path.
6155            redo A;
6156          } elsif ($self->{nc} == 0x003E) { # >
6157            ## XML5: Switch to the "DOCTYPE bogus comment state".
6158            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6159            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6160            ## Consume the ">"; the token in ct is not returned on this path.
6161        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6162          $self->{line_prev} = $self->{line};
6163          $self->{column_prev} = $self->{column};
6164          $self->{column}++;
6165          $self->{nc}
6166              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6167        } else {
6168          $self->{set_nc}->($self);
6169        }
6170      
6171            redo A;
6172          } else {
6173            ## XML5: Switch to the "DOCTYPE bogus comment state".
6174            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
6175            $self->{state} = BEFORE_MD_NAME_STATE; ## nc is not advanced here, so that state sees the same character.
6176            redo A;
6177          }
6178        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) { ## Skips spaces before the declaration name and starts collecting the name in ct.
6179          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
6180          ## before state", "DOCTYPE ATTLIST name before state".
6181    
6182          if ($is_space->{$self->{nc}}) {
6183            ## Stay in the state and drop the space character.
6184            
6185        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6186          $self->{line_prev} = $self->{line};
6187          $self->{column_prev} = $self->{column};
6188          $self->{column}++;
6189          $self->{nc}
6190              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6191        } else {
6192          $self->{set_nc}->($self);
6193        }
6194      
6195            redo A;
6196          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
6197                   $self->{nc} == 0x0025) { # %
6198            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE; ## Only reachable while ct is still a general entity token.
6199            ## Consume the "%".
6200        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6201          $self->{line_prev} = $self->{line};
6202          $self->{column_prev} = $self->{column};
6203          $self->{column}++;
6204          $self->{nc}
6205              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6206        } else {
6207          $self->{set_nc}->($self);
6208        }
6209      
6210            redo A;
6211          } elsif ($self->{nc} == 0x003E) { # >
6212            ## XML5: Same as "Anything else".
6213            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6214            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6215            ## Consume the ">"; the token in ct is not returned on this path.
6216        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6217          $self->{line_prev} = $self->{line};
6218          $self->{column_prev} = $self->{column};
6219          $self->{column}++;
6220          $self->{nc}
6221              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6222        } else {
6223          $self->{set_nc}->($self);
6224        }
6225      
6226            redo A;
6227          } elsif ($self->{nc} == -1) { ## -1 is presumably the end-of-input marker (TODO confirm in set_nc).
6228            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6229            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6230            ## Reconsume: nc is not advanced; the token in ct is not returned on this path.
6231            redo A;
6232          } else {
6233            ## XML5: [ATTLIST] Not defined yet.
6234            $self->{ct}->{name} .= chr $self->{nc}; ## First character of the name; no name-character check is made here.
6235            $self->{state} = MD_NAME_STATE;
6236            
6237        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6238          $self->{line_prev} = $self->{line};
6239          $self->{column_prev} = $self->{column};
6240          $self->{column}++;
6241          $self->{nc}
6242              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6243        } else {
6244          $self->{set_nc}->($self);
6245        }
6246      
6247            redo A;
6248          }
6249        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) { ## Entered after a "%" in an ENTITY declaration; a following space makes it a parameter entity.
6250          if ($is_space->{$self->{nc}}) {
6251            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
6252            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN; ## Retype the token in place; its other fields are kept.
6253            $self->{state} = BEFORE_MD_NAME_STATE;
6254            ## Consume the space character.
6255        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6256          $self->{line_prev} = $self->{line};
6257          $self->{column_prev} = $self->{column};
6258          $self->{column}++;
6259          $self->{nc}
6260              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6261        } else {
6262          $self->{set_nc}->($self);
6263        }
6264      
6265            redo A;
6266          } elsif ($self->{nc} == 0x003E) { # >
6267            ## XML5: Same as "Anything else".
6268            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
6269            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6270            ## Consume the ">"; the token in ct is not returned on this path.
6271        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6272          $self->{line_prev} = $self->{line};
6273          $self->{column_prev} = $self->{column};
6274          $self->{column}++;
6275          $self->{nc}
6276              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6277        } else {
6278          $self->{set_nc}->($self);
6279        }
6280      
6281            redo A;
6282          } elsif ($self->{nc} == -1) { ## -1 is presumably the end-of-input marker (TODO confirm in set_nc).
6283            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6284            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6285            ## Reconsume: nc is not advanced; the token in ct is not returned on this path.
6286            redo A;
6287          } else {
6288            ## XML5: No parse error.
6289            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
6290            $self->{state} = BOGUS_COMMENT_STATE;
6291            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
6292            ## Reconsume: nc is not advanced, so the bogus comment state sees the same character.
6293            redo A;
6294          }
6295        } elsif ($self->{state} == MD_NAME_STATE) {
6296          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
6297          
6298          if ($is_space->{$self->{nc}}) {
6299            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6300              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6301            } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6302              $self->{state} = AFTER_ELEMENT_NAME_STATE;
6303            } else { # ENTITY/NOTATION
6304              $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6305            }
6306            
6307        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6308          $self->{line_prev} = $self->{line};
6309          $self->{column_prev} = $self->{column};
6310          $self->{column}++;
6311          $self->{nc}
6312              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6313        } else {
6314          $self->{set_nc}->($self);
6315        }
6316      
6317            redo A;
6318          } elsif ($self->{nc} == 0x003E) { # >
6319            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6320              #
6321            } else {
6322              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
6323            }
6324            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6325            
6326        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6327          $self->{line_prev} = $self->{line};
6328          $self->{column_prev} = $self->{column};
6329          $self->{column}++;
6330          $self->{nc}
6331              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6332        } else {
6333          $self->{set_nc}->($self);
6334        }
6335      
6336            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6337            redo A;
6338          } elsif ($self->{nc} == -1) {
6339            ## XML5: [ATTLIST] No parse error.
6340            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
6341            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6342            ## Reconsume.
6343            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
6344            redo A;
6345          } else {
6346            ## XML5: [ATTLIST] Not defined yet.
6347            $self->{ct}->{name} .= chr $self->{nc};
6348            ## Stay in the state.
6349            
6350        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6351          $self->{line_prev} = $self->{line};
6352          $self->{column_prev} = $self->{column};
6353          $self->{column}++;
6354          $self->{nc}
6355              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6356        } else {
6357          $self->{set_nc}->($self);
6358        }
6359      
6360            redo A;
6361          }
6362        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
6363          if ($is_space->{$self->{nc}}) {
6364            ## Stay in the state.
6365            
6366        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6367          $self->{line_prev} = $self->{line};
6368          $self->{column_prev} = $self->{column};
6369          $self->{column}++;
6370          $self->{nc}
6371              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6372        } else {
6373          $self->{set_nc}->($self);
6374        }
6375      
6376            redo A;
6377          } elsif ($self->{nc} == 0x003E) { # >
6378            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6379            
6380        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6381          $self->{line_prev} = $self->{line};
6382          $self->{column_prev} = $self->{column};
6383          $self->{column}++;
6384          $self->{nc}
6385              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6386        } else {
6387          $self->{set_nc}->($self);
6388        }
6389      
6390            return  ($self->{ct}); # ATTLIST
6391            redo A;
6392          } elsif ($self->{nc} == -1) {
6393            ## XML5: No parse error.
6394            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6395            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6396            return  ($self->{ct});
6397            redo A;
6398          } else {
6399            ## XML5: Not defined yet.
6400            $self->{ca} = {name => chr ($self->{nc}), # attrdef
6401                           tokens => [],
6402                           line => $self->{line}, column => $self->{column}};
6403            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE;
6404            
6405        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6406          $self->{line_prev} = $self->{line};
6407          $self->{column_prev} = $self->{column};
6408          $self->{column}++;
6409          $self->{nc}
6410              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6411        } else {
6412          $self->{set_nc}->($self);
6413        }
6414      
6415            redo A;
6416          }
6417        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) {
6418          if ($is_space->{$self->{nc}}) {
6419            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE;
6420            
6421        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6422          $self->{line_prev} = $self->{line};
6423          $self->{column_prev} = $self->{column};
6424          $self->{column}++;
6425          $self->{nc}
6426              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6427        } else {
6428          $self->{set_nc}->($self);
6429        }
6430      
6431            redo A;
6432          } elsif ($self->{nc} == 0x003E) { # >
6433            ## XML5: Same as "anything else".
6434            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6435            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6436            
6437        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6438          $self->{line_prev} = $self->{line};
6439          $self->{column_prev} = $self->{column};
6440          $self->{column}++;
6441          $self->{nc}
6442              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6443        } else {
6444          $self->{set_nc}->($self);
6445        }
6446      
6447            return  ($self->{ct}); # ATTLIST
6448            redo A;
6449          } elsif ($self->{nc} == 0x0028) { # (
6450            ## XML5: Same as "anything else".
6451            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6452            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6453            
6454        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6455          $self->{line_prev} = $self->{line};
6456          $self->{column_prev} = $self->{column};
6457          $self->{column}++;
6458          $self->{nc}
6459              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6460        } else {
6461          $self->{set_nc}->($self);
6462        }
6463      
6464            redo A;
6465          } elsif ($self->{nc} == -1) {
6466            ## XML5: No parse error.
6467            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6468            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6469            
6470        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6471          $self->{line_prev} = $self->{line};
6472          $self->{column_prev} = $self->{column};
6473          $self->{column}++;
6474          $self->{nc}
6475              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6476        } else {
6477          $self->{set_nc}->($self);
6478        }
6479      
6480            return  ($self->{ct}); # ATTLIST
6481            redo A;
6482          } else {
6483            ## XML5: Not defined yet.
6484            $self->{ca}->{name} .= chr $self->{nc};
6485            ## Stay in the state.
6486            
6487        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6488          $self->{line_prev} = $self->{line};
6489          $self->{column_prev} = $self->{column};
6490          $self->{column}++;
6491          $self->{nc}
6492              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6493        } else {
6494          $self->{set_nc}->($self);
6495        }
6496      
6497            redo A;
6498          }
6499        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) {
6500          if ($is_space->{$self->{nc}}) {
6501            ## Stay in the state.
6502            
6503        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6504          $self->{line_prev} = $self->{line};
6505          $self->{column_prev} = $self->{column};
6506          $self->{column}++;
6507          $self->{nc}
6508              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6509        } else {
6510          $self->{set_nc}->($self);
6511        }
6512      
6513            redo A;
6514          } elsif ($self->{nc} == 0x003E) { # >
6515            ## XML5: Same as "anything else".
6516            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type
6517            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6518            
6519        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6520          $self->{line_prev} = $self->{line};
6521          $self->{column_prev} = $self->{column};
6522          $self->{column}++;
6523          $self->{nc}
6524              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6525        } else {
6526          $self->{set_nc}->($self);
6527        }
6528      
6529            return  ($self->{ct}); # ATTLIST
6530            redo A;
6531          } elsif ($self->{nc} == 0x0028) { # (
6532            ## XML5: Same as "anything else".
6533            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6534            
6535        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6536          $self->{line_prev} = $self->{line};
6537          $self->{column_prev} = $self->{column};
6538          $self->{column}++;
6539          $self->{nc}
6540              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6541        } else {
6542          $self->{set_nc}->($self);
6543        }
6544      
6545            redo A;
6546          } elsif ($self->{nc} == -1) {
6547            ## XML5: No parse error.
6548            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6549            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6550            
6551        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6552          $self->{line_prev} = $self->{line};
6553          $self->{column_prev} = $self->{column};
6554          $self->{column}++;
6555          $self->{nc}
6556              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6557        } else {
6558          $self->{set_nc}->($self);
6559        }
6560      
6561            return  ($self->{ct});
6562            redo A;
6563          } else {
6564            ## XML5: Not defined yet.
6565            $self->{ca}->{type} = chr $self->{nc};
6566            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE;
6567            
6568        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6569          $self->{line_prev} = $self->{line};
6570          $self->{column_prev} = $self->{column};
6571          $self->{column}++;
6572          $self->{nc}
6573              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6574        } else {
6575          $self->{set_nc}->($self);
6576        }
6577      
6578            redo A;
6579          }
6580        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) {
6581          if ($is_space->{$self->{nc}}) {
6582            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE;
6583            
6584        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6585          $self->{line_prev} = $self->{line};
6586          $self->{column_prev} = $self->{column};
6587          $self->{column}++;
6588          $self->{nc}
6589              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6590        } else {
6591          $self->{set_nc}->($self);
6592        }
6593      
6594            redo A;
6595          } elsif ($self->{nc} == 0x0023) { # #
6596            ## XML5: Same as "anything else".
6597            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6598            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6599            
6600        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6601          $self->{line_prev} = $self->{line};
6602          $self->{column_prev} = $self->{column};
6603          $self->{column}++;
6604          $self->{nc}
6605              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6606        } else {
6607          $self->{set_nc}->($self);
6608        }
6609      
6610            redo A;
6611          } elsif ($self->{nc} == 0x0022) { # "
6612            ## XML5: Same as "anything else".
6613            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6614            $self->{ca}->{value} = '';
6615            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6616            
6617        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6618          $self->{line_prev} = $self->{line};
6619          $self->{column_prev} = $self->{column};
6620          $self->{column}++;
6621          $self->{nc}
6622              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6623        } else {
6624          $self->{set_nc}->($self);
6625        }
6626      
6627            redo A;
6628          } elsif ($self->{nc} == 0x0027) { # '
6629            ## XML5: Same as "anything else".
6630            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
6631            $self->{ca}->{value} = '';
6632            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6633            
6634        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6635          $self->{line_prev} = $self->{line};
6636          $self->{column_prev} = $self->{column};
6637          $self->{column}++;
6638          $self->{nc}
6639              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6640        } else {
6641          $self->{set_nc}->($self);
6642        }
6643      
6644            redo A;
6645          } elsif ($self->{nc} == 0x003E) { # >
6646            ## XML5: Same as "anything else".
6647            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6648            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6649            
6650        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6651          $self->{line_prev} = $self->{line};
6652          $self->{column_prev} = $self->{column};
6653          $self->{column}++;
6654          $self->{nc}
6655              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6656        } else {
6657          $self->{set_nc}->($self);
6658        }
6659      
6660            return  ($self->{ct}); # ATTLIST
6661            redo A;
6662          } elsif ($self->{nc} == 0x0028) { # (
6663            ## XML5: Same as "anything else".
6664            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type
6665            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6666            
6667        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6668          $self->{line_prev} = $self->{line};
6669          $self->{column_prev} = $self->{column};
6670          $self->{column}++;
6671          $self->{nc}
6672              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6673        } else {
6674          $self->{set_nc}->($self);
6675        }
6676      
6677            redo A;
6678          } elsif ($self->{nc} == -1) {
6679            ## XML5: No parse error.
6680            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6681            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6682            
6683        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6684          $self->{line_prev} = $self->{line};
6685          $self->{column_prev} = $self->{column};
6686          $self->{column}++;
6687          $self->{nc}
6688              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6689        } else {
6690          $self->{set_nc}->($self);
6691        }
6692      
6693            return  ($self->{ct});
6694            redo A;
6695          } else {
6696            ## XML5: Not defined yet.
6697            $self->{ca}->{type} .= chr $self->{nc};
6698            ## Stay in the state.
6699            
6700        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6701          $self->{line_prev} = $self->{line};
6702          $self->{column_prev} = $self->{column};
6703          $self->{column}++;
6704          $self->{nc}
6705              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6706        } else {
6707          $self->{set_nc}->($self);
6708        }
6709      
6710            redo A;
6711          }
6712        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) {
6713          if ($is_space->{$self->{nc}}) {
6714            ## Stay in the state.
6715            
6716        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6717          $self->{line_prev} = $self->{line};
6718          $self->{column_prev} = $self->{column};
6719          $self->{column}++;
6720          $self->{nc}
6721              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6722        } else {
6723          $self->{set_nc}->($self);
6724        }
6725      
6726            redo A;
6727          } elsif ($self->{nc} == 0x0028) { # (
6728            ## XML5: Same as "anything else".
6729            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6730            
6731        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6732          $self->{line_prev} = $self->{line};
6733          $self->{column_prev} = $self->{column};
6734          $self->{column}++;
6735          $self->{nc}
6736              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6737        } else {
6738          $self->{set_nc}->($self);
6739        }
6740      
6741            redo A;
6742          } elsif ($self->{nc} == 0x0023) { # #
6743            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
6744            
6745        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6746          $self->{line_prev} = $self->{line};
6747          $self->{column_prev} = $self->{column};
6748          $self->{column}++;
6749          $self->{nc}
6750              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6751        } else {
6752          $self->{set_nc}->($self);
6753        }
6754      
6755            redo A;
6756          } elsif ($self->{nc} == 0x0022) { # "
6757            ## XML5: Same as "anything else".
6758            $self->{ca}->{value} = '';
6759            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
6760            
6761        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6762          $self->{line_prev} = $self->{line};
6763          $self->{column_prev} = $self->{column};
6764          $self->{column}++;
6765          $self->{nc}
6766              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6767        } else {
6768          $self->{set_nc}->($self);
6769        }
6770      
6771            redo A;
6772          } elsif ($self->{nc} == 0x0027) { # '
6773            ## XML5: Same as "anything else".
6774            $self->{ca}->{value} = '';
6775            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
6776            
6777        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6778          $self->{line_prev} = $self->{line};
6779          $self->{column_prev} = $self->{column};
6780          $self->{column}++;
6781          $self->{nc}
6782              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6783        } else {
6784          $self->{set_nc}->($self);
6785        }
6786      
6787            redo A;
6788          } elsif ($self->{nc} == 0x003E) { # >
6789            ## XML5: Same as "anything else".
6790            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
6791            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6792            
6793        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6794          $self->{line_prev} = $self->{line};
6795          $self->{column_prev} = $self->{column};
6796          $self->{column}++;
6797          $self->{nc}
6798              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6799        } else {
6800          $self->{set_nc}->($self);
6801        }
6802      
6803            return  ($self->{ct}); # ATTLIST
6804            redo A;
6805          } elsif ($self->{nc} == -1) {
6806            ## XML5: No parse error.
6807            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6808            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6809            
6810        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6811          $self->{line_prev} = $self->{line};
6812          $self->{column_prev} = $self->{column};
6813          $self->{column}++;
6814          $self->{nc}
6815              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6816        } else {
6817          $self->{set_nc}->($self);
6818        }
6819      
6820            return  ($self->{ct});
6821            redo A;
6822          } else {
6823            ## XML5: Switch to the "DOCTYPE bogus comment state".
6824            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
6825            $self->{ca}->{value} = '';
6826            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
6827            ## Reconsume.
6828            redo A;
6829          }
6830        } elsif ($self->{state} == BEFORE_ALLOWED_TOKEN_STATE) {
6831          if ($is_space->{$self->{nc}}) {
6832            ## Stay in the state.
6833            
6834        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6835          $self->{line_prev} = $self->{line};
6836          $self->{column_prev} = $self->{column};
6837          $self->{column}++;
6838          $self->{nc}
6839              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6840        } else {
6841          $self->{set_nc}->($self);
6842        }
6843      
6844            redo A;
6845          } elsif ($self->{nc} == 0x007C) { # |
6846            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6847            ## Stay in the state.
6848            
6849        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6850          $self->{line_prev} = $self->{line};
6851          $self->{column_prev} = $self->{column};
6852          $self->{column}++;
6853          $self->{nc}
6854              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6855        } else {
6856          $self->{set_nc}->($self);
6857        }
6858      
6859            redo A;
6860          } elsif ($self->{nc} == 0x0029) { # )
6861            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type
6862            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6863            
6864        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6865          $self->{line_prev} = $self->{line};
6866          $self->{column_prev} = $self->{column};
6867          $self->{column}++;
6868          $self->{nc}
6869              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6870        } else {
6871          $self->{set_nc}->($self);
6872        }
6873      
6874            redo A;
6875          } elsif ($self->{nc} == 0x003E) { # >
6876            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6877            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6878            
6879        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6880          $self->{line_prev} = $self->{line};
6881          $self->{column_prev} = $self->{column};
6882          $self->{column}++;
6883          $self->{nc}
6884              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6885        } else {
6886          $self->{set_nc}->($self);
6887        }
6888      
6889            return  ($self->{ct}); # ATTLIST
6890            redo A;
6891          } elsif ($self->{nc} == -1) {
6892            ## XML5: No parse error.
6893            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6894            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6895            
6896        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6897          $self->{line_prev} = $self->{line};
6898          $self->{column_prev} = $self->{column};
6899          $self->{column}++;
6900          $self->{nc}
6901              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6902        } else {
6903          $self->{set_nc}->($self);
6904        }
6905      
6906            return  ($self->{ct});
6907            redo A;
6908          } else {
6909            push @{$self->{ca}->{tokens}}, chr $self->{nc};
6910            $self->{state} = ALLOWED_TOKEN_STATE;
6911            
6912        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6913          $self->{line_prev} = $self->{line};
6914          $self->{column_prev} = $self->{column};
6915          $self->{column}++;
6916          $self->{nc}
6917              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6918        } else {
6919          $self->{set_nc}->($self);
6920        }
6921      
6922            redo A;
6923          }
6924        } elsif ($self->{state} == ALLOWED_TOKEN_STATE) {
6925          if ($is_space->{$self->{nc}}) {
6926            $self->{state} = AFTER_ALLOWED_TOKEN_STATE;
6927            
6928        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6929          $self->{line_prev} = $self->{line};
6930          $self->{column_prev} = $self->{column};
6931          $self->{column}++;
6932          $self->{nc}
6933              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6934        } else {
6935          $self->{set_nc}->($self);
6936        }
6937      
6938            redo A;
6939          } elsif ($self->{nc} == 0x007C) { # |
6940            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
6941            
6942        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6943          $self->{line_prev} = $self->{line};
6944          $self->{column_prev} = $self->{column};
6945          $self->{column}++;
6946          $self->{nc}
6947              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6948        } else {
6949          $self->{set_nc}->($self);
6950        }
6951      
6952            redo A;
6953          } elsif ($self->{nc} == 0x0029) { # )
6954            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
6955            
6956        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6957          $self->{line_prev} = $self->{line};
6958          $self->{column_prev} = $self->{column};
6959          $self->{column}++;
6960          $self->{nc}
6961              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6962        } else {
6963          $self->{set_nc}->($self);
6964        }
6965      
6966            redo A;
6967          } elsif ($self->{nc} == 0x003E) { # >
6968            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
6969            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
6970            
6971        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6972          $self->{line_prev} = $self->{line};
6973          $self->{column_prev} = $self->{column};
6974          $self->{column}++;
6975          $self->{nc}
6976              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6977        } else {
6978          $self->{set_nc}->($self);
6979        }
6980      
6981            return  ($self->{ct}); # ATTLIST
6982            redo A;
6983          } elsif ($self->{nc} == -1) {
6984            ## XML5: No parse error.
6985            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
6986            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
6987            
6988        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
6989          $self->{line_prev} = $self->{line};
6990          $self->{column_prev} = $self->{column};
6991          $self->{column}++;
6992          $self->{nc}
6993              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
6994        } else {
6995          $self->{set_nc}->($self);
6996        }
6997      
6998            return  ($self->{ct});
6999            redo A;
7000          } else {
7001            $self->{ca}->{tokens}->[-1] .= chr $self->{nc};
7002            ## Stay in the state.
7003            
7004        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7005          $self->{line_prev} = $self->{line};
7006          $self->{column_prev} = $self->{column};
7007          $self->{column}++;
7008          $self->{nc}
7009              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7010        } else {
7011          $self->{set_nc}->($self);
7012        }
7013      
7014            redo A;
7015          }
7016        } elsif ($self->{state} == AFTER_ALLOWED_TOKEN_STATE) {
7017          if ($is_space->{$self->{nc}}) {
7018            ## Stay in the state.
7019            
7020        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7021          $self->{line_prev} = $self->{line};
7022          $self->{column_prev} = $self->{column};
7023          $self->{column}++;
7024          $self->{nc}
7025              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7026        } else {
7027          $self->{set_nc}->($self);
7028        }
7029      
7030            redo A;
7031          } elsif ($self->{nc} == 0x007C) { # |
7032            $self->{state} = BEFORE_ALLOWED_TOKEN_STATE;
7033            
7034        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7035          $self->{line_prev} = $self->{line};
7036          $self->{column_prev} = $self->{column};
7037          $self->{column}++;
7038          $self->{nc}
7039              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7040        } else {
7041          $self->{set_nc}->($self);
7042        }
7043      
7044            redo A;
7045          } elsif ($self->{nc} == 0x0029) { # )
7046            $self->{state} = AFTER_ALLOWED_TOKENS_STATE;
7047            
7048        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7049          $self->{line_prev} = $self->{line};
7050          $self->{column_prev} = $self->{column};
7051          $self->{column}++;
7052          $self->{nc}
7053              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7054        } else {
7055          $self->{set_nc}->($self);
7056        }
7057      
7058            redo A;
7059          } elsif ($self->{nc} == 0x003E) { # >
7060            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type
7061            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7062            
7063        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7064          $self->{line_prev} = $self->{line};
7065          $self->{column_prev} = $self->{column};
7066          $self->{column}++;
7067          $self->{nc}
7068              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7069        } else {
7070          $self->{set_nc}->($self);
7071        }
7072      
7073            return  ($self->{ct}); # ATTLIST
7074            redo A;
7075          } elsif ($self->{nc} == -1) {
7076            ## XML5: No parse error.
7077            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7078            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7079            
7080        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7081          $self->{line_prev} = $self->{line};
7082          $self->{column_prev} = $self->{column};
7083          $self->{column}++;
7084          $self->{nc}
7085              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7086        } else {
7087          $self->{set_nc}->($self);
7088        }
7089      
7090            return  ($self->{ct});
7091            redo A;
7092          } else {
7093            $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type
7094                            line => $self->{line_prev},
7095                            column => $self->{column_prev});
7096            $self->{ca}->{tokens}->[-1] .= ' ' . chr $self->{nc};
7097            $self->{state} = ALLOWED_TOKEN_STATE;
7098            
7099        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7100          $self->{line_prev} = $self->{line};
7101          $self->{column_prev} = $self->{column};
7102          $self->{column}++;
7103          $self->{nc}
7104              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7105        } else {
7106          $self->{set_nc}->($self);
7107        }
7108      
7109            redo A;
7110          }
7111        } elsif ($self->{state} == AFTER_ALLOWED_TOKENS_STATE) {
7112          if ($is_space->{$self->{nc}}) {
7113            $self->{state} = BEFORE_ATTR_DEFAULT_STATE;
7114            
7115        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7116          $self->{line_prev} = $self->{line};
7117          $self->{column_prev} = $self->{column};
7118          $self->{column}++;
7119          $self->{nc}
7120              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7121        } else {
7122          $self->{set_nc}->($self);
7123        }
7124      
7125            redo A;
7126          } elsif ($self->{nc} == 0x0023) { # #
7127            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7128            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7129            
7130        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7131          $self->{line_prev} = $self->{line};
7132          $self->{column_prev} = $self->{column};
7133          $self->{column}++;
7134          $self->{nc}
7135              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7136        } else {
7137          $self->{set_nc}->($self);
7138        }
7139      
7140            redo A;
7141          } elsif ($self->{nc} == 0x0022) { # "
7142            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7143            $self->{ca}->{value} = '';
7144            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7145            
7146        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7147          $self->{line_prev} = $self->{line};
7148          $self->{column_prev} = $self->{column};
7149          $self->{column}++;
7150          $self->{nc}
7151              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7152        } else {
7153          $self->{set_nc}->($self);
7154        }
7155      
7156            redo A;
7157          } elsif ($self->{nc} == 0x0027) { # '
7158            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7159            $self->{ca}->{value} = '';
7160            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7161            
7162        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7163          $self->{line_prev} = $self->{line};
7164          $self->{column_prev} = $self->{column};
7165          $self->{column}++;
7166          $self->{nc}
7167              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7168        } else {
7169          $self->{set_nc}->($self);
7170        }
7171      
7172            redo A;
7173          } elsif ($self->{nc} == 0x003E) { # >
7174            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7175            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7176            
7177        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7178          $self->{line_prev} = $self->{line};
7179          $self->{column_prev} = $self->{column};
7180          $self->{column}++;
7181          $self->{nc}
7182              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7183        } else {
7184          $self->{set_nc}->($self);
7185        }
7186      
7187            return  ($self->{ct}); # ATTLIST
7188            redo A;
7189          } elsif ($self->{nc} == -1) {
7190            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7191            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7192            
7193        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7194          $self->{line_prev} = $self->{line};
7195          $self->{column_prev} = $self->{column};
7196          $self->{column}++;
7197          $self->{nc}
7198              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7199        } else {
7200          $self->{set_nc}->($self);
7201        }
7202      
7203            return  ($self->{ct});
7204            redo A;
7205          } else {
7206            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7207            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7208            ## Reconsume.
7209            redo A;
7210          }
7211        } elsif ($self->{state} == BEFORE_ATTR_DEFAULT_STATE) {
7212          if ($is_space->{$self->{nc}}) {
7213            ## Stay in the state.
7214            
7215        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7216          $self->{line_prev} = $self->{line};
7217          $self->{column_prev} = $self->{column};
7218          $self->{column}++;
7219          $self->{nc}
7220              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7221        } else {
7222          $self->{set_nc}->($self);
7223        }
7224      
7225            redo A;
7226          } elsif ($self->{nc} == 0x0023) { # #
7227            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE;
7228            
7229        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7230          $self->{line_prev} = $self->{line};
7231          $self->{column_prev} = $self->{column};
7232          $self->{column}++;
7233          $self->{nc}
7234              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7235        } else {
7236          $self->{set_nc}->($self);
7237        }
7238      
7239            redo A;
7240          } elsif ($self->{nc} == 0x0022) { # "
7241            $self->{ca}->{value} = '';
7242            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7243            
7244        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7245          $self->{line_prev} = $self->{line};
7246          $self->{column_prev} = $self->{column};
7247          $self->{column}++;
7248          $self->{nc}
7249              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7250        } else {
7251          $self->{set_nc}->($self);
7252        }
7253      
7254            redo A;
7255          } elsif ($self->{nc} == 0x0027) { # '
7256            $self->{ca}->{value} = '';
7257            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7258            
7259        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7260          $self->{line_prev} = $self->{line};
7261          $self->{column_prev} = $self->{column};
7262          $self->{column}++;
7263          $self->{nc}
7264              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7265        } else {
7266          $self->{set_nc}->($self);
7267        }
7268      
7269            redo A;
7270          } elsif ($self->{nc} == 0x003E) { # >
7271            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7272            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7273            
7274        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7275          $self->{line_prev} = $self->{line};
7276          $self->{column_prev} = $self->{column};
7277          $self->{column}++;
7278          $self->{nc}
7279              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7280        } else {
7281          $self->{set_nc}->($self);
7282        }
7283      
7284            return  ($self->{ct}); # ATTLIST
7285            redo A;
7286          } elsif ($self->{nc} == -1) {
7287            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7288            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7289            
7290        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7291          $self->{line_prev} = $self->{line};
7292          $self->{column_prev} = $self->{column};
7293          $self->{column}++;
7294          $self->{nc}
7295              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7296        } else {
7297          $self->{set_nc}->($self);
7298        }
7299      
7300            return  ($self->{ct});
7301            redo A;
7302          } else {
7303            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type
7304            $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7305            ## Reconsume.
7306            redo A;
7307          }
7308        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) {
7309          if ($is_space->{$self->{nc}}) {
7310            ## XML5: No parse error.
7311            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type
7312            $self->{state} = BOGUS_MD_STATE;
7313            ## Reconsume.
7314            redo A;
7315          } elsif ($self->{nc} == 0x0022) { # "
7316            ## XML5: Same as "anything else".
7317            $self->{ca}->{value} = '';
7318            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7319            
7320        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7321          $self->{line_prev} = $self->{line};
7322          $self->{column_prev} = $self->{column};
7323          $self->{column}++;
7324          $self->{nc}
7325              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7326        } else {
7327          $self->{set_nc}->($self);
7328        }
7329      
7330            redo A;
7331          } elsif ($self->{nc} == 0x0027) { # '
7332            ## XML5: Same as "anything else".
7333            $self->{ca}->{value} = '';
7334            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7335            
7336        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7337          $self->{line_prev} = $self->{line};
7338          $self->{column_prev} = $self->{column};
7339          $self->{column}++;
7340          $self->{nc}
7341              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7342        } else {
7343          $self->{set_nc}->($self);
7344        }
7345      
7346            redo A;
7347          } elsif ($self->{nc} == 0x003E) { # >
7348            ## XML5: Same as "anything else".
7349            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type
7350            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7351            
7352        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7353          $self->{line_prev} = $self->{line};
7354          $self->{column_prev} = $self->{column};
7355          $self->{column}++;
7356          $self->{nc}
7357              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7358        } else {
7359          $self->{set_nc}->($self);
7360        }
7361      
7362            return  ($self->{ct}); # ATTLIST
7363            redo A;
7364          } elsif ($self->{nc} == -1) {
7365            ## XML5: No parse error.
7366            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7367            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7368            
7369        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7370          $self->{line_prev} = $self->{line};
7371          $self->{column_prev} = $self->{column};
7372          $self->{column}++;
7373          $self->{nc}
7374              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7375        } else {
7376          $self->{set_nc}->($self);
7377        }
7378      
7379            return  ($self->{ct});
7380            redo A;
7381          } else {
7382            $self->{ca}->{default} = chr $self->{nc};
7383            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE;
7384            
7385        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7386          $self->{line_prev} = $self->{line};
7387          $self->{column_prev} = $self->{column};
7388          $self->{column}++;
7389          $self->{nc}
7390              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7391        } else {
7392          $self->{set_nc}->($self);
7393        }
7394      
7395            redo A;
7396          }
7397        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) {
7398          if ($is_space->{$self->{nc}}) {
7399            $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE;
7400            
7401        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7402          $self->{line_prev} = $self->{line};
7403          $self->{column_prev} = $self->{column};
7404          $self->{column}++;
7405          $self->{nc}
7406              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7407        } else {
7408          $self->{set_nc}->($self);
7409        }
7410      
7411            redo A;
7412          } elsif ($self->{nc} == 0x0022) { # "
7413            ## XML5: Same as "anything else".
7414            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7415            $self->{ca}->{value} = '';
7416            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7417            
7418        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7419          $self->{line_prev} = $self->{line};
7420          $self->{column_prev} = $self->{column};
7421          $self->{column}++;
7422          $self->{nc}
7423              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7424        } else {
7425          $self->{set_nc}->($self);
7426        }
7427      
7428            redo A;
7429          } elsif ($self->{nc} == 0x0027) { # '
7430            ## XML5: Same as "anything else".
7431            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type
7432            $self->{ca}->{value} = '';
7433            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7434            
7435        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7436          $self->{line_prev} = $self->{line};
7437          $self->{column_prev} = $self->{column};
7438          $self->{column}++;
7439          $self->{nc}
7440              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7441        } else {
7442          $self->{set_nc}->($self);
7443        }
7444      
7445            redo A;
7446          } elsif ($self->{nc} == 0x003E) { # >
7447            ## XML5: Same as "anything else".
7448            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7449            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7450            
7451        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7452          $self->{line_prev} = $self->{line};
7453          $self->{column_prev} = $self->{column};
7454          $self->{column}++;
7455          $self->{nc}
7456              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7457        } else {
7458          $self->{set_nc}->($self);
7459        }
7460      
7461            return  ($self->{ct}); # ATTLIST
7462            redo A;
7463          } elsif ($self->{nc} == -1) {
7464            ## XML5: No parse error.
7465            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7466            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7467            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7468            
7469        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7470          $self->{line_prev} = $self->{line};
7471          $self->{column_prev} = $self->{column};
7472          $self->{column}++;
7473          $self->{nc}
7474              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7475        } else {
7476          $self->{set_nc}->($self);
7477        }
7478      
7479            return  ($self->{ct});
7480            redo A;
7481          } else {
7482            $self->{ca}->{default} .= chr $self->{nc};
7483            ## Stay in the state.
7484            
7485        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7486          $self->{line_prev} = $self->{line};
7487          $self->{column_prev} = $self->{column};
7488          $self->{column}++;
7489          $self->{nc}
7490              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7491        } else {
7492          $self->{set_nc}->($self);
7493        }
7494      
7495            redo A;
7496          }
7497        } elsif ($self->{state} == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) {
7498          if ($is_space->{$self->{nc}}) {
7499            ## Stay in the state.
7500            
7501        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7502          $self->{line_prev} = $self->{line};
7503          $self->{column_prev} = $self->{column};
7504          $self->{column}++;
7505          $self->{nc}
7506              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7507        } else {
7508          $self->{set_nc}->($self);
7509        }
7510      
7511            redo A;
7512          } elsif ($self->{nc} == 0x0022) { # "
7513            $self->{ca}->{value} = '';
7514            $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
7515            
7516        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7517          $self->{line_prev} = $self->{line};
7518          $self->{column_prev} = $self->{column};
7519          $self->{column}++;
7520          $self->{nc}
7521              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7522        } else {
7523          $self->{set_nc}->($self);
7524        }
7525      
7526            redo A;
7527          } elsif ($self->{nc} == 0x0027) { # '
7528            $self->{ca}->{value} = '';
7529            $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
7530            
7531        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7532          $self->{line_prev} = $self->{line};
7533          $self->{column_prev} = $self->{column};
7534          $self->{column}++;
7535          $self->{nc}
7536              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7537        } else {
7538          $self->{set_nc}->($self);
7539        }
7540      
7541            redo A;
7542          } elsif ($self->{nc} == 0x003E) { # >
7543            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7544            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7545            
7546        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7547          $self->{line_prev} = $self->{line};
7548          $self->{column_prev} = $self->{column};
7549          $self->{column}++;
7550          $self->{nc}
7551              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7552        } else {
7553          $self->{set_nc}->($self);
7554        }
7555      
7556            return  ($self->{ct}); # ATTLIST
7557            redo A;
7558          } elsif ($self->{nc} == -1) {
7559            ## XML5: No parse error.
7560            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7561            push @{$self->{ct}->{attrdefs}}, $self->{ca};
7562            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
7563            
7564        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7565          $self->{line_prev} = $self->{line};
7566          $self->{column_prev} = $self->{column};
7567          $self->{column}++;
7568          $self->{nc}
7569              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7570        } else {
7571          $self->{set_nc}->($self);
7572        }
7573      
7574            return  ($self->{ct});
7575            redo A;
7576          } else {
7577            ## XML5: Not defined yet.
7578            if ($self->{ca}->{default} eq 'FIXED') {
7579              $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
7580            } else {
7581              push @{$self->{ct}->{attrdefs}}, $self->{ca};
7582              $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7583            }
7584            ## Reconsume.
7585            redo A;
7586          }
7587        } elsif ($self->{state} == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) {
7588          if ($is_space->{$self->{nc}} or
7589              $self->{nc} == -1 or
7590              $self->{nc} == 0x003E) { # >
7591            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7592            ## Reconsume.
7593            redo A;
7594          } else {
7595            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type
7596            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
7597            ## Reconsume.
7598            redo A;
7599          }
7600        } elsif ($self->{state} == NDATA_STATE) {
7601          ## ASCII case-insensitive
7602          if ($self->{nc} == [
7603                undef,
7604                0x0044, # D
7605                0x0041, # A
7606                0x0054, # T
7607              ]->[length $self->{kwd}] or
7608              $self->{nc} == [
7609                undef,
7610                0x0064, # d
7611                0x0061, # a
7612                0x0074, # t
7613              ]->[length $self->{kwd}]) {
7614            
7615            ## Stay in the state.
7616            $self->{kwd} .= chr $self->{nc};
7617            
7618        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7619          $self->{line_prev} = $self->{line};
7620          $self->{column_prev} = $self->{column};
7621          $self->{column}++;
7622          $self->{nc}
7623              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7624        } else {
7625          $self->{set_nc}->($self);
7626        }
7627      
7628            redo A;
7629          } elsif ((length $self->{kwd}) == 4 and
7630                   ($self->{nc} == 0x0041 or # A
7631                    $self->{nc} == 0x0061)) { # a
7632            if ($self->{kwd} ne 'NDAT' or $self->{nc} == 0x0061) { # a
7633              
7634              $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type
7635                              text => 'NDATA',
7636                              line => $self->{line_prev},
7637                              column => $self->{column_prev} - 4);
7638            } else {
7639              
7640            }
7641            $self->{state} = AFTER_NDATA_STATE;
7642            
7643        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7644          $self->{line_prev} = $self->{line};
7645          $self->{column_prev} = $self->{column};
7646          $self->{column}++;
7647          $self->{nc}
7648              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7649        } else {
7650          $self->{set_nc}->($self);
7651        }
7652      
7653            redo A;
7654          } else {
7655            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7656                            line => $self->{line_prev},
7657                            column => $self->{column_prev} + 1
7658                                - length $self->{kwd});
7659            
7660            $self->{state} = BOGUS_MD_STATE;
7661            ## Reconsume.
7662            redo A;
7663          }
7664        } elsif ($self->{state} == AFTER_NDATA_STATE) {
7665          if ($is_space->{$self->{nc}}) {
7666            $self->{state} = BEFORE_NOTATION_NAME_STATE;
7667            
7668        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7669          $self->{line_prev} = $self->{line};
7670          $self->{column_prev} = $self->{column};
7671          $self->{column}++;
7672          $self->{nc}
7673              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7674        } else {
7675          $self->{set_nc}->($self);
7676        }
7677      
7678            redo A;
7679          } elsif ($self->{nc} == 0x003E) { # >
7680            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7681            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7682            
7683        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7684          $self->{line_prev} = $self->{line};
7685          $self->{column_prev} = $self->{column};
7686          $self->{column}++;
7687          $self->{nc}
7688              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7689        } else {
7690          $self->{set_nc}->($self);
7691        }
7692      
7693            return  ($self->{ct}); # ENTITY
7694            redo A;
7695          } elsif ($self->{nc} == -1) {
7696            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7697            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7698            
7699        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7700          $self->{line_prev} = $self->{line};
7701          $self->{column_prev} = $self->{column};
7702          $self->{column}++;
7703          $self->{nc}
7704              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7705        } else {
7706          $self->{set_nc}->($self);
7707        }
7708      
7709            return  ($self->{ct}); # ENTITY
7710            redo A;
7711          } else {
7712            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type
7713                            line => $self->{line_prev},
7714                            column => $self->{column_prev} + 1
7715                                - length $self->{kwd});
7716            $self->{state} = BOGUS_MD_STATE;
7717            ## Reconsume.
7718            redo A;
7719          }
7720        } elsif ($self->{state} == BEFORE_NOTATION_NAME_STATE) {
7721          if ($is_space->{$self->{nc}}) {
7722            ## Stay in the state.
7723            
7724        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7725          $self->{line_prev} = $self->{line};
7726          $self->{column_prev} = $self->{column};
7727          $self->{column}++;
7728          $self->{nc}
7729              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7730        } else {
7731          $self->{set_nc}->($self);
7732        }
7733      
7734            redo A;
7735          } elsif ($self->{nc} == 0x003E) { # >
7736            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type
7737            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7738            
7739        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7740          $self->{line_prev} = $self->{line};
7741          $self->{column_prev} = $self->{column};
7742          $self->{column}++;
7743          $self->{nc}
7744              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7745        } else {
7746          $self->{set_nc}->($self);
7747        }
7748      
7749            return  ($self->{ct}); # ENTITY
7750            redo A;
7751          } elsif ($self->{nc} == -1) {
7752            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7753            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7754            
7755        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7756          $self->{line_prev} = $self->{line};
7757          $self->{column_prev} = $self->{column};
7758          $self->{column}++;
7759          $self->{nc}
7760              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7761        } else {
7762          $self->{set_nc}->($self);
7763        }
7764      
7765            return  ($self->{ct}); # ENTITY
7766            redo A;
7767          } else {
7768            $self->{ct}->{notation} = chr $self->{nc}; # ENTITY
7769            $self->{state} = NOTATION_NAME_STATE;
7770            
7771        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7772          $self->{line_prev} = $self->{line};
7773          $self->{column_prev} = $self->{column};
7774          $self->{column}++;
7775          $self->{nc}
7776              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7777        } else {
7778          $self->{set_nc}->($self);
7779        }
7780      
7781            redo A;
7782          }
7783        } elsif ($self->{state} == NOTATION_NAME_STATE) {
7784          if ($is_space->{$self->{nc}}) {
7785            $self->{state} = AFTER_MD_DEF_STATE;
7786            
7787        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7788          $self->{line_prev} = $self->{line};
7789          $self->{column_prev} = $self->{column};
7790          $self->{column}++;
7791          $self->{nc}
7792              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7793        } else {
7794          $self->{set_nc}->($self);
7795        }
7796      
7797            redo A;
7798          } elsif ($self->{nc} == 0x003E) { # >
7799            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7800            
7801        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7802          $self->{line_prev} = $self->{line};
7803          $self->{column_prev} = $self->{column};
7804          $self->{column}++;
7805          $self->{nc}
7806              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7807        } else {
7808          $self->{set_nc}->($self);
7809        }
7810      
7811            return  ($self->{ct}); # ENTITY
7812            redo A;
7813          } elsif ($self->{nc} == -1) {
7814            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7815            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7816            
7817        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7818          $self->{line_prev} = $self->{line};
7819          $self->{column_prev} = $self->{column};
7820          $self->{column}++;
7821          $self->{nc}
7822              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7823        } else {
7824          $self->{set_nc}->($self);
7825        }
7826      
7827            return  ($self->{ct}); # ENTITY
7828            redo A;
7829          } else {
7830            $self->{ct}->{notation} .= chr $self->{nc}; # ENTITY
7831            ## Stay in the state.
7832            
7833        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7834          $self->{line_prev} = $self->{line};
7835          $self->{column_prev} = $self->{column};
7836          $self->{column}++;
7837          $self->{nc}
7838              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7839        } else {
7840          $self->{set_nc}->($self);
7841        }
7842      
7843            redo A;
7844          }
7845        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7846          if ($self->{nc} == 0x0022) { # "
7847            $self->{state} = AFTER_MD_DEF_STATE;
7848            
7849        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7850          $self->{line_prev} = $self->{line};
7851          $self->{column_prev} = $self->{column};
7852          $self->{column}++;
7853          $self->{nc}
7854              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7855        } else {
7856          $self->{set_nc}->($self);
7857        }
7858      
7859            redo A;
7860          } elsif ($self->{nc} == 0x0026) { # &
7861            $self->{prev_state} = $self->{state};
7862            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7863            $self->{entity_add} = 0x0022; # "
7864            
7865        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7866          $self->{line_prev} = $self->{line};
7867          $self->{column_prev} = $self->{column};
7868          $self->{column}++;
7869          $self->{nc}
7870              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7871        } else {
7872          $self->{set_nc}->($self);
7873        }
7874      
7875            redo A;
7876    ## TODO: %
7877          } elsif ($self->{nc} == -1) {
7878            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7879            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7880            ## Reconsume.
7881            return  ($self->{ct}); # ENTITY
7882            redo A;
7883          } else {
7884            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7885            
7886        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7887          $self->{line_prev} = $self->{line};
7888          $self->{column_prev} = $self->{column};
7889          $self->{column}++;
7890          $self->{nc}
7891              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7892        } else {
7893          $self->{set_nc}->($self);
7894        }
7895      
7896            redo A;
7897          }
7898        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7899          if ($self->{nc} == 0x0027) { # '
7900            $self->{state} = AFTER_MD_DEF_STATE;
7901            
7902        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7903          $self->{line_prev} = $self->{line};
7904          $self->{column_prev} = $self->{column};
7905          $self->{column}++;
7906          $self->{nc}
7907              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7908        } else {
7909          $self->{set_nc}->($self);
7910        }
7911      
7912            redo A;
7913          } elsif ($self->{nc} == 0x0026) { # &
7914            $self->{prev_state} = $self->{state};
7915            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7916            $self->{entity_add} = 0x0027; # '
7917            
7918        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7919          $self->{line_prev} = $self->{line};
7920          $self->{column_prev} = $self->{column};
7921          $self->{column}++;
7922          $self->{nc}
7923              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7924        } else {
7925          $self->{set_nc}->($self);
7926        }
7927      
7928            redo A;
7929    ## TODO: %
7930          } elsif ($self->{nc} == -1) {
7931            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7932            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7933            ## Reconsume.
7934            return  ($self->{ct}); # ENTITY
7935            redo A;
7936          } else {
7937            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7938            
7939        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7940          $self->{line_prev} = $self->{line};
7941          $self->{column_prev} = $self->{column};
7942          $self->{column}++;
7943          $self->{nc}
7944              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7945        } else {
7946          $self->{set_nc}->($self);
7947        }
7948      
7949            redo A;
7950          }
7951        } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
7952          if ($is_space->{$self->{nc}} or
7953              {
7954                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7955                $self->{entity_add} => 1,
7956              }->{$self->{nc}}) {
7957            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
7958                            line => $self->{line_prev},
7959                            column => $self->{column_prev}
7960                                + ($self->{nc} == -1 ? 1 : 0));
7961            ## Don't consume
7962            ## Return nothing.
7963            #
7964          } elsif ($self->{nc} == 0x0023) { # #
7965            $self->{ca} = $self->{ct};
7966            $self->{state} = ENTITY_HASH_STATE;
7967            $self->{kwd} = '#';
7968            
7969        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7970          $self->{line_prev} = $self->{line};
7971          $self->{column_prev} = $self->{column};
7972          $self->{column}++;
7973          $self->{nc}
7974              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7975        } else {
7976          $self->{set_nc}->($self);
7977        }
7978      
7979            redo A;
7980          } else {
7981            #
7982          }
7983    
7984          $self->{ct}->{value} .= '&';
7985          $self->{state} = $self->{prev_state};
7986          ## Reconsume.
7987          redo A;
7988        } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
7989          if ($is_space->{$self->{nc}}) {
7990            $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
7991            
7992        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7993          $self->{line_prev} = $self->{line};
7994          $self->{column_prev} = $self->{column};
7995          $self->{column}++;
7996          $self->{nc}
7997              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7998        } else {
7999          $self->{set_nc}->($self);
8000        }
8001      
8002            redo A;
8003          } elsif ($self->{nc} == 0x0028) { # (
8004            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8005            $self->{ct}->{content} = ['('];
8006            $self->{group_depth} = 1;
8007            
8008        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8009          $self->{line_prev} = $self->{line};
8010          $self->{column_prev} = $self->{column};
8011          $self->{column}++;
8012          $self->{nc}
8013              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8014        } else {
8015          $self->{set_nc}->($self);
8016        }
8017      
8018            redo A;
8019          } elsif ($self->{nc} == 0x003E) { # >
8020            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
8021            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8022            
8023        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8024          $self->{line_prev} = $self->{line};
8025          $self->{column_prev} = $self->{column};
8026          $self->{column}++;
8027          $self->{nc}
8028              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8029        } else {
8030          $self->{set_nc}->($self);
8031        }
8032      
8033            return  ($self->{ct}); # ELEMENT
8034            redo A;
8035          } elsif ($self->{nc} == -1) {
8036            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8037            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8038            
8039        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8040          $self->{line_prev} = $self->{line};
8041          $self->{column_prev} = $self->{column};
8042          $self->{column}++;
8043          $self->{nc}
8044              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8045        } else {
8046          $self->{set_nc}->($self);
8047        }
8048      
8049            return  ($self->{ct}); # ELEMENT
8050            redo A;
8051          } else {
8052            $self->{ct}->{content} = [chr $self->{nc}];
8053            $self->{state} = CONTENT_KEYWORD_STATE;
8054            
8055        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8056          $self->{line_prev} = $self->{line};
8057          $self->{column_prev} = $self->{column};
8058          $self->{column}++;
8059          $self->{nc}
8060              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8061        } else {
8062          $self->{set_nc}->($self);
8063        }
8064      
8065            redo A;
8066          }
8067        } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8068          if ($is_space->{$self->{nc}}) {
8069            $self->{state} = AFTER_MD_DEF_STATE;
8070            
8071        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8072          $self->{line_prev} = $self->{line};
8073          $self->{column_prev} = $self->{column};
8074          $self->{column}++;
8075          $self->{nc}
8076              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8077        } else {
8078          $self->{set_nc}->($self);
8079        }
8080      
8081            redo A;
8082          } elsif ($self->{nc} == 0x003E) { # >
8083            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8084            
8085        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8086          $self->{line_prev} = $self->{line};
8087          $self->{column_prev} = $self->{column};
8088          $self->{column}++;
8089          $self->{nc}
8090              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8091        } else {
8092          $self->{set_nc}->($self);
8093        }
8094      
8095            return  ($self->{ct}); # ELEMENT
8096            redo A;
8097          } elsif ($self->{nc} == -1) {
8098            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8099            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8100            
8101        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8102          $self->{line_prev} = $self->{line};
8103          $self->{column_prev} = $self->{column};
8104          $self->{column}++;
8105          $self->{nc}
8106              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8107        } else {
8108          $self->{set_nc}->($self);
8109        }
8110      
8111            return  ($self->{ct}); # ELEMENT
8112            redo A;
8113          } else {
8114            $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8115            ## Stay in the state.
8116            
8117        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8118          $self->{line_prev} = $self->{line};
8119          $self->{column_prev} = $self->{column};
8120          $self->{column}++;
8121          $self->{nc}
8122              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8123        } else {
8124          $self->{set_nc}->($self);
8125        }
8126      
8127            redo A;
8128          }
8129        } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8130          if ($is_space->{$self->{nc}}) {
8131            ## Stay in the state.
8132            
8133        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8134          $self->{line_prev} = $self->{line};
8135          $self->{column_prev} = $self->{column};
8136          $self->{column}++;
8137          $self->{nc}
8138              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8139        } else {
8140          $self->{set_nc}->($self);
8141        }
8142      
8143            redo A;
8144          } elsif ($self->{nc} == 0x0028) { # (
8145            $self->{group_depth}++;
8146            push @{$self->{ct}->{content}}, chr $self->{nc};
8147            ## Stay in the state.
8148            
8149        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8150          $self->{line_prev} = $self->{line};
8151          $self->{column_prev} = $self->{column};
8152          $self->{column}++;
8153          $self->{nc}
8154              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8155        } else {
8156          $self->{set_nc}->($self);
8157        }
8158      
8159            redo A;
8160          } elsif ($self->{nc} == 0x007C or # |
8161                   $self->{nc} == 0x002C) { # ,
8162            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8163            ## Stay in the state.
8164            
8165        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8166          $self->{line_prev} = $self->{line};
8167          $self->{column_prev} = $self->{column};
8168          $self->{column}++;
8169          $self->{nc}
8170              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8171        } else {
8172          $self->{set_nc}->($self);
8173        }
8174      
8175            redo A;
8176          } elsif ($self->{nc} == 0x0029) { # )
8177            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8178            push @{$self->{ct}->{content}}, chr $self->{nc};
8179            $self->{group_depth}--;
8180            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8181            
8182        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8183          $self->{line_prev} = $self->{line};
8184          $self->{column_prev} = $self->{column};
8185          $self->{column}++;
8186          $self->{nc}
8187              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8188        } else {
8189          $self->{set_nc}->($self);
8190        }
8191      
8192            redo A;
8193          } elsif ($self->{nc} == 0x003E) { # >
8194            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8195            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8196            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8197            
8198        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8199          $self->{line_prev} = $self->{line};
8200          $self->{column_prev} = $self->{column};
8201          $self->{column}++;
8202          $self->{nc}
8203              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8204        } else {
8205          $self->{set_nc}->($self);
8206        }
8207      
8208            return  ($self->{ct}); # ELEMENT
8209            redo A;
8210          } elsif ($self->{nc} == -1) {
8211            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8212            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8213            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8214            
8215        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8216          $self->{line_prev} = $self->{line};
8217          $self->{column_prev} = $self->{column};
8218          $self->{column}++;
8219          $self->{nc}
8220              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8221        } else {
8222          $self->{set_nc}->($self);
8223        }
8224      
8225            return  ($self->{ct}); # ELEMENT
8226            redo A;
8227          } else {
8228            push @{$self->{ct}->{content}}, chr $self->{nc};
8229            $self->{state} = CM_ELEMENT_NAME_STATE;
8230            
8231        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8232          $self->{line_prev} = $self->{line};
8233          $self->{column_prev} = $self->{column};
8234          $self->{column}++;
8235          $self->{nc}
8236              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8237        } else {
8238          $self->{set_nc}->($self);
8239        }
8240      
8241            redo A;
8242          }
8243        } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8244          if ($is_space->{$self->{nc}}) {
8245            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8246            
8247        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8248          $self->{line_prev} = $self->{line};
8249          $self->{column_prev} = $self->{column};
8250          $self->{column}++;
8251          $self->{nc}
8252              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8253        } else {
8254          $self->{set_nc}->($self);
8255        }
8256      
8257            redo A;
8258          } elsif ($self->{nc} == 0x002A or # *
8259                   $self->{nc} == 0x002B or # +
8260                   $self->{nc} == 0x003F) { # ?
8261            push @{$self->{ct}->{content}}, chr $self->{nc};
8262            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8263            
8264        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8265          $self->{line_prev} = $self->{line};
8266          $self->{column_prev} = $self->{column};
8267          $self->{column}++;
8268          $self->{nc}
8269              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8270        } else {
8271          $self->{set_nc}->($self);
8272        }
8273      
8274            redo A;
8275          } elsif ($self->{nc} == 0x007C or # |
8276                   $self->{nc} == 0x002C) { # ,
8277            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8278            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8279            
8280        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8281          $self->{line_prev} = $self->{line};
8282          $self->{column_prev} = $self->{column};
8283          $self->{column}++;
8284          $self->{nc}
8285              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8286        } else {
8287          $self->{set_nc}->($self);
8288        }
8289      
8290            redo A;
8291          } elsif ($self->{nc} == 0x0029) { # )
8292            $self->{group_depth}--;
8293            push @{$self->{ct}->{content}}, chr $self->{nc};
8294            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8295            
8296        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8297          $self->{line_prev} = $self->{line};
8298          $self->{column_prev} = $self->{column};
8299          $self->{column}++;
8300          $self->{nc}
8301              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8302        } else {
8303          $self->{set_nc}->($self);
8304        }
8305      
8306            redo A;
8307          } elsif ($self->{nc} == 0x003E) { # >
8308            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8309            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8310            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8311            
8312        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8313          $self->{line_prev} = $self->{line};
8314          $self->{column_prev} = $self->{column};
8315          $self->{column}++;
8316          $self->{nc}
8317              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8318        } else {
8319          $self->{set_nc}->($self);
8320        }
8321      
8322            return  ($self->{ct}); # ELEMENT
8323            redo A;
8324          } elsif ($self->{nc} == -1) {
8325            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8326            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8327            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8328            
8329        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8330          $self->{line_prev} = $self->{line};
8331          $self->{column_prev} = $self->{column};
8332          $self->{column}++;
8333          $self->{nc}
8334              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8335        } else {
8336          $self->{set_nc}->($self);
8337        }
8338      
8339            return  ($self->{ct}); # ELEMENT
8340            redo A;
8341          } else {
8342            $self->{ct}->{content}->[-1] .= chr $self->{nc};
8343            ## Stay in the state.
8344            
8345        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8346          $self->{line_prev} = $self->{line};
8347          $self->{column_prev} = $self->{column};
8348          $self->{column}++;
8349          $self->{nc}
8350              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8351        } else {
8352          $self->{set_nc}->($self);
8353        }
8354      
8355            redo A;
8356          }
8357        } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8358          if ($is_space->{$self->{nc}}) {
8359            ## Stay in the state.
8360            
8361        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8362          $self->{line_prev} = $self->{line};
8363          $self->{column_prev} = $self->{column};
8364          $self->{column}++;
8365          $self->{nc}
8366              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8367        } else {
8368          $self->{set_nc}->($self);
8369        }
8370      
8371            redo A;
8372          } elsif ($self->{nc} == 0x007C or # |
8373                   $self->{nc} == 0x002C) { # ,
8374            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8375            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8376            
8377        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8378          $self->{line_prev} = $self->{line};
8379          $self->{column_prev} = $self->{column};
8380          $self->{column}++;
8381          $self->{nc}
8382              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8383        } else {
8384          $self->{set_nc}->($self);
8385        }
8386      
8387            redo A;
8388          } elsif ($self->{nc} == 0x0029) { # )
8389            $self->{group_depth}--;
8390            push @{$self->{ct}->{content}}, chr $self->{nc};
8391            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8392            
8393        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8394          $self->{line_prev} = $self->{line};
8395          $self->{column_prev} = $self->{column};
8396          $self->{column}++;
8397          $self->{nc}
8398              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8399        } else {
8400          $self->{set_nc}->($self);
8401        }
8402      
8403            redo A;
8404          } elsif ($self->{nc} == 0x003E) { # >
8405            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8406            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8407            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8408            
8409        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8410          $self->{line_prev} = $self->{line};
8411          $self->{column_prev} = $self->{column};
8412          $self->{column}++;
8413          $self->{nc}
8414              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8415        } else {
8416          $self->{set_nc}->($self);
8417        }
8418      
8419            return  ($self->{ct}); # ELEMENT
8420            redo A;
8421          } elsif ($self->{nc} == -1) {
8422            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8423            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8424            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8425            
8426        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8427          $self->{line_prev} = $self->{line};
8428          $self->{column_prev} = $self->{column};
8429          $self->{column}++;
8430          $self->{nc}
8431              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8432        } else {
8433          $self->{set_nc}->($self);
8434        }
8435      
8436            return  ($self->{ct}); # ELEMENT
8437            redo A;
8438          } else {
8439            $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8440            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8441            $self->{state} = BOGUS_MD_STATE;
8442            
8443        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8444          $self->{line_prev} = $self->{line};
8445          $self->{column_prev} = $self->{column};
8446          $self->{column}++;
8447          $self->{nc}
8448              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8449        } else {
8450          $self->{set_nc}->($self);
8451        }
8452      
8453            redo A;
8454          }
8455        } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8456          if ($is_space->{$self->{nc}}) {
8457            if ($self->{group_depth}) {
8458              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8459            } else {
8460              $self->{state} = AFTER_MD_DEF_STATE;
8461            }
8462            
8463        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8464          $self->{line_prev} = $self->{line};
8465          $self->{column_prev} = $self->{column};
8466          $self->{column}++;
8467          $self->{nc}
8468              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8469        } else {
8470          $self->{set_nc}->($self);
8471        }
8472      
8473            redo A;
8474          } elsif ($self->{nc} == 0x002A or # *
8475                   $self->{nc} == 0x002B or # +
8476                   $self->{nc} == 0x003F) { # ?
8477            push @{$self->{ct}->{content}}, chr $self->{nc};
8478            if ($self->{group_depth}) {
8479              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8480            } else {
8481              $self->{state} = AFTER_MD_DEF_STATE;
8482            }
8483            
8484        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8485          $self->{line_prev} = $self->{line};
8486          $self->{column_prev} = $self->{column};
8487          $self->{column}++;
8488          $self->{nc}
8489              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8490        } else {
8491          $self->{set_nc}->($self);
8492        }
8493      
8494            redo A;
8495          } elsif ($self->{nc} == 0x0029) { # )
8496            if ($self->{group_depth}) {
8497              $self->{group_depth}--;
8498              push @{$self->{ct}->{content}}, chr $self->{nc};
8499              ## Stay in the state.
8500              
8501        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8502          $self->{line_prev} = $self->{line};
8503          $self->{column_prev} = $self->{column};
8504          $self->{column}++;
8505          $self->{nc}
8506              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8507        } else {
8508          $self->{set_nc}->($self);
8509        }
8510      
8511              redo A;
8512            } else {
8513              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8514              $self->{state} = BOGUS_MD_STATE;
8515              ## Reconsume.
8516              redo A;
8517            }
8518          } elsif ($self->{nc} == 0x003E) { # >
8519            if ($self->{group_depth}) {
8520              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8521              push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8522            }
8523            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8524            
8525        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8526          $self->{line_prev} = $self->{line};
8527          $self->{column_prev} = $self->{column};
8528          $self->{column}++;
8529          $self->{nc}
8530              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8531        } else {
8532          $self->{set_nc}->($self);
8533        }
8534      
8535            return  ($self->{ct}); # ELEMENT
8536            redo A;
8537          } elsif ($self->{nc} == -1) {
8538            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8539            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8540            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8541            
8542        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8543          $self->{line_prev} = $self->{line};
8544          $self->{column_prev} = $self->{column};
8545          $self->{column}++;
8546          $self->{nc}
8547              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8548        } else {
8549          $self->{set_nc}->($self);
8550        }
8551      
8552            return  ($self->{ct}); # ELEMENT
8553            redo A;
8554          } else {
8555            if ($self->{group_depth}) {
8556              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8557            } else {
8558              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8559              $self->{state} = BOGUS_MD_STATE;
8560            }
8561            ## Reconsume.
8562            redo A;
8563          }
8564        } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8565          if ($is_space->{$self->{nc}}) {
8566            ## Stay in the state.
8567            
8568        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8569          $self->{line_prev} = $self->{line};
8570          $self->{column_prev} = $self->{column};
8571          $self->{column}++;
8572          $self->{nc}
8573              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8574        } else {
8575          $self->{set_nc}->($self);
8576        }
8577      
8578            redo A;
8579          } elsif ($self->{nc} == 0x003E) { # >
8580            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8581            
8582        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8583          $self->{line_prev} = $self->{line};
8584          $self->{column_prev} = $self->{column};
8585          $self->{column}++;
8586          $self->{nc}
8587              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8588        } else {
8589          $self->{set_nc}->($self);
8590        }
8591      
8592            return  ($self->{ct}); # ENTITY/ELEMENT
8593            redo A;
8594          } elsif ($self->{nc} == -1) {
8595            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8596            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8597            
8598        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8599          $self->{line_prev} = $self->{line};
8600          $self->{column_prev} = $self->{column};
8601          $self->{column}++;
8602          $self->{nc}
8603              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8604        } else {
8605          $self->{set_nc}->($self);
8606        }
8607      
8608            return  ($self->{ct}); # ENTITY/ELEMENT
8609            redo A;
8610          } else {
8611            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8612            $self->{state} = BOGUS_MD_STATE;
8613            ## Reconsume.
8614            redo A;
8615          }
8616        } elsif ($self->{state} == BOGUS_MD_STATE) {
8617          if ($self->{nc} == 0x003E) { # >
8618            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8619            
8620        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8621          $self->{line_prev} = $self->{line};
8622          $self->{column_prev} = $self->{column};
8623          $self->{column}++;
8624          $self->{nc}
8625              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8626        } else {
8627          $self->{set_nc}->($self);
8628        }
8629      
8630            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8631            redo A;
8632          } elsif ($self->{nc} == -1) {
8633            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8634            ## Reconsume.
8635            return  ($self->{ct}); # ATTLIST/ENTITY/NOTATION
8636            redo A;
8637          } else {
8638            ## Stay in the state.
8639            
8640        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8641          $self->{line_prev} = $self->{line};
8642          $self->{column_prev} = $self->{column};
8643          $self->{column}++;
8644          $self->{nc}
8645              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8646        } else {
8647          $self->{set_nc}->($self);
8648        }
8649      
8650          redo A;          redo A;
8651        }        }
8652      } else {      } else {
# Line 4115  sub _get_next_token ($) { Line 8659  sub _get_next_token ($) {
8659    
8660  1;  1;
8661  ## $Date$  ## $Date$
8662                                    

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.26

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24